commit 0c41d43ccc24d542d68dc85b578abf02703cf279 Author: ModelHub XC Date: Sat Jun 6 23:18:23 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Neelectric/Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.08 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md 
new file mode 100644 index 0000000..9e3b1c5 --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +datasets: Neelectric/OpenR1-Math-220k_all_Llama3_4096toks +library_name: transformers +model_name: Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.08 +tags: +- generated_from_trainer +- trl +- open-r1 +- sft +licence: license +--- + +# Model Card for Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.08 + +This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the [Neelectric/OpenR1-Math-220k_all_Llama3_4096toks](https://huggingface.co/datasets/Neelectric/OpenR1-Math-220k_all_Llama3_4096toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.08", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_math/runs/5x7wjaxd) + + + +This model was trained with SFT. 
+ +### Framework versions + +- TRL: 1.1.0.dev0 +- Transformers: 4.57.6 +- PyTorch: 2.9.0 +- Datasets: 4.8.5 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..5c4714c --- /dev/null +++ b/all_results.json @@ -0,0 +1,11 @@ +{ + "ewc_loss": 0.008638550527393818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.638550207251683e-05, + "total_flos": 5.62815163329864e+19, + "train_loss": 0.3824227017627763, + "train_runtime": 45517.1597, + "train_samples": 125770, + "train_samples_per_second": 8.289, + "train_steps_per_second": 0.518 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. 
You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think> +... +</think> +<answer> +... +</answer>" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} 
+{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..06df27b --- /dev/null +++ b/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..50f6077 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..4514040 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41fbcf81e0888deae6b6e8841d5d49ee523c45a5c532bd453ae6a1de0ff69b62 +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..aa1d99c --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d8fbd8ec32ae26289c20e44ac150bcb6d9017209ef73feddecd231199f786ea7 +size 4999802720 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..e0c23b6 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3521cf68e3500183d129a005a02a6fa8d179534d16a70c045a01883e962acf3b +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..21e01ca --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01e2475c1402b395efaed75b170112691abf4378de64d57300631cfbcd093e7d +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": 
"model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..9d4773c --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,11 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..3beeacc --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": 
"<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": 
"<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": 
"<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": 
"<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": 
"<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..5c4714c --- /dev/null +++ b/train_results.json @@ -0,0 +1,11 @@ +{ + "ewc_loss": 0.008638550527393818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.638550207251683e-05, + "total_flos": 5.62815163329864e+19, + "train_loss": 0.3824227017627763, + "train_runtime": 45517.1597, + "train_samples": 125770, + "train_samples_per_second": 8.289, + "train_steps_per_second": 0.518 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..6a30d82 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,283042 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 23583, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00012721027859051011, + "ewc_loss": 0.0, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 0.0, + "grad_norm": 4.835295677185059, + "learning_rate": 0.0, + "loss": 0.7982, + "mean_token_accuracy": 0.7762961387634277, + "num_tokens": 38493.0, + "step": 1 + }, + { + "epoch": 0.00025442055718102023, + "ewc_loss": 0.0, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 0.0, + "grad_norm": 4.588431358337402, + "learning_rate": 4.2390843577787196e-10, + "loss": 0.8329, + "mean_token_accuracy": 0.765798807144165, + "num_tokens": 80419.0, + "step": 2 + }, + { + "epoch": 0.0003816308357715303, + "ewc_loss": 1.0334071948254401e-15, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0334071617382156e-17, + "grad_norm": 4.725151538848877, + "learning_rate": 8.478168715557439e-10, + "loss": 0.7225, + "mean_token_accuracy": 0.7962077856063843, + "num_tokens": 118717.0, + "step": 3 + }, + { + "epoch": 0.0005088411143620405, + "ewc_loss": 4.1345893993672417e-14, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.134589367603506e-16, + "grad_norm": 5.366973876953125, + "learning_rate": 1.271725307333616e-09, + "loss": 0.8139, + "mean_token_accuracy": 0.7712426781654358, + "num_tokens": 150155.0, + "step": 4 + }, + { + "epoch": 0.0006360513929525506, + "ewc_loss": 3.138490173044267e-13, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1384902153959145e-15, + "grad_norm": 4.303130626678467, + "learning_rate": 1.6956337431114878e-09, + "loss": 0.792, + "mean_token_accuracy": 0.774565577507019, + "num_tokens": 193616.0, + "step": 5 + }, + { + "epoch": 0.0007632616715430606, + "ewc_loss": 1.7110038429482555e-12, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7110038429482555e-14, + "grad_norm": 5.208627700805664, + "learning_rate": 2.1195421788893596e-09, + "loss": 0.7894, + "mean_token_accuracy": 0.7781898379325867, + "num_tokens": 227640.0, + "step": 6 + }, + { + "epoch": 0.0008904719501335708, + "ewc_loss": 3.2897842436319102e-12, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.28978414876422e-14, + "grad_norm": 4.832261562347412, + "learning_rate": 
2.543450614667232e-09, + "loss": 0.816, + "mean_token_accuracy": 0.7748525142669678, + "num_tokens": 265114.0, + "step": 7 + }, + { + "epoch": 0.001017682228724081, + "ewc_loss": 1.482124492580006e-11, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.482124563053147e-13, + "grad_norm": 4.93231201171875, + "learning_rate": 2.967359050445104e-09, + "loss": 0.7583, + "mean_token_accuracy": 0.7888849973678589, + "num_tokens": 299865.0, + "step": 8 + }, + { + "epoch": 0.001144892507314591, + "ewc_loss": 2.323109413171487e-11, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3231094240135086e-13, + "grad_norm": 4.568447589874268, + "learning_rate": 3.3912674862229757e-09, + "loss": 0.8139, + "mean_token_accuracy": 0.7739652991294861, + "num_tokens": 342063.0, + "step": 9 + }, + { + "epoch": 0.0012721027859051012, + "ewc_loss": 4.366170619496401e-11, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.366170511076184e-13, + "grad_norm": 5.454869270324707, + "learning_rate": 3.815175922000847e-09, + "loss": 0.864, + "mean_token_accuracy": 0.7637333869934082, + "num_tokens": 374864.0, + "step": 10 + }, + { + "epoch": 0.0013993130644956112, + "ewc_loss": 1.310899999662496e-10, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.310899956294409e-12, + "grad_norm": 4.301987171173096, + "learning_rate": 4.239084357778719e-09, + "loss": 0.7764, + "mean_token_accuracy": 0.7752959728240967, + "num_tokens": 416605.0, + "step": 11 + }, + { + "epoch": 0.0015265233430861213, + "ewc_loss": 1.7593881906918796e-10, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7593882080391143e-12, + "grad_norm": 5.453802108764648, + "learning_rate": 4.662992793556591e-09, + "loss": 0.8344, + "mean_token_accuracy": 0.7687666416168213, + "num_tokens": 448798.0, + "step": 12 + }, + { + "epoch": 0.0016537336216766315, + "ewc_loss": 2.2502789909228937e-10, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2502789562284242e-12, + "grad_norm": 5.438741683959961, + "learning_rate": 5.086901229334464e-09, + "loss": 
0.874, + "mean_token_accuracy": 0.7538818717002869, + "num_tokens": 480084.0, + "step": 13 + }, + { + "epoch": 0.0017809439002671415, + "ewc_loss": 6.859076639997852e-10, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.8590766746923215e-12, + "grad_norm": 4.382340908050537, + "learning_rate": 5.510809665112336e-09, + "loss": 0.8057, + "mean_token_accuracy": 0.7793654799461365, + "num_tokens": 524543.0, + "step": 14 + }, + { + "epoch": 0.0019081541788576518, + "ewc_loss": 1.0427653185374197e-09, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0427653532318892e-11, + "grad_norm": 4.79224157333374, + "learning_rate": 5.934718100890208e-09, + "loss": 0.752, + "mean_token_accuracy": 0.788784921169281, + "num_tokens": 563314.0, + "step": 15 + }, + { + "epoch": 0.002035364457448162, + "ewc_loss": 1.258940951487375e-09, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.258940954956822e-11, + "grad_norm": 5.073911190032959, + "learning_rate": 6.3586265366680796e-09, + "loss": 0.8421, + "mean_token_accuracy": 0.7625489830970764, + "num_tokens": 598421.0, + "step": 16 + }, + { + "epoch": 0.002162574736038672, + "ewc_loss": 1.5760920346608032e-09, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5760920346608032e-11, + "grad_norm": 5.032065391540527, + "learning_rate": 6.782534972445951e-09, + "loss": 0.832, + "mean_token_accuracy": 0.76270592212677, + "num_tokens": 634690.0, + "step": 17 + }, + { + "epoch": 0.002289785014629182, + "ewc_loss": 1.816144790112162e-09, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8161448039899497e-11, + "grad_norm": 5.123416423797607, + "learning_rate": 7.206443408223823e-09, + "loss": 0.8691, + "mean_token_accuracy": 0.7593603134155273, + "num_tokens": 674653.0, + "step": 18 + }, + { + "epoch": 0.0024169952932196924, + "ewc_loss": 4.7200701125404976e-09, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.720070168051649e-11, + "grad_norm": 5.063513278961182, + "learning_rate": 7.630351844001695e-09, + "loss": 0.8007, + "mean_token_accuracy": 
0.7774041891098022, + "num_tokens": 708238.0, + "step": 19 + }, + { + "epoch": 0.0025442055718102024, + "ewc_loss": 7.157356041176399e-09, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.157355957909672e-11, + "grad_norm": 4.632686614990234, + "learning_rate": 8.054260279779567e-09, + "loss": 0.7825, + "mean_token_accuracy": 0.7781663537025452, + "num_tokens": 749312.0, + "step": 20 + }, + { + "epoch": 0.0026714158504007124, + "ewc_loss": 8.696044773159883e-09, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.696044578870854e-11, + "grad_norm": 5.405825614929199, + "learning_rate": 8.478168715557438e-09, + "loss": 0.8526, + "mean_token_accuracy": 0.7665184736251831, + "num_tokens": 783532.0, + "step": 21 + }, + { + "epoch": 0.0027986261289912225, + "ewc_loss": 9.789277832794596e-09, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.789277721772294e-11, + "grad_norm": 5.087620735168457, + "learning_rate": 8.902077151335311e-09, + "loss": 0.83, + "mean_token_accuracy": 0.7733687162399292, + "num_tokens": 817429.0, + "step": 22 + }, + { + "epoch": 0.0029258364075817325, + "ewc_loss": 1.1487577111779501e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1487577555868711e-10, + "grad_norm": 4.821132659912109, + "learning_rate": 9.325985587113182e-09, + "loss": 0.7637, + "mean_token_accuracy": 0.7884141206741333, + "num_tokens": 853964.0, + "step": 23 + }, + { + "epoch": 0.0030530466861722425, + "ewc_loss": 1.2837070073601353e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2837070295645958e-10, + "grad_norm": 5.443609714508057, + "learning_rate": 9.749894022891054e-09, + "loss": 0.8275, + "mean_token_accuracy": 0.7678848505020142, + "num_tokens": 885070.0, + "step": 24 + }, + { + "epoch": 0.003180256964762753, + "ewc_loss": 1.405418270650216e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4054182817524463e-10, + "grad_norm": 4.828397750854492, + "learning_rate": 1.0173802458668929e-08, + "loss": 0.8798, + "mean_token_accuracy": 0.7514750957489014, + 
"num_tokens": 926893.0, + "step": 25 + }, + { + "epoch": 0.003307467243353263, + "ewc_loss": 3.068821996521365e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.068821863294602e-10, + "grad_norm": 4.8837738037109375, + "learning_rate": 1.05977108944468e-08, + "loss": 0.8164, + "mean_token_accuracy": 0.7678709626197815, + "num_tokens": 964773.0, + "step": 26 + }, + { + "epoch": 0.003434677521943773, + "ewc_loss": 4.706197742621043e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.706197653803201e-10, + "grad_norm": 4.737656116485596, + "learning_rate": 1.1021619330224672e-08, + "loss": 0.7691, + "mean_token_accuracy": 0.7852954268455505, + "num_tokens": 1002725.0, + "step": 27 + }, + { + "epoch": 0.003561887800534283, + "ewc_loss": 5.7176713141871005e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.717671336391561e-10, + "grad_norm": 4.986976623535156, + "learning_rate": 1.1445527766002543e-08, + "loss": 0.8732, + "mean_token_accuracy": 0.7571167945861816, + "num_tokens": 1040296.0, + "step": 28 + }, + { + "epoch": 0.003689098079124793, + "ewc_loss": 6.480674130671105e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.480673775399737e-10, + "grad_norm": 4.442420482635498, + "learning_rate": 1.1869436201780416e-08, + "loss": 0.7658, + "mean_token_accuracy": 0.7853585481643677, + "num_tokens": 1081711.0, + "step": 29 + }, + { + "epoch": 0.0038163083577153036, + "ewc_loss": 7.065973761655187e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.06597391708641e-10, + "grad_norm": 4.8024396896362305, + "learning_rate": 1.2293344637558287e-08, + "loss": 0.8122, + "mean_token_accuracy": 0.7755138278007507, + "num_tokens": 1120556.0, + "step": 30 + }, + { + "epoch": 0.003943518636305814, + "ewc_loss": 7.669066803828173e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.669066492965726e-10, + "grad_norm": 4.763520240783691, + "learning_rate": 1.2717253073336159e-08, + "loss": 0.7639, + "mean_token_accuracy": 0.7897307872772217, + "num_tokens": 1157723.0, + 
"step": 31 + }, + { + "epoch": 0.004070728914896324, + "ewc_loss": 8.491874581295633e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49187442586441e-10, + "grad_norm": 4.68511438369751, + "learning_rate": 1.314116150911403e-08, + "loss": 0.8101, + "mean_token_accuracy": 0.7737994194030762, + "num_tokens": 1197879.0, + "step": 32 + }, + { + "epoch": 0.004197939193486834, + "ewc_loss": 9.121987432081369e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.121987609717053e-10, + "grad_norm": 4.755204200744629, + "learning_rate": 1.3565069944891903e-08, + "loss": 0.8107, + "mean_token_accuracy": 0.7751766443252563, + "num_tokens": 1237342.0, + "step": 33 + }, + { + "epoch": 0.004325149472077344, + "ewc_loss": 9.711732218420366e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.711732529282813e-10, + "grad_norm": 4.486904144287109, + "learning_rate": 1.3988978380669775e-08, + "loss": 0.782, + "mean_token_accuracy": 0.7825092077255249, + "num_tokens": 1280197.0, + "step": 34 + }, + { + "epoch": 0.004452359750667854, + "ewc_loss": 1.0642091297086154e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0642091652357522e-09, + "grad_norm": 4.645351886749268, + "learning_rate": 1.4412886816447646e-08, + "loss": 0.7641, + "mean_token_accuracy": 0.7848525643348694, + "num_tokens": 1318625.0, + "step": 35 + }, + { + "epoch": 0.004579570029258364, + "ewc_loss": 2.076059359978899e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0760593244517622e-09, + "grad_norm": 4.805102348327637, + "learning_rate": 1.4836795252225519e-08, + "loss": 0.822, + "mean_token_accuracy": 0.7723166942596436, + "num_tokens": 1356868.0, + "step": 36 + }, + { + "epoch": 0.004706780307848874, + "ewc_loss": 3.059940638650005e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0599405231868104e-09, + "grad_norm": 4.831749439239502, + "learning_rate": 1.526070368800339e-08, + "loss": 0.7937, + "mean_token_accuracy": 0.7809006571769714, + "num_tokens": 1394696.0, + "step": 37 + }, + { + 
"epoch": 0.004833990586439385, + "ewc_loss": 3.723313284353935e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7233132132996616e-09, + "grad_norm": 4.367802619934082, + "learning_rate": 1.5684612123781262e-08, + "loss": 0.7826, + "mean_token_accuracy": 0.7825465202331543, + "num_tokens": 1438738.0, + "step": 38 + }, + { + "epoch": 0.004961200865029895, + "ewc_loss": 4.173312504462956e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.173312362354409e-09, + "grad_norm": 4.770002365112305, + "learning_rate": 1.6108520559559135e-08, + "loss": 0.7482, + "mean_token_accuracy": 0.7923873662948608, + "num_tokens": 1475089.0, + "step": 39 + }, + { + "epoch": 0.005088411143620405, + "ewc_loss": 4.528171757556265e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.528171615447718e-09, + "grad_norm": 4.758123874664307, + "learning_rate": 1.6532428995337004e-08, + "loss": 0.8082, + "mean_token_accuracy": 0.7747721672058105, + "num_tokens": 1514566.0, + "step": 40 + }, + { + "epoch": 0.005215621422210915, + "ewc_loss": 4.807221785085858e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.807221731795153e-09, + "grad_norm": 4.949686050415039, + "learning_rate": 1.6956337431114877e-08, + "loss": 0.8231, + "mean_token_accuracy": 0.7733666896820068, + "num_tokens": 1552560.0, + "step": 41 + }, + { + "epoch": 0.005342831700801425, + "ewc_loss": 5.023767357670295e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.023767180034611e-09, + "grad_norm": 5.391010284423828, + "learning_rate": 1.738024586689275e-08, + "loss": 0.8006, + "mean_token_accuracy": 0.7770541310310364, + "num_tokens": 1584759.0, + "step": 42 + }, + { + "epoch": 0.005470041979391935, + "ewc_loss": 5.250593062555708e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.250592849392888e-09, + "grad_norm": 4.845430850982666, + "learning_rate": 1.7804154302670622e-08, + "loss": 0.802, + "mean_token_accuracy": 0.7756665349006653, + "num_tokens": 1621825.0, + "step": 43 + }, + { + "epoch": 
0.005597252257982445, + "ewc_loss": 5.609783784166211e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6097837486390745e-09, + "grad_norm": 4.776795387268066, + "learning_rate": 1.8228062738448494e-08, + "loss": 0.8273, + "mean_token_accuracy": 0.7680232524871826, + "num_tokens": 1662946.0, + "step": 44 + }, + { + "epoch": 0.005724462536572955, + "ewc_loss": 5.943875862612913e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.943875613922955e-09, + "grad_norm": 4.653336524963379, + "learning_rate": 1.8651971174226364e-08, + "loss": 0.8153, + "mean_token_accuracy": 0.774024248123169, + "num_tokens": 1699433.0, + "step": 45 + }, + { + "epoch": 0.005851672815163465, + "ewc_loss": 6.213684855538304e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.21368467790262e-09, + "grad_norm": 4.361514091491699, + "learning_rate": 1.9075879610004236e-08, + "loss": 0.7465, + "mean_token_accuracy": 0.7888451814651489, + "num_tokens": 1742812.0, + "step": 46 + }, + { + "epoch": 0.005978883093753975, + "ewc_loss": 6.493976911770005e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.493976911770005e-09, + "grad_norm": 4.854429721832275, + "learning_rate": 1.949978804578211e-08, + "loss": 0.8016, + "mean_token_accuracy": 0.777941882610321, + "num_tokens": 1778725.0, + "step": 47 + }, + { + "epoch": 0.006106093372344485, + "ewc_loss": 6.875090434732556e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.87509027486044e-09, + "grad_norm": 4.804186820983887, + "learning_rate": 1.9923696481559985e-08, + "loss": 0.8476, + "mean_token_accuracy": 0.7617810368537903, + "num_tokens": 1816592.0, + "step": 48 + }, + { + "epoch": 0.006233303650934996, + "ewc_loss": 8.91172362571524e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.911723803350924e-09, + "grad_norm": 4.378138542175293, + "learning_rate": 2.0347604917337857e-08, + "loss": 0.7397, + "mean_token_accuracy": 0.789650022983551, + "num_tokens": 1859907.0, + "step": 49 + }, + { + "epoch": 0.006360513929525506, + 
"ewc_loss": 1.552403546156711e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5524035390512836e-08, + "grad_norm": 4.678658962249756, + "learning_rate": 2.0771513353115727e-08, + "loss": 0.729, + "mean_token_accuracy": 0.7991288900375366, + "num_tokens": 1896627.0, + "step": 50 + }, + { + "epoch": 0.006487724208116016, + "ewc_loss": 2.1001264940423425e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1001264727260605e-08, + "grad_norm": 4.932518482208252, + "learning_rate": 2.11954217888936e-08, + "loss": 0.8446, + "mean_token_accuracy": 0.7667643427848816, + "num_tokens": 1934041.0, + "step": 51 + }, + { + "epoch": 0.006614934486706526, + "ewc_loss": 2.432607971059042e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.432608070535025e-08, + "grad_norm": 4.549715995788574, + "learning_rate": 2.1619330224671472e-08, + "loss": 0.7738, + "mean_token_accuracy": 0.7844995260238647, + "num_tokens": 1976482.0, + "step": 52 + }, + { + "epoch": 0.006742144765297036, + "ewc_loss": 2.650393753356184e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6503936823019103e-08, + "grad_norm": 5.252098560333252, + "learning_rate": 2.2043238660449344e-08, + "loss": 0.8028, + "mean_token_accuracy": 0.7722218036651611, + "num_tokens": 2009224.0, + "step": 53 + }, + { + "epoch": 0.006869355043887546, + "ewc_loss": 2.8155816380603937e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.815581723325522e-08, + "grad_norm": 4.887631416320801, + "learning_rate": 2.2467147096227214e-08, + "loss": 0.8474, + "mean_token_accuracy": 0.7621102333068848, + "num_tokens": 2049235.0, + "step": 54 + }, + { + "epoch": 0.006996565322478056, + "ewc_loss": 2.944198058685288e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9441981297395614e-08, + "grad_norm": 4.564370155334473, + "learning_rate": 2.2891055532005086e-08, + "loss": 0.8602, + "mean_token_accuracy": 0.7609819173812866, + "num_tokens": 2090260.0, + "step": 55 + }, + { + "epoch": 0.007123775601068566, + "ewc_loss": 
3.0730213893548353e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.073021304089707e-08, + "grad_norm": 4.983375549316406, + "learning_rate": 2.331496396778296e-08, + "loss": 0.8657, + "mean_token_accuracy": 0.7602970600128174, + "num_tokens": 2126686.0, + "step": 56 + }, + { + "epoch": 0.007250985879659076, + "ewc_loss": 3.18975207846961e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.189752106891319e-08, + "grad_norm": 4.339780330657959, + "learning_rate": 2.373887240356083e-08, + "loss": 0.7675, + "mean_token_accuracy": 0.7850371599197388, + "num_tokens": 2171355.0, + "step": 57 + }, + { + "epoch": 0.007378196158249586, + "ewc_loss": 3.29624890582636e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.296248962669779e-08, + "grad_norm": 4.577752113342285, + "learning_rate": 2.4162780839338704e-08, + "loss": 0.7407, + "mean_token_accuracy": 0.7932424545288086, + "num_tokens": 2211660.0, + "step": 58 + }, + { + "epoch": 0.007505406436840096, + "ewc_loss": 3.389881840121234e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.38988179748867e-08, + "grad_norm": 5.525413990020752, + "learning_rate": 2.4586689275116573e-08, + "loss": 0.8711, + "mean_token_accuracy": 0.759583592414856, + "num_tokens": 2244411.0, + "step": 59 + }, + { + "epoch": 0.007632616715430607, + "ewc_loss": 3.501418859741534e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5014188881632435e-08, + "grad_norm": 4.335219860076904, + "learning_rate": 2.5010597710894446e-08, + "loss": 0.7975, + "mean_token_accuracy": 0.7714523077011108, + "num_tokens": 2285253.0, + "step": 60 + }, + { + "epoch": 0.007759826994021117, + "ewc_loss": 3.676114602058078e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.676114701534061e-08, + "grad_norm": 4.475262641906738, + "learning_rate": 2.5434506146672318e-08, + "loss": 0.7621, + "mean_token_accuracy": 0.7856341600418091, + "num_tokens": 2328577.0, + "step": 61 + }, + { + "epoch": 0.007887037272611627, + "ewc_loss": 3.850998837151565e-06, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.850998808729855e-08, + "grad_norm": 4.700071334838867, + "learning_rate": 2.585841458245019e-08, + "loss": 0.8423, + "mean_token_accuracy": 0.7694351673126221, + "num_tokens": 2366524.0, + "step": 62 + }, + { + "epoch": 0.008014247551202136, + "ewc_loss": 4.003455615020357e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0034557002854854e-08, + "grad_norm": 4.177399635314941, + "learning_rate": 2.628232301822806e-08, + "loss": 0.7554, + "mean_token_accuracy": 0.7898952960968018, + "num_tokens": 2408628.0, + "step": 63 + }, + { + "epoch": 0.008141457829792647, + "ewc_loss": 4.127115971641615e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.127116071117598e-08, + "grad_norm": 4.255195617675781, + "learning_rate": 2.6706231454005933e-08, + "loss": 0.7948, + "mean_token_accuracy": 0.7753533124923706, + "num_tokens": 2451800.0, + "step": 64 + }, + { + "epoch": 0.008268668108383158, + "ewc_loss": 4.241901933710324e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.241902118451435e-08, + "grad_norm": 5.649069786071777, + "learning_rate": 2.7130139889783805e-08, + "loss": 0.8658, + "mean_token_accuracy": 0.7548410892486572, + "num_tokens": 2481448.0, + "step": 65 + }, + { + "epoch": 0.008395878386973667, + "ewc_loss": 4.3903514779231045e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.390351548977378e-08, + "grad_norm": 4.260750770568848, + "learning_rate": 2.7554048325561678e-08, + "loss": 0.7928, + "mean_token_accuracy": 0.7754795551300049, + "num_tokens": 2526339.0, + "step": 66 + }, + { + "epoch": 0.008523088665564178, + "ewc_loss": 4.603119577950565e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.6031196632156934e-08, + "grad_norm": 4.4169135093688965, + "learning_rate": 2.797795676133955e-08, + "loss": 0.8178, + "mean_token_accuracy": 0.7704201936721802, + "num_tokens": 2570691.0, + "step": 67 + }, + { + "epoch": 0.008650298944154687, + "ewc_loss": 5.082549705548445e-06, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 5.082549492385624e-08, + "grad_norm": 4.548592567443848, + "learning_rate": 2.840186519711742e-08, + "loss": 0.8047, + "mean_token_accuracy": 0.7750202417373657, + "num_tokens": 2609207.0, + "step": 68 + }, + { + "epoch": 0.008777509222745198, + "ewc_loss": 6.619438863708638e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.619438863708638e-08, + "grad_norm": 4.8702392578125, + "learning_rate": 2.8825773632895292e-08, + "loss": 0.7928, + "mean_token_accuracy": 0.7774490118026733, + "num_tokens": 2645494.0, + "step": 69 + }, + { + "epoch": 0.008904719501335707, + "ewc_loss": 9.486817361903377e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.4868177313856e-08, + "grad_norm": 4.3714680671691895, + "learning_rate": 2.9249682068673165e-08, + "loss": 0.7496, + "mean_token_accuracy": 0.7914925217628479, + "num_tokens": 2686258.0, + "step": 70 + }, + { + "epoch": 0.009031929779926218, + "ewc_loss": 1.1803786037489772e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1803786037489772e-07, + "grad_norm": 4.972944736480713, + "learning_rate": 2.9673590504451037e-08, + "loss": 0.8044, + "mean_token_accuracy": 0.7780866026878357, + "num_tokens": 2720544.0, + "step": 71 + }, + { + "epoch": 0.009159140058516728, + "ewc_loss": 1.3437218512990512e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.343721862667735e-07, + "grad_norm": 4.780664443969727, + "learning_rate": 3.0097498940228907e-08, + "loss": 0.7675, + "mean_token_accuracy": 0.7859805822372437, + "num_tokens": 2758068.0, + "step": 72 + }, + { + "epoch": 0.009286350337107238, + "ewc_loss": 1.4829518477199599e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4829518590886437e-07, + "grad_norm": 4.9970879554748535, + "learning_rate": 3.052140737600678e-08, + "loss": 0.8516, + "mean_token_accuracy": 0.7629842758178711, + "num_tokens": 2793342.0, + "step": 73 + }, + { + "epoch": 0.009413560615697748, + "ewc_loss": 1.616246299818158e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6162462657121068e-07, + "grad_norm": 5.214858531951904, + "learning_rate": 3.094531581178465e-08, + "loss": 0.8886, + "mean_token_accuracy": 0.7549574375152588, + "num_tokens": 2828003.0, + "step": 74 + }, + { + "epoch": 0.009540770894288259, + "ewc_loss": 1.7400603610440157e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7400603269379644e-07, + "grad_norm": 4.1880645751953125, + "learning_rate": 3.1369224247562524e-08, + "loss": 0.7639, + "mean_token_accuracy": 0.7840048670768738, + "num_tokens": 2874755.0, + "step": 75 + }, + { + "epoch": 0.00966798117287877, + "ewc_loss": 1.8281833035871387e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8281832581124036e-07, + "grad_norm": 4.577489376068115, + "learning_rate": 3.17931326833404e-08, + "loss": 0.7859, + "mean_token_accuracy": 0.7768564224243164, + "num_tokens": 2909703.0, + "step": 76 + }, + { + "epoch": 0.009795191451469279, + "ewc_loss": 1.901596442621667e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.90159639146259e-07, + "grad_norm": 4.080408573150635, + "learning_rate": 3.221704111911827e-08, + "loss": 0.716, + "mean_token_accuracy": 0.799230694770813, + "num_tokens": 2954020.0, + "step": 77 + }, + { + "epoch": 0.00992240173005979, + "ewc_loss": 1.9623135813162662e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.962313547210215e-07, + "grad_norm": 5.154361724853516, + "learning_rate": 3.264094955489614e-08, + "loss": 0.8113, + "mean_token_accuracy": 0.7746780514717102, + "num_tokens": 2984894.0, + "step": 78 + }, + { + "epoch": 0.010049612008650299, + "ewc_loss": 2.0203448002575897e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0203448514166666e-07, + "grad_norm": 4.904398441314697, + "learning_rate": 3.306485799067401e-08, + "loss": 0.7182, + "mean_token_accuracy": 0.7951926589012146, + "num_tokens": 3017773.0, + "step": 79 + }, + { + "epoch": 0.01017682228724081, + "ewc_loss": 2.0678131477325223e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0678130852047616e-07, + 
"grad_norm": 4.208227157592773, + "learning_rate": 3.348876642645188e-08, + "loss": 0.7177, + "mean_token_accuracy": 0.7980126142501831, + "num_tokens": 3059739.0, + "step": 80 + }, + { + "epoch": 0.010304032565831319, + "ewc_loss": 2.098597542499192e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0985976334486622e-07, + "grad_norm": 4.807808876037598, + "learning_rate": 3.391267486222975e-08, + "loss": 0.8099, + "mean_token_accuracy": 0.7719709873199463, + "num_tokens": 3094910.0, + "step": 81 + }, + { + "epoch": 0.01043124284442183, + "ewc_loss": 2.1276033294270746e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.127603266899314e-07, + "grad_norm": 4.520991325378418, + "learning_rate": 3.4336583298007626e-08, + "loss": 0.8136, + "mean_token_accuracy": 0.773507833480835, + "num_tokens": 3133884.0, + "step": 82 + }, + { + "epoch": 0.010558453123012339, + "ewc_loss": 2.1500880393432453e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.150088107555348e-07, + "grad_norm": 5.118678092956543, + "learning_rate": 3.47604917337855e-08, + "loss": 0.8462, + "mean_token_accuracy": 0.7606350779533386, + "num_tokens": 3166755.0, + "step": 83 + }, + { + "epoch": 0.01068566340160285, + "ewc_loss": 2.1702418962377124e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1702419417124474e-07, + "grad_norm": 4.6805243492126465, + "learning_rate": 3.518440016956337e-08, + "loss": 0.8152, + "mean_token_accuracy": 0.7709426879882812, + "num_tokens": 3204825.0, + "step": 84 + }, + { + "epoch": 0.010812873680193359, + "ewc_loss": 2.2038793758838437e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.203879461148972e-07, + "grad_norm": 4.678957462310791, + "learning_rate": 3.5608308605341244e-08, + "loss": 0.8506, + "mean_token_accuracy": 0.763107419013977, + "num_tokens": 3244036.0, + "step": 85 + }, + { + "epoch": 0.01094008395878387, + "ewc_loss": 2.254633182019461e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2546332445472217e-07, + "grad_norm": 
4.443456172943115, + "learning_rate": 3.6032217041119116e-08, + "loss": 0.8377, + "mean_token_accuracy": 0.7670460343360901, + "num_tokens": 3285969.0, + "step": 86 + }, + { + "epoch": 0.01106729423737438, + "ewc_loss": 2.3031992895994335e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.303199266862066e-07, + "grad_norm": 4.412567138671875, + "learning_rate": 3.645612547689699e-08, + "loss": 0.8089, + "mean_token_accuracy": 0.7746194005012512, + "num_tokens": 3327648.0, + "step": 87 + }, + { + "epoch": 0.01119450451596489, + "ewc_loss": 2.3487975340685807e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3487974942781875e-07, + "grad_norm": 4.558676719665527, + "learning_rate": 3.6880033912674855e-08, + "loss": 0.8119, + "mean_token_accuracy": 0.772871732711792, + "num_tokens": 3367399.0, + "step": 88 + }, + { + "epoch": 0.0113217147945554, + "ewc_loss": 2.390097506577149e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.390097506577149e-07, + "grad_norm": 4.762969970703125, + "learning_rate": 3.730394234845273e-08, + "loss": 0.7954, + "mean_token_accuracy": 0.7795013785362244, + "num_tokens": 3405402.0, + "step": 89 + }, + { + "epoch": 0.01144892507314591, + "ewc_loss": 2.4306995328515768e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.430699623801047e-07, + "grad_norm": 4.856949806213379, + "learning_rate": 3.77278507842306e-08, + "loss": 0.8787, + "mean_token_accuracy": 0.7539585828781128, + "num_tokens": 3441791.0, + "step": 90 + }, + { + "epoch": 0.01157613535173642, + "ewc_loss": 2.4702714654267766e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.470271454058093e-07, + "grad_norm": 4.813365459442139, + "learning_rate": 3.815175922000847e-08, + "loss": 0.8383, + "mean_token_accuracy": 0.7686835527420044, + "num_tokens": 3480151.0, + "step": 91 + }, + { + "epoch": 0.01170334563032693, + "ewc_loss": 2.515026062610559e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.515025983029773e-07, + "grad_norm": 4.740540027618408, + 
"learning_rate": 3.8575667655786345e-08, + "loss": 0.8017, + "mean_token_accuracy": 0.7756146192550659, + "num_tokens": 3516867.0, + "step": 92 + }, + { + "epoch": 0.01183055590891744, + "ewc_loss": 2.5650500901974738e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.565050181146944e-07, + "grad_norm": 4.252223491668701, + "learning_rate": 3.899957609156422e-08, + "loss": 0.7776, + "mean_token_accuracy": 0.7823242545127869, + "num_tokens": 3557466.0, + "step": 93 + }, + { + "epoch": 0.01195776618750795, + "ewc_loss": 2.6259567675879225e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6259567675879225e-07, + "grad_norm": 4.474714756011963, + "learning_rate": 3.94234845273421e-08, + "loss": 0.8072, + "mean_token_accuracy": 0.7735166549682617, + "num_tokens": 3596009.0, + "step": 94 + }, + { + "epoch": 0.012084976466098461, + "ewc_loss": 2.7319078071741387e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.731907784436771e-07, + "grad_norm": 4.6856770515441895, + "learning_rate": 3.984739296311997e-08, + "loss": 0.7994, + "mean_token_accuracy": 0.7740623354911804, + "num_tokens": 3631582.0, + "step": 95 + }, + { + "epoch": 0.01221218674468897, + "ewc_loss": 2.988643609569408e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9886436436754593e-07, + "grad_norm": 4.631988525390625, + "learning_rate": 4.027130139889784e-08, + "loss": 0.8259, + "mean_token_accuracy": 0.7680702209472656, + "num_tokens": 3672855.0, + "step": 96 + }, + { + "epoch": 0.012339397023279481, + "ewc_loss": 3.615925379563123e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6159255500933796e-07, + "grad_norm": 4.337025165557861, + "learning_rate": 4.0695209834675715e-08, + "loss": 0.8172, + "mean_token_accuracy": 0.7726285457611084, + "num_tokens": 3714100.0, + "step": 97 + }, + { + "epoch": 0.012466607301869992, + "ewc_loss": 4.603026536642574e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.603026582117309e-07, + "grad_norm": 4.390087604522705, + "learning_rate": 
4.111911827045358e-08, + "loss": 0.7683, + "mean_token_accuracy": 0.7851440906524658, + "num_tokens": 3752834.0, + "step": 98 + }, + { + "epoch": 0.012593817580460501, + "ewc_loss": 5.667367440764792e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.667367304340587e-07, + "grad_norm": 4.3884406089782715, + "learning_rate": 4.154302670623145e-08, + "loss": 0.7786, + "mean_token_accuracy": 0.7810474038124084, + "num_tokens": 3791513.0, + "step": 99 + }, + { + "epoch": 0.012721027859051012, + "ewc_loss": 6.597409810638055e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.59740976516332e-07, + "grad_norm": 4.936411380767822, + "learning_rate": 4.1966935142009326e-08, + "loss": 0.8013, + "mean_token_accuracy": 0.7747751474380493, + "num_tokens": 3825085.0, + "step": 100 + }, + { + "epoch": 0.012848238137641521, + "ewc_loss": 7.411718252114952e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.411717888317071e-07, + "grad_norm": 4.298474311828613, + "learning_rate": 4.23908435777872e-08, + "loss": 0.8015, + "mean_token_accuracy": 0.7761023640632629, + "num_tokens": 3866492.0, + "step": 101 + }, + { + "epoch": 0.012975448416232032, + "ewc_loss": 8.066776354098693e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066776331361325e-07, + "grad_norm": 4.689382076263428, + "learning_rate": 4.281475201356507e-08, + "loss": 0.8286, + "mean_token_accuracy": 0.7632704377174377, + "num_tokens": 3904053.0, + "step": 102 + }, + { + "epoch": 0.013102658694822541, + "ewc_loss": 8.63305467646569e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.633055017526203e-07, + "grad_norm": 4.722155570983887, + "learning_rate": 4.3238660449342943e-08, + "loss": 0.7371, + "mean_token_accuracy": 0.7946819067001343, + "num_tokens": 3935118.0, + "step": 103 + }, + { + "epoch": 0.013229868973413052, + "ewc_loss": 9.11249007913284e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.112490033658105e-07, + "grad_norm": 4.2169575691223145, + "learning_rate": 4.3662568885120816e-08, + 
"loss": 0.7873, + "mean_token_accuracy": 0.7761164307594299, + "num_tokens": 3977899.0, + "step": 104 + }, + { + "epoch": 0.013357079252003561, + "ewc_loss": 9.432325168745592e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.432325214220327e-07, + "grad_norm": 4.226559162139893, + "learning_rate": 4.408647732089869e-08, + "loss": 0.8075, + "mean_token_accuracy": 0.773452877998352, + "num_tokens": 4019416.0, + "step": 105 + }, + { + "epoch": 0.013484289530594072, + "ewc_loss": 9.690649312688038e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.690648994364892e-07, + "grad_norm": 4.135715484619141, + "learning_rate": 4.451038575667656e-08, + "loss": 0.7967, + "mean_token_accuracy": 0.7725944519042969, + "num_tokens": 4063887.0, + "step": 106 + }, + { + "epoch": 0.013611499809184581, + "ewc_loss": 9.914322436088696e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.914322163240286e-07, + "grad_norm": 4.306079387664795, + "learning_rate": 4.493429419245443e-08, + "loss": 0.8274, + "mean_token_accuracy": 0.7680767774581909, + "num_tokens": 4101309.0, + "step": 107 + }, + { + "epoch": 0.013738710087775092, + "ewc_loss": 0.00010101697262143716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0101697398567921e-06, + "grad_norm": 4.459109306335449, + "learning_rate": 4.53582026282323e-08, + "loss": 0.8119, + "mean_token_accuracy": 0.7708051204681396, + "num_tokens": 4136669.0, + "step": 108 + }, + { + "epoch": 0.013865920366365603, + "ewc_loss": 0.00010261759598506615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.026175937113294e-06, + "grad_norm": 3.956848382949829, + "learning_rate": 4.578211106401017e-08, + "loss": 0.7429, + "mean_token_accuracy": 0.7900770306587219, + "num_tokens": 4178583.0, + "step": 109 + }, + { + "epoch": 0.013993130644956112, + "ewc_loss": 0.00010345209011575207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0345208920625737e-06, + "grad_norm": 3.989645481109619, + "learning_rate": 4.6206019499788045e-08, + "loss": 0.7735, + 
"mean_token_accuracy": 0.7775285840034485, + "num_tokens": 4219552.0, + "step": 110 + }, + { + "epoch": 0.014120340923546623, + "ewc_loss": 0.00010385019413661212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0385019777459092e-06, + "grad_norm": 4.030885696411133, + "learning_rate": 4.662992793556592e-08, + "loss": 0.7659, + "mean_token_accuracy": 0.7849342823028564, + "num_tokens": 4264487.0, + "step": 111 + }, + { + "epoch": 0.014247551202137132, + "ewc_loss": 0.0001043277297867462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.043277279677568e-06, + "grad_norm": 4.151076793670654, + "learning_rate": 4.705383637134379e-08, + "loss": 0.7324, + "mean_token_accuracy": 0.7901092767715454, + "num_tokens": 4303015.0, + "step": 112 + }, + { + "epoch": 0.014374761480727643, + "ewc_loss": 0.00010478847980266437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0478847798367497e-06, + "grad_norm": 4.078001022338867, + "learning_rate": 4.747774480712166e-08, + "loss": 0.7871, + "mean_token_accuracy": 0.7770717144012451, + "num_tokens": 4345446.0, + "step": 113 + }, + { + "epoch": 0.014501971759318152, + "ewc_loss": 0.00010482791549293324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.048279159476806e-06, + "grad_norm": 4.219941139221191, + "learning_rate": 4.7901653242899535e-08, + "loss": 0.7394, + "mean_token_accuracy": 0.7912425994873047, + "num_tokens": 4383854.0, + "step": 114 + }, + { + "epoch": 0.014629182037908663, + "ewc_loss": 0.00010479036427568644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0479036518518114e-06, + "grad_norm": 4.296047687530518, + "learning_rate": 4.832556167867741e-08, + "loss": 0.8083, + "mean_token_accuracy": 0.7731599807739258, + "num_tokens": 4420750.0, + "step": 115 + }, + { + "epoch": 0.014756392316499172, + "ewc_loss": 0.00010456618474563584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.045661861098779e-06, + "grad_norm": 4.426926612854004, + "learning_rate": 4.8749470114455274e-08, + "loss": 0.8054, + 
"mean_token_accuracy": 0.7730908393859863, + "num_tokens": 4458761.0, + "step": 116 + }, + { + "epoch": 0.014883602595089683, + "ewc_loss": 0.00010430485417600721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0430485417600721e-06, + "grad_norm": 4.178114891052246, + "learning_rate": 4.9173378550233146e-08, + "loss": 0.7655, + "mean_token_accuracy": 0.7804526090621948, + "num_tokens": 4496547.0, + "step": 117 + }, + { + "epoch": 0.015010812873680193, + "ewc_loss": 0.00010384112829342484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0384112556494074e-06, + "grad_norm": 4.2453765869140625, + "learning_rate": 4.959728698601102e-08, + "loss": 0.7636, + "mean_token_accuracy": 0.7894468903541565, + "num_tokens": 4533357.0, + "step": 118 + }, + { + "epoch": 0.015138023152270703, + "ewc_loss": 0.00010374727571615949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0374727708040155e-06, + "grad_norm": 4.052901744842529, + "learning_rate": 5.002119542178889e-08, + "loss": 0.7381, + "mean_token_accuracy": 0.7905097007751465, + "num_tokens": 4573570.0, + "step": 119 + }, + { + "epoch": 0.015265233430861214, + "ewc_loss": 0.0001037460460793227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0374604926255415e-06, + "grad_norm": 4.2761077880859375, + "learning_rate": 5.0445103857566764e-08, + "loss": 0.7928, + "mean_token_accuracy": 0.7750352621078491, + "num_tokens": 4613195.0, + "step": 120 + }, + { + "epoch": 0.015392443709451724, + "ewc_loss": 0.00010416262375656515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.041626205733337e-06, + "grad_norm": 3.899003505706787, + "learning_rate": 5.0869012293344637e-08, + "loss": 0.7135, + "mean_token_accuracy": 0.7980378866195679, + "num_tokens": 4657230.0, + "step": 121 + }, + { + "epoch": 0.015519653988042234, + "ewc_loss": 0.00010446763917570934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.044676423589408e-06, + "grad_norm": 4.075852394104004, + "learning_rate": 5.129292072912251e-08, + "loss": 0.7873, + 
"mean_token_accuracy": 0.7805888652801514, + "num_tokens": 4701436.0, + "step": 122 + }, + { + "epoch": 0.015646864266632744, + "ewc_loss": 0.00010496591130504385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.049659090313071e-06, + "grad_norm": 4.179802417755127, + "learning_rate": 5.171682916490038e-08, + "loss": 0.7748, + "mean_token_accuracy": 0.7806895971298218, + "num_tokens": 4741806.0, + "step": 123 + }, + { + "epoch": 0.015774074545223254, + "ewc_loss": 0.00010575260239420459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0575259921097313e-06, + "grad_norm": 4.394865036010742, + "learning_rate": 5.2140737600678254e-08, + "loss": 0.812, + "mean_token_accuracy": 0.7653789520263672, + "num_tokens": 4780671.0, + "step": 124 + }, + { + "epoch": 0.015901284823813765, + "ewc_loss": 0.00010673502401914448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0673502401914448e-06, + "grad_norm": 4.33425235748291, + "learning_rate": 5.256464603645612e-08, + "loss": 0.7606, + "mean_token_accuracy": 0.7859100699424744, + "num_tokens": 4816879.0, + "step": 125 + }, + { + "epoch": 0.016028495102404273, + "ewc_loss": 0.0001075809559551999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0758095640994725e-06, + "grad_norm": 4.223564147949219, + "learning_rate": 5.298855447223399e-08, + "loss": 0.8132, + "mean_token_accuracy": 0.7672591209411621, + "num_tokens": 4858704.0, + "step": 126 + }, + { + "epoch": 0.016155705380994784, + "ewc_loss": 0.00010871934500755742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0871934819078888e-06, + "grad_norm": 4.2156548500061035, + "learning_rate": 5.3412462908011865e-08, + "loss": 0.7698, + "mean_token_accuracy": 0.7827036380767822, + "num_tokens": 4896780.0, + "step": 127 + }, + { + "epoch": 0.016282915659585295, + "ewc_loss": 0.00011044953862437978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1044953680539038e-06, + "grad_norm": 4.46584939956665, + "learning_rate": 5.383637134378974e-08, + "loss": 0.7997, + 
"mean_token_accuracy": 0.7721617817878723, + "num_tokens": 4932108.0, + "step": 128 + }, + { + "epoch": 0.016410125938175806, + "ewc_loss": 0.0001129657102865167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1296571074126405e-06, + "grad_norm": 4.1689934730529785, + "learning_rate": 5.426027977956761e-08, + "loss": 0.7759, + "mean_token_accuracy": 0.782463014125824, + "num_tokens": 4970384.0, + "step": 129 + }, + { + "epoch": 0.016537336216766316, + "ewc_loss": 0.00011655940761556849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.165594085250632e-06, + "grad_norm": 4.524738788604736, + "learning_rate": 5.468418821534548e-08, + "loss": 0.7841, + "mean_token_accuracy": 0.7776435017585754, + "num_tokens": 5005996.0, + "step": 130 + }, + { + "epoch": 0.016664546495356824, + "ewc_loss": 0.0001220669801114127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2206697874717065e-06, + "grad_norm": 4.28096866607666, + "learning_rate": 5.5108096651123356e-08, + "loss": 0.7642, + "mean_token_accuracy": 0.7816352248191833, + "num_tokens": 5044208.0, + "step": 131 + }, + { + "epoch": 0.016791756773947335, + "ewc_loss": 0.00013034764560870826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3034764378971886e-06, + "grad_norm": 4.15233039855957, + "learning_rate": 5.553200508690123e-08, + "loss": 0.7786, + "mean_token_accuracy": 0.7766916751861572, + "num_tokens": 5085173.0, + "step": 132 + }, + { + "epoch": 0.016918967052537846, + "ewc_loss": 0.00014289583486970514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.42895839871926e-06, + "grad_norm": 4.622410774230957, + "learning_rate": 5.59559135226791e-08, + "loss": 0.7945, + "mean_token_accuracy": 0.7738397121429443, + "num_tokens": 5118440.0, + "step": 133 + }, + { + "epoch": 0.017046177331128357, + "ewc_loss": 0.00016252761997748166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6252762407020782e-06, + "grad_norm": 4.646082401275635, + "learning_rate": 5.637982195845697e-08, + "loss": 0.7978, + 
"mean_token_accuracy": 0.7742935419082642, + "num_tokens": 5155202.0, + "step": 134 + }, + { + "epoch": 0.017173387609718864, + "ewc_loss": 0.0001896893372759223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8968933090945939e-06, + "grad_norm": 4.4236578941345215, + "learning_rate": 5.680373039423484e-08, + "loss": 0.7826, + "mean_token_accuracy": 0.780472993850708, + "num_tokens": 5193240.0, + "step": 135 + }, + { + "epoch": 0.017300597888309375, + "ewc_loss": 0.00021875122911296785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1875123366044136e-06, + "grad_norm": 4.379848957061768, + "learning_rate": 5.722763883001271e-08, + "loss": 0.7702, + "mean_token_accuracy": 0.7805871963500977, + "num_tokens": 5230975.0, + "step": 136 + }, + { + "epoch": 0.017427808166899886, + "ewc_loss": 0.0002450462197884917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4504622615495464e-06, + "grad_norm": 4.791755676269531, + "learning_rate": 5.7651547265790585e-08, + "loss": 0.8311, + "mean_token_accuracy": 0.7639582753181458, + "num_tokens": 5264786.0, + "step": 137 + }, + { + "epoch": 0.017555018445490397, + "ewc_loss": 0.0002676925796549767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.676925760169979e-06, + "grad_norm": 4.3141255378723145, + "learning_rate": 5.807545570156846e-08, + "loss": 0.7936, + "mean_token_accuracy": 0.7743513584136963, + "num_tokens": 5301554.0, + "step": 138 + }, + { + "epoch": 0.017682228724080904, + "ewc_loss": 0.00028708268655464053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8708268473565113e-06, + "grad_norm": 4.2196125984191895, + "learning_rate": 5.849936413734633e-08, + "loss": 0.7322, + "mean_token_accuracy": 0.7907544374465942, + "num_tokens": 5337356.0, + "step": 139 + }, + { + "epoch": 0.017809439002671415, + "ewc_loss": 0.000304433488054201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.04433478959254e-06, + "grad_norm": 4.004998683929443, + "learning_rate": 5.89232725731242e-08, + "loss": 0.8032, + 
"mean_token_accuracy": 0.7746302485466003, + "num_tokens": 5375767.0, + "step": 140 + }, + { + "epoch": 0.017936649281261926, + "ewc_loss": 0.0003187364200130105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.187364200130105e-06, + "grad_norm": 3.9916319847106934, + "learning_rate": 5.9347181008902075e-08, + "loss": 0.7542, + "mean_token_accuracy": 0.7831220030784607, + "num_tokens": 5413683.0, + "step": 141 + }, + { + "epoch": 0.018063859559852437, + "ewc_loss": 0.00033146876376122236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.314687774036429e-06, + "grad_norm": 3.6769518852233887, + "learning_rate": 5.977108944467995e-08, + "loss": 0.7357, + "mean_token_accuracy": 0.7887019515037537, + "num_tokens": 5457750.0, + "step": 142 + }, + { + "epoch": 0.018191069838442948, + "ewc_loss": 0.00034088813117705286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4088811844412703e-06, + "grad_norm": 3.970010995864868, + "learning_rate": 6.019499788045781e-08, + "loss": 0.7301, + "mean_token_accuracy": 0.7899359464645386, + "num_tokens": 5495743.0, + "step": 143 + }, + { + "epoch": 0.018318280117033455, + "ewc_loss": 0.00034900030004791915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4900028822448803e-06, + "grad_norm": 3.946129083633423, + "learning_rate": 6.061890631623569e-08, + "loss": 0.7431, + "mean_token_accuracy": 0.7821722030639648, + "num_tokens": 5532198.0, + "step": 144 + }, + { + "epoch": 0.018445490395623966, + "ewc_loss": 0.00035560023388825357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.556002411642112e-06, + "grad_norm": 4.0992560386657715, + "learning_rate": 6.104281475201356e-08, + "loss": 0.7557, + "mean_token_accuracy": 0.7829784154891968, + "num_tokens": 5568977.0, + "step": 145 + }, + { + "epoch": 0.018572700674214477, + "ewc_loss": 0.0003614771121647209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.614771230786573e-06, + "grad_norm": 3.9854204654693604, + "learning_rate": 6.146672318779143e-08, + "loss": 0.7678, + 
"mean_token_accuracy": 0.774768054485321, + "num_tokens": 5606229.0, + "step": 146 + }, + { + "epoch": 0.018699910952804988, + "ewc_loss": 0.0003651452134363353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.651452061603777e-06, + "grad_norm": 3.6308324337005615, + "learning_rate": 6.18906316235693e-08, + "loss": 0.7694, + "mean_token_accuracy": 0.777070164680481, + "num_tokens": 5649828.0, + "step": 147 + }, + { + "epoch": 0.018827121231395495, + "ewc_loss": 0.0003665225813165307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6652256767411018e-06, + "grad_norm": 3.812753438949585, + "learning_rate": 6.231454005934718e-08, + "loss": 0.7152, + "mean_token_accuracy": 0.7971658706665039, + "num_tokens": 5687433.0, + "step": 148 + }, + { + "epoch": 0.018954331509986006, + "ewc_loss": 0.0003678420034702867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.678419943753397e-06, + "grad_norm": 3.8758323192596436, + "learning_rate": 6.273844849512505e-08, + "loss": 0.7464, + "mean_token_accuracy": 0.7828956246376038, + "num_tokens": 5723247.0, + "step": 149 + }, + { + "epoch": 0.019081541788576517, + "ewc_loss": 0.0003692273166961968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.692273139677127e-06, + "grad_norm": 3.7162859439849854, + "learning_rate": 6.316235693090292e-08, + "loss": 0.7613, + "mean_token_accuracy": 0.7790429592132568, + "num_tokens": 5760305.0, + "step": 150 + }, + { + "epoch": 0.019208752067167028, + "ewc_loss": 0.00036969833308830857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6969831853639334e-06, + "grad_norm": 3.4725286960601807, + "learning_rate": 6.35862653666808e-08, + "loss": 0.7617, + "mean_token_accuracy": 0.7774688005447388, + "num_tokens": 5800586.0, + "step": 151 + }, + { + "epoch": 0.01933596234575754, + "ewc_loss": 0.0003680977097246796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6809769881074317e-06, + "grad_norm": 3.54604434967041, + "learning_rate": 6.401017380245867e-08, + "loss": 0.7536, + "mean_token_accuracy": 
0.7788660526275635, + "num_tokens": 5840351.0, + "step": 152 + }, + { + "epoch": 0.019463172624348046, + "ewc_loss": 0.00036632618866860867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6632618503062986e-06, + "grad_norm": 3.6035165786743164, + "learning_rate": 6.443408223823654e-08, + "loss": 0.794, + "mean_token_accuracy": 0.7710922956466675, + "num_tokens": 5880162.0, + "step": 153 + }, + { + "epoch": 0.019590382902938557, + "ewc_loss": 0.00036491238279268146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6491237551672384e-06, + "grad_norm": 3.752415418624878, + "learning_rate": 6.485799067401441e-08, + "loss": 0.8498, + "mean_token_accuracy": 0.7557827830314636, + "num_tokens": 5922213.0, + "step": 154 + }, + { + "epoch": 0.019717593181529068, + "ewc_loss": 0.00036347744753584266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6347744298836915e-06, + "grad_norm": 3.6278419494628906, + "learning_rate": 6.528189910979228e-08, + "loss": 0.7311, + "mean_token_accuracy": 0.7862687706947327, + "num_tokens": 5957461.0, + "step": 155 + }, + { + "epoch": 0.01984480346011958, + "ewc_loss": 0.0003614526940509677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6145268040854717e-06, + "grad_norm": 3.4591753482818604, + "learning_rate": 6.570580754557016e-08, + "loss": 0.7164, + "mean_token_accuracy": 0.7853863835334778, + "num_tokens": 5994675.0, + "step": 156 + }, + { + "epoch": 0.019972013738710086, + "ewc_loss": 0.0003589958942029625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.589958851080155e-06, + "grad_norm": 3.55952787399292, + "learning_rate": 6.612971598134802e-08, + "loss": 0.6794, + "mean_token_accuracy": 0.8020228147506714, + "num_tokens": 6032514.0, + "step": 157 + }, + { + "epoch": 0.020099224017300597, + "ewc_loss": 0.00035649409983307123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.564940925571136e-06, + "grad_norm": 3.5808534622192383, + "learning_rate": 6.655362441712589e-08, + "loss": 0.7514, + "mean_token_accuracy": 
0.780269205570221, + "num_tokens": 6069160.0, + "step": 158 + }, + { + "epoch": 0.020226434295891108, + "ewc_loss": 0.0003539050230756402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5390501125220908e-06, + "grad_norm": 3.6656975746154785, + "learning_rate": 6.697753285290376e-08, + "loss": 0.8057, + "mean_token_accuracy": 0.7652347087860107, + "num_tokens": 6106652.0, + "step": 159 + }, + { + "epoch": 0.02035364457448162, + "ewc_loss": 0.00035182209103368223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.518220864862087e-06, + "grad_norm": 3.581108570098877, + "learning_rate": 6.740144128868163e-08, + "loss": 0.7401, + "mean_token_accuracy": 0.7860910296440125, + "num_tokens": 6143070.0, + "step": 160 + }, + { + "epoch": 0.020480854853072127, + "ewc_loss": 0.0003489922091830522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.489922164590098e-06, + "grad_norm": 3.2496259212493896, + "learning_rate": 6.78253497244595e-08, + "loss": 0.7325, + "mean_token_accuracy": 0.7869266867637634, + "num_tokens": 6186495.0, + "step": 161 + }, + { + "epoch": 0.020608065131662637, + "ewc_loss": 0.0003453808312769979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4538084037194494e-06, + "grad_norm": 3.540105104446411, + "learning_rate": 6.824925816023738e-08, + "loss": 0.7325, + "mean_token_accuracy": 0.7863866090774536, + "num_tokens": 6222454.0, + "step": 162 + }, + { + "epoch": 0.02073527541025315, + "ewc_loss": 0.00034338500699959695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4338499972363934e-06, + "grad_norm": 4.049264430999756, + "learning_rate": 6.867316659601525e-08, + "loss": 0.7445, + "mean_token_accuracy": 0.7859707474708557, + "num_tokens": 6253260.0, + "step": 163 + }, + { + "epoch": 0.02086248568884366, + "ewc_loss": 0.0003430463548284024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.430463493714342e-06, + "grad_norm": 3.3347156047821045, + "learning_rate": 6.909707503179312e-08, + "loss": 0.7264, + "mean_token_accuracy": 0.7904532551765442, + 
"num_tokens": 6294773.0, + "step": 164 + }, + { + "epoch": 0.02098969596743417, + "ewc_loss": 0.00034026347566396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.402634774829494e-06, + "grad_norm": 3.397390127182007, + "learning_rate": 6.9520983467571e-08, + "loss": 0.7695, + "mean_token_accuracy": 0.7759827375411987, + "num_tokens": 6335334.0, + "step": 165 + }, + { + "epoch": 0.021116906246024678, + "ewc_loss": 0.00033816814539022744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3816813811426982e-06, + "grad_norm": 3.4506521224975586, + "learning_rate": 6.994489190334887e-08, + "loss": 0.7044, + "mean_token_accuracy": 0.7922151684761047, + "num_tokens": 6372255.0, + "step": 166 + }, + { + "epoch": 0.02124411652461519, + "ewc_loss": 0.0003373103972990066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3731039366102777e-06, + "grad_norm": 3.19042706489563, + "learning_rate": 7.036880033912674e-08, + "loss": 0.6813, + "mean_token_accuracy": 0.800818681716919, + "num_tokens": 6411971.0, + "step": 167 + }, + { + "epoch": 0.0213713268032057, + "ewc_loss": 0.0003355391672812402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.35539152729325e-06, + "grad_norm": 3.6956348419189453, + "learning_rate": 7.079270877490461e-08, + "loss": 0.7512, + "mean_token_accuracy": 0.7800983786582947, + "num_tokens": 6450366.0, + "step": 168 + }, + { + "epoch": 0.02149853708179621, + "ewc_loss": 0.00033615852589719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3615851862123236e-06, + "grad_norm": 3.634810447692871, + "learning_rate": 7.121661721068249e-08, + "loss": 0.7137, + "mean_token_accuracy": 0.7895742654800415, + "num_tokens": 6482199.0, + "step": 169 + }, + { + "epoch": 0.021625747360386718, + "ewc_loss": 0.00033661932684481144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.366193141118856e-06, + "grad_norm": 3.322000026702881, + "learning_rate": 7.164052564646036e-08, + "loss": 0.6955, + "mean_token_accuracy": 0.7984988689422607, + "num_tokens": 6519697.0, + 
"step": 170 + }, + { + "epoch": 0.02175295763897723, + "ewc_loss": 0.0003364519798196852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.364519670867594e-06, + "grad_norm": 3.5600907802581787, + "learning_rate": 7.206443408223823e-08, + "loss": 0.7554, + "mean_token_accuracy": 0.7789697647094727, + "num_tokens": 6554687.0, + "step": 171 + }, + { + "epoch": 0.02188016791756774, + "ewc_loss": 0.0003375217202119529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.375217147549847e-06, + "grad_norm": 3.5988576412200928, + "learning_rate": 7.24883425180161e-08, + "loss": 0.7185, + "mean_token_accuracy": 0.783344030380249, + "num_tokens": 6593066.0, + "step": 172 + }, + { + "epoch": 0.02200737819615825, + "ewc_loss": 0.0003397689142730087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.397689170014928e-06, + "grad_norm": 3.175225257873535, + "learning_rate": 7.291225095379398e-08, + "loss": 0.6769, + "mean_token_accuracy": 0.8025361895561218, + "num_tokens": 6635484.0, + "step": 173 + }, + { + "epoch": 0.02213458847474876, + "ewc_loss": 0.00034114159643650055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4114159461751115e-06, + "grad_norm": 3.4203908443450928, + "learning_rate": 7.333615938957185e-08, + "loss": 0.769, + "mean_token_accuracy": 0.7768411040306091, + "num_tokens": 6677090.0, + "step": 174 + }, + { + "epoch": 0.02226179875333927, + "ewc_loss": 0.00034391990629956126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4391989629511954e-06, + "grad_norm": 3.662269115447998, + "learning_rate": 7.376006782534971e-08, + "loss": 0.6836, + "mean_token_accuracy": 0.7973049879074097, + "num_tokens": 6710250.0, + "step": 175 + }, + { + "epoch": 0.02238900903192978, + "ewc_loss": 0.00034838163992390037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4838162719097454e-06, + "grad_norm": 3.2468338012695312, + "learning_rate": 7.418397626112758e-08, + "loss": 0.7105, + "mean_token_accuracy": 0.7888486385345459, + "num_tokens": 6752998.0, + "step": 176 + }, + { + 
"epoch": 0.02251621931052029, + "ewc_loss": 0.00035199616104364395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5199616377212806e-06, + "grad_norm": 3.4440743923187256, + "learning_rate": 7.460788469690545e-08, + "loss": 0.7113, + "mean_token_accuracy": 0.7942492961883545, + "num_tokens": 6789568.0, + "step": 177 + }, + { + "epoch": 0.0226434295891108, + "ewc_loss": 0.00035682227462530136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5682228372024838e-06, + "grad_norm": 4.045323371887207, + "learning_rate": 7.503179313268333e-08, + "loss": 0.7518, + "mean_token_accuracy": 0.7786601781845093, + "num_tokens": 6822810.0, + "step": 178 + }, + { + "epoch": 0.02277063986770131, + "ewc_loss": 0.00036471482599154115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6471483326749876e-06, + "grad_norm": 3.1763625144958496, + "learning_rate": 7.54557015684612e-08, + "loss": 0.6905, + "mean_token_accuracy": 0.794792115688324, + "num_tokens": 6861598.0, + "step": 179 + }, + { + "epoch": 0.02289785014629182, + "ewc_loss": 0.00037053460255265236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7053459891467355e-06, + "grad_norm": 3.154878854751587, + "learning_rate": 7.587961000423907e-08, + "loss": 0.7128, + "mean_token_accuracy": 0.7914278507232666, + "num_tokens": 6900220.0, + "step": 180 + }, + { + "epoch": 0.02302506042488233, + "ewc_loss": 0.0003778205718845129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.778205609705765e-06, + "grad_norm": 3.2488250732421875, + "learning_rate": 7.630351844001694e-08, + "loss": 0.8207, + "mean_token_accuracy": 0.7609337568283081, + "num_tokens": 6946983.0, + "step": 181 + }, + { + "epoch": 0.02315227070347284, + "ewc_loss": 0.0003871825465466827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.871825356327463e-06, + "grad_norm": 3.342353343963623, + "learning_rate": 7.672742687579482e-08, + "loss": 0.725, + "mean_token_accuracy": 0.7887452244758606, + "num_tokens": 6986019.0, + "step": 182 + }, + { + "epoch": 
0.02327948098206335, + "ewc_loss": 0.00039782904786989093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.978290351369651e-06, + "grad_norm": 3.5618739128112793, + "learning_rate": 7.715133531157269e-08, + "loss": 0.6987, + "mean_token_accuracy": 0.7903072237968445, + "num_tokens": 7020722.0, + "step": 183 + }, + { + "epoch": 0.02340669126065386, + "ewc_loss": 0.0004107029235456139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.107029326405609e-06, + "grad_norm": 3.742722272872925, + "learning_rate": 7.757524374735056e-08, + "loss": 0.7699, + "mean_token_accuracy": 0.7754470705986023, + "num_tokens": 7062691.0, + "step": 184 + }, + { + "epoch": 0.02353390153924437, + "ewc_loss": 0.0004261224530637264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.26122460339684e-06, + "grad_norm": 3.5655972957611084, + "learning_rate": 7.799915218312844e-08, + "loss": 0.7174, + "mean_token_accuracy": 0.7903645634651184, + "num_tokens": 7101090.0, + "step": 185 + }, + { + "epoch": 0.02366111181783488, + "ewc_loss": 0.00044188444735482335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.418844582687598e-06, + "grad_norm": 3.208709955215454, + "learning_rate": 7.842306061890631e-08, + "loss": 0.6901, + "mean_token_accuracy": 0.7916736006736755, + "num_tokens": 7139891.0, + "step": 186 + }, + { + "epoch": 0.023788322096425393, + "ewc_loss": 0.0004573480982799083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.573480964609189e-06, + "grad_norm": 3.450378656387329, + "learning_rate": 7.88469690546842e-08, + "loss": 0.7381, + "mean_token_accuracy": 0.7828174233436584, + "num_tokens": 7179501.0, + "step": 187 + }, + { + "epoch": 0.0239155323750159, + "ewc_loss": 0.0004746659251395613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.746659215015825e-06, + "grad_norm": 3.5313074588775635, + "learning_rate": 7.927087749046207e-08, + "loss": 0.7069, + "mean_token_accuracy": 0.7896876931190491, + "num_tokens": 7213809.0, + "step": 188 + }, + { + "epoch": 0.02404274265360641, + 
"ewc_loss": 0.0004921420477330685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.921420440950897e-06, + "grad_norm": 3.549903392791748, + "learning_rate": 7.969478592623994e-08, + "loss": 0.7811, + "mean_token_accuracy": 0.7681836485862732, + "num_tokens": 7254493.0, + "step": 189 + }, + { + "epoch": 0.024169952932196922, + "ewc_loss": 0.0005092078354209661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.09207848153892e-06, + "grad_norm": 4.485507965087891, + "learning_rate": 8.011869436201781e-08, + "loss": 0.6389, + "mean_token_accuracy": 0.8108700513839722, + "num_tokens": 7292643.0, + "step": 190 + }, + { + "epoch": 0.024297163210787433, + "ewc_loss": 0.000530730583705008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.307305855239974e-06, + "grad_norm": 3.386260509490967, + "learning_rate": 8.054260279779568e-08, + "loss": 0.7607, + "mean_token_accuracy": 0.7782121300697327, + "num_tokens": 7329810.0, + "step": 191 + }, + { + "epoch": 0.02442437348937794, + "ewc_loss": 0.0005456525832414627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.456525741465157e-06, + "grad_norm": 3.595283031463623, + "learning_rate": 8.096651123357356e-08, + "loss": 0.7467, + "mean_token_accuracy": 0.7795183658599854, + "num_tokens": 7367630.0, + "step": 192 + }, + { + "epoch": 0.02455158376796845, + "ewc_loss": 0.0005609397776424885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.609397703665309e-06, + "grad_norm": 3.060606002807617, + "learning_rate": 8.139041966935143e-08, + "loss": 0.7281, + "mean_token_accuracy": 0.7853956818580627, + "num_tokens": 7411580.0, + "step": 193 + }, + { + "epoch": 0.024678794046558962, + "ewc_loss": 0.000571579032111913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.715790393878706e-06, + "grad_norm": 3.5034303665161133, + "learning_rate": 8.181432810512929e-08, + "loss": 0.6819, + "mean_token_accuracy": 0.7971084117889404, + "num_tokens": 7451631.0, + "step": 194 + }, + { + "epoch": 0.024806004325149473, + "ewc_loss": 
0.0005849464214406908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.849464287166484e-06, + "grad_norm": 3.8535215854644775, + "learning_rate": 8.223823654090716e-08, + "loss": 0.6908, + "mean_token_accuracy": 0.7942193746566772, + "num_tokens": 7493645.0, + "step": 195 + }, + { + "epoch": 0.024933214603739984, + "ewc_loss": 0.0006005047471262515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0050474530726206e-06, + "grad_norm": 3.8461544513702393, + "learning_rate": 8.266214497668503e-08, + "loss": 0.651, + "mean_token_accuracy": 0.8081040382385254, + "num_tokens": 7538042.0, + "step": 196 + }, + { + "epoch": 0.02506042488233049, + "ewc_loss": 0.0006156490417197347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.156490599096287e-06, + "grad_norm": 3.778759241104126, + "learning_rate": 8.30860534124629e-08, + "loss": 0.7128, + "mean_token_accuracy": 0.7881405353546143, + "num_tokens": 7575374.0, + "step": 197 + }, + { + "epoch": 0.025187635160921002, + "ewc_loss": 0.000630229595117271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.302296242211014e-06, + "grad_norm": 4.013764381408691, + "learning_rate": 8.350996184824078e-08, + "loss": 0.7051, + "mean_token_accuracy": 0.7894933819770813, + "num_tokens": 7606880.0, + "step": 198 + }, + { + "epoch": 0.025314845439511513, + "ewc_loss": 0.0006445495528168976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.445495728257811e-06, + "grad_norm": 3.7664992809295654, + "learning_rate": 8.393387028401865e-08, + "loss": 0.6829, + "mean_token_accuracy": 0.79892897605896, + "num_tokens": 7644840.0, + "step": 199 + }, + { + "epoch": 0.025442055718102024, + "ewc_loss": 0.0006559249013662338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.559248959092656e-06, + "grad_norm": 4.366433620452881, + "learning_rate": 8.435777871979652e-08, + "loss": 0.724, + "mean_token_accuracy": 0.7844232320785522, + "num_tokens": 7683856.0, + "step": 200 + }, + { + "epoch": 0.02556926599669253, + "ewc_loss": 0.0006684476393274963, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.684476375085069e-06, + "grad_norm": 4.211787223815918, + "learning_rate": 8.47816871555744e-08, + "loss": 0.6808, + "mean_token_accuracy": 0.7983664274215698, + "num_tokens": 7715306.0, + "step": 201 + }, + { + "epoch": 0.025696476275283042, + "ewc_loss": 0.0006793496431782842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.793496595491888e-06, + "grad_norm": 5.2747697830200195, + "learning_rate": 8.520559559135227e-08, + "loss": 0.6418, + "mean_token_accuracy": 0.8080286979675293, + "num_tokens": 7752442.0, + "step": 202 + }, + { + "epoch": 0.025823686553873553, + "ewc_loss": 0.0006928083603270352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.928083621460246e-06, + "grad_norm": 5.975701808929443, + "learning_rate": 8.562950402713014e-08, + "loss": 0.7272, + "mean_token_accuracy": 0.7870534658432007, + "num_tokens": 7799891.0, + "step": 203 + }, + { + "epoch": 0.025950896832464064, + "ewc_loss": 0.0007074225577525795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.074225777614629e-06, + "grad_norm": 4.823026180267334, + "learning_rate": 8.605341246290801e-08, + "loss": 0.7137, + "mean_token_accuracy": 0.7853132486343384, + "num_tokens": 7830696.0, + "step": 204 + }, + { + "epoch": 0.026078107111054575, + "ewc_loss": 0.0007160219829529524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.160219865909312e-06, + "grad_norm": 3.8518521785736084, + "learning_rate": 8.647732089868589e-08, + "loss": 0.6851, + "mean_token_accuracy": 0.7979569435119629, + "num_tokens": 7870968.0, + "step": 205 + }, + { + "epoch": 0.026205317389645082, + "ewc_loss": 0.000714397756382823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.1439776547777e-06, + "grad_norm": 2.9823665618896484, + "learning_rate": 8.690122933446376e-08, + "loss": 0.7097, + "mean_token_accuracy": 0.7894988059997559, + "num_tokens": 7914418.0, + "step": 206 + }, + { + "epoch": 0.026332527668235593, + "ewc_loss": 0.0007015129085630178, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.0151290856301785e-06, + "grad_norm": 4.886409759521484, + "learning_rate": 8.732513777024163e-08, + "loss": 0.6475, + "mean_token_accuracy": 0.8116672039031982, + "num_tokens": 7954544.0, + "step": 207 + }, + { + "epoch": 0.026459737946826104, + "ewc_loss": 0.0007002110360190272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.002110578469001e-06, + "grad_norm": 5.681060314178467, + "learning_rate": 8.77490462060195e-08, + "loss": 0.6963, + "mean_token_accuracy": 0.7958777546882629, + "num_tokens": 7997373.0, + "step": 208 + }, + { + "epoch": 0.026586948225416615, + "ewc_loss": 0.0007049511186778545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.049511168588651e-06, + "grad_norm": 4.552581310272217, + "learning_rate": 8.817295464179738e-08, + "loss": 0.659, + "mean_token_accuracy": 0.7985463738441467, + "num_tokens": 8033314.0, + "step": 209 + }, + { + "epoch": 0.026714158504007122, + "ewc_loss": 0.0007072828593663871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.072828793752706e-06, + "grad_norm": 3.1363964080810547, + "learning_rate": 8.859686307757525e-08, + "loss": 0.6789, + "mean_token_accuracy": 0.7988103032112122, + "num_tokens": 8069362.0, + "step": 210 + }, + { + "epoch": 0.026841368782597633, + "ewc_loss": 0.000697341572958976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.973415565880714e-06, + "grad_norm": 3.667736053466797, + "learning_rate": 8.902077151335312e-08, + "loss": 0.7111, + "mean_token_accuracy": 0.7866877317428589, + "num_tokens": 8109126.0, + "step": 211 + }, + { + "epoch": 0.026968579061188144, + "ewc_loss": 0.0006909466465003788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.909466264914954e-06, + "grad_norm": 4.620452404022217, + "learning_rate": 8.944467994913098e-08, + "loss": 0.7643, + "mean_token_accuracy": 0.7787548303604126, + "num_tokens": 8143083.0, + "step": 212 + }, + { + "epoch": 0.027095789339778655, + "ewc_loss": 0.0006919669103808701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.919668976479443e-06, + "grad_norm": 5.124137878417969, + "learning_rate": 8.986858838490885e-08, + "loss": 0.615, + "mean_token_accuracy": 0.810796856880188, + "num_tokens": 8176636.0, + "step": 213 + }, + { + "epoch": 0.027222999618369163, + "ewc_loss": 0.0006963027408346534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.963027317397064e-06, + "grad_norm": 3.620912790298462, + "learning_rate": 9.029249682068673e-08, + "loss": 0.6755, + "mean_token_accuracy": 0.7974115014076233, + "num_tokens": 8220262.0, + "step": 214 + }, + { + "epoch": 0.027350209896959674, + "ewc_loss": 0.0006912194658070803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.912194749020273e-06, + "grad_norm": 3.2665514945983887, + "learning_rate": 9.07164052564646e-08, + "loss": 0.6924, + "mean_token_accuracy": 0.7942905426025391, + "num_tokens": 8256303.0, + "step": 215 + }, + { + "epoch": 0.027477420175550184, + "ewc_loss": 0.0006833206862211227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.833206953160698e-06, + "grad_norm": 4.849654197692871, + "learning_rate": 9.114031369224247e-08, + "loss": 0.6593, + "mean_token_accuracy": 0.8054441213607788, + "num_tokens": 8293888.0, + "step": 216 + }, + { + "epoch": 0.027604630454140695, + "ewc_loss": 0.0006850952049717307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.850952104286989e-06, + "grad_norm": 5.546675205230713, + "learning_rate": 9.156422212802034e-08, + "loss": 0.6206, + "mean_token_accuracy": 0.8129088878631592, + "num_tokens": 8335368.0, + "step": 217 + }, + { + "epoch": 0.027731840732731206, + "ewc_loss": 0.0006899221916683018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.899222171341535e-06, + "grad_norm": 5.137253761291504, + "learning_rate": 9.198813056379822e-08, + "loss": 0.6749, + "mean_token_accuracy": 0.8005613088607788, + "num_tokens": 8370338.0, + "step": 218 + }, + { + "epoch": 0.027859051011321714, + "ewc_loss": 0.0006913031102158129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.913031029398553e-06, + 
"grad_norm": 3.933678388595581, + "learning_rate": 9.241203899957609e-08, + "loss": 0.6607, + "mean_token_accuracy": 0.8012058138847351, + "num_tokens": 8409904.0, + "step": 219 + }, + { + "epoch": 0.027986261289912225, + "ewc_loss": 0.0006857641856186092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.85764189256588e-06, + "grad_norm": 3.515981674194336, + "learning_rate": 9.283594743535396e-08, + "loss": 0.787, + "mean_token_accuracy": 0.768871545791626, + "num_tokens": 8446579.0, + "step": 220 + }, + { + "epoch": 0.028113471568502735, + "ewc_loss": 0.0006750237662345171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.750237844244111e-06, + "grad_norm": 3.442312240600586, + "learning_rate": 9.325985587113183e-08, + "loss": 0.6355, + "mean_token_accuracy": 0.8109914064407349, + "num_tokens": 8483663.0, + "step": 221 + }, + { + "epoch": 0.028240681847093246, + "ewc_loss": 0.0006635622121393681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.635621957684634e-06, + "grad_norm": 4.02705192565918, + "learning_rate": 9.368376430690971e-08, + "loss": 0.6643, + "mean_token_accuracy": 0.8013046979904175, + "num_tokens": 8516346.0, + "step": 222 + }, + { + "epoch": 0.028367892125683754, + "ewc_loss": 0.0006583615322597325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.583615231647855e-06, + "grad_norm": 3.6334228515625, + "learning_rate": 9.410767274268758e-08, + "loss": 0.6616, + "mean_token_accuracy": 0.8032100200653076, + "num_tokens": 8556914.0, + "step": 223 + }, + { + "epoch": 0.028495102404274265, + "ewc_loss": 0.0006531512481160462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.531512553920038e-06, + "grad_norm": 3.9791533946990967, + "learning_rate": 9.453158117846545e-08, + "loss": 0.6507, + "mean_token_accuracy": 0.8023205399513245, + "num_tokens": 8596548.0, + "step": 224 + }, + { + "epoch": 0.028622312682864776, + "ewc_loss": 0.0006520694005303085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.520694114442449e-06, + "grad_norm": 
4.877070903778076, + "learning_rate": 9.495548961424333e-08, + "loss": 0.6807, + "mean_token_accuracy": 0.8000100255012512, + "num_tokens": 8634366.0, + "step": 225 + }, + { + "epoch": 0.028749522961455286, + "ewc_loss": 0.0006565462681464851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.565462626895169e-06, + "grad_norm": 4.909416198730469, + "learning_rate": 9.53793980500212e-08, + "loss": 0.6205, + "mean_token_accuracy": 0.8117645978927612, + "num_tokens": 8666480.0, + "step": 226 + }, + { + "epoch": 0.028876733240045797, + "ewc_loss": 0.0006589732365682721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.589732493011979e-06, + "grad_norm": 5.39588737487793, + "learning_rate": 9.580330648579907e-08, + "loss": 0.7706, + "mean_token_accuracy": 0.7759625911712646, + "num_tokens": 8705880.0, + "step": 227 + }, + { + "epoch": 0.029003943518636305, + "ewc_loss": 0.0006616112077608705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.616111932089552e-06, + "grad_norm": 4.180953502655029, + "learning_rate": 9.622721492157694e-08, + "loss": 0.6319, + "mean_token_accuracy": 0.8088146448135376, + "num_tokens": 8743772.0, + "step": 228 + }, + { + "epoch": 0.029131153797226816, + "ewc_loss": 0.0006555215804837644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.555215804837644e-06, + "grad_norm": 4.537073612213135, + "learning_rate": 9.665112335735482e-08, + "loss": 0.6568, + "mean_token_accuracy": 0.7993515729904175, + "num_tokens": 8775919.0, + "step": 229 + }, + { + "epoch": 0.029258364075817327, + "ewc_loss": 0.0006502695032395422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.502694759547012e-06, + "grad_norm": 3.519524097442627, + "learning_rate": 9.707503179313267e-08, + "loss": 0.6052, + "mean_token_accuracy": 0.8149606585502625, + "num_tokens": 8809336.0, + "step": 230 + }, + { + "epoch": 0.029385574354407838, + "ewc_loss": 0.0006383247091434896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.383247182384366e-06, + "grad_norm": 4.236699104309082, + 
"learning_rate": 9.749894022891055e-08, + "loss": 0.6589, + "mean_token_accuracy": 0.8014929294586182, + "num_tokens": 8845952.0, + "step": 231 + }, + { + "epoch": 0.029512784632998345, + "ewc_loss": 0.0006327963783405721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.327963546937099e-06, + "grad_norm": 3.417647361755371, + "learning_rate": 9.792284866468842e-08, + "loss": 0.679, + "mean_token_accuracy": 0.7930188775062561, + "num_tokens": 8889801.0, + "step": 232 + }, + { + "epoch": 0.029639994911588856, + "ewc_loss": 0.0006227918202057481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.227918220247375e-06, + "grad_norm": 4.258319854736328, + "learning_rate": 9.834675710046629e-08, + "loss": 0.6383, + "mean_token_accuracy": 0.8042208552360535, + "num_tokens": 8925429.0, + "step": 233 + }, + { + "epoch": 0.029767205190179367, + "ewc_loss": 0.0006194916786625981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.194916750246193e-06, + "grad_norm": 3.97647762298584, + "learning_rate": 9.877066553624416e-08, + "loss": 0.6709, + "mean_token_accuracy": 0.7976471781730652, + "num_tokens": 8963360.0, + "step": 234 + }, + { + "epoch": 0.029894415468769878, + "ewc_loss": 0.0006158780888654292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.158781161502702e-06, + "grad_norm": 4.137073516845703, + "learning_rate": 9.919457397202204e-08, + "loss": 0.6816, + "mean_token_accuracy": 0.7974491715431213, + "num_tokens": 8998314.0, + "step": 235 + }, + { + "epoch": 0.030021625747360385, + "ewc_loss": 0.0006137770251370966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.137770469649695e-06, + "grad_norm": 3.9738404750823975, + "learning_rate": 9.961848240779991e-08, + "loss": 0.7508, + "mean_token_accuracy": 0.7786131501197815, + "num_tokens": 9034804.0, + "step": 236 + }, + { + "epoch": 0.030148836025950896, + "ewc_loss": 0.0006113963900133967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.113963991083438e-06, + "grad_norm": 4.381372451782227, + "learning_rate": 
1.0004239084357778e-07, + "loss": 0.6737, + "mean_token_accuracy": 0.7987175583839417, + "num_tokens": 9070840.0, + "step": 237 + }, + { + "epoch": 0.030276046304541407, + "ewc_loss": 0.000611435913015157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.114359166531358e-06, + "grad_norm": 4.6586809158325195, + "learning_rate": 1.0046629927935566e-07, + "loss": 0.6558, + "mean_token_accuracy": 0.8036947250366211, + "num_tokens": 9109370.0, + "step": 238 + }, + { + "epoch": 0.030403256583131918, + "ewc_loss": 0.0006117543671280146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.117543762229616e-06, + "grad_norm": 3.788240432739258, + "learning_rate": 1.0089020771513353e-07, + "loss": 0.5996, + "mean_token_accuracy": 0.8199456930160522, + "num_tokens": 9148191.0, + "step": 239 + }, + { + "epoch": 0.03053046686172243, + "ewc_loss": 0.0006054450059309602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.054449841030873e-06, + "grad_norm": 4.7593584060668945, + "learning_rate": 1.013141161509114e-07, + "loss": 0.6662, + "mean_token_accuracy": 0.8003988265991211, + "num_tokens": 9182923.0, + "step": 240 + }, + { + "epoch": 0.030657677140312936, + "ewc_loss": 0.0006051924428902566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.051924628991401e-06, + "grad_norm": 4.504434108734131, + "learning_rate": 1.0173802458668927e-07, + "loss": 0.6654, + "mean_token_accuracy": 0.7971804141998291, + "num_tokens": 9215882.0, + "step": 241 + }, + { + "epoch": 0.030784887418903447, + "ewc_loss": 0.0006026868359185755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0268685047049075e-06, + "grad_norm": 6.157735347747803, + "learning_rate": 1.0216193302246715e-07, + "loss": 0.6974, + "mean_token_accuracy": 0.7863467931747437, + "num_tokens": 9249718.0, + "step": 242 + }, + { + "epoch": 0.030912097697493958, + "ewc_loss": 0.0006072124233469367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0721245063177776e-06, + "grad_norm": 5.028306007385254, + "learning_rate": 
1.0258584145824502e-07, + "loss": 0.6204, + "mean_token_accuracy": 0.8132050037384033, + "num_tokens": 9291146.0, + "step": 243 + }, + { + "epoch": 0.03103930797608447, + "ewc_loss": 0.0006053001852706075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.053001925465651e-06, + "grad_norm": 4.93041467666626, + "learning_rate": 1.0300974989402289e-07, + "loss": 0.6118, + "mean_token_accuracy": 0.8132947087287903, + "num_tokens": 9326403.0, + "step": 244 + }, + { + "epoch": 0.031166518254674976, + "ewc_loss": 0.0006021556328050792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.021556600899203e-06, + "grad_norm": 4.58070707321167, + "learning_rate": 1.0343365832980076e-07, + "loss": 0.6651, + "mean_token_accuracy": 0.8000174760818481, + "num_tokens": 9368491.0, + "step": 245 + }, + { + "epoch": 0.03129372853326549, + "ewc_loss": 0.0005959724076092243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.959724148851819e-06, + "grad_norm": 5.286329746246338, + "learning_rate": 1.0385756676557864e-07, + "loss": 0.6604, + "mean_token_accuracy": 0.8024551272392273, + "num_tokens": 9409041.0, + "step": 246 + }, + { + "epoch": 0.031420938811855995, + "ewc_loss": 0.0005937123205512762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.937123205512762e-06, + "grad_norm": 3.91230845451355, + "learning_rate": 1.0428147520135651e-07, + "loss": 0.6333, + "mean_token_accuracy": 0.8090817928314209, + "num_tokens": 9448591.0, + "step": 247 + }, + { + "epoch": 0.03154814909044651, + "ewc_loss": 0.0005843279068358243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.843278813699726e-06, + "grad_norm": 4.620428562164307, + "learning_rate": 1.0470538363713437e-07, + "loss": 0.6635, + "mean_token_accuracy": 0.798359215259552, + "num_tokens": 9486604.0, + "step": 248 + }, + { + "epoch": 0.031675359369037016, + "ewc_loss": 0.0005807469133287668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8074692788068205e-06, + "grad_norm": 4.775378227233887, + "learning_rate": 1.0512929207291224e-07, + 
"loss": 0.6695, + "mean_token_accuracy": 0.7938359975814819, + "num_tokens": 9525285.0, + "step": 249 + }, + { + "epoch": 0.03180256964762753, + "ewc_loss": 0.0005779371713288128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7793718042375986e-06, + "grad_norm": 8.052947044372559, + "learning_rate": 1.0555320050869011e-07, + "loss": 0.6514, + "mean_token_accuracy": 0.7996713519096375, + "num_tokens": 9557898.0, + "step": 250 + }, + { + "epoch": 0.03192977992621804, + "ewc_loss": 0.0005862234975211322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.862234957021428e-06, + "grad_norm": 5.216027736663818, + "learning_rate": 1.0597710894446799e-07, + "loss": 0.709, + "mean_token_accuracy": 0.7834841012954712, + "num_tokens": 9590438.0, + "step": 251 + }, + { + "epoch": 0.032056990204808546, + "ewc_loss": 0.0005853589973412454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.853589755133726e-06, + "grad_norm": 3.606081247329712, + "learning_rate": 1.0640101738024586e-07, + "loss": 0.6203, + "mean_token_accuracy": 0.812225341796875, + "num_tokens": 9631402.0, + "step": 252 + }, + { + "epoch": 0.03218420048339906, + "ewc_loss": 0.0005726215313188732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.726215476897778e-06, + "grad_norm": 5.885962963104248, + "learning_rate": 1.0682492581602373e-07, + "loss": 0.6011, + "mean_token_accuracy": 0.8181947469711304, + "num_tokens": 9663702.0, + "step": 253 + }, + { + "epoch": 0.03231141076198957, + "ewc_loss": 0.0005735940067097545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.735939794249134e-06, + "grad_norm": 5.293251037597656, + "learning_rate": 1.072488342518016e-07, + "loss": 0.5899, + "mean_token_accuracy": 0.8177168965339661, + "num_tokens": 9698969.0, + "step": 254 + }, + { + "epoch": 0.03243862104058008, + "ewc_loss": 0.00057168398052454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.716839950764552e-06, + "grad_norm": 4.575771331787109, + "learning_rate": 1.0767274268757948e-07, + "loss": 0.6595, + 
"mean_token_accuracy": 0.7989037036895752, + "num_tokens": 9738075.0, + "step": 255 + }, + { + "epoch": 0.03256583131917059, + "ewc_loss": 0.0005657428409904242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.657428573613288e-06, + "grad_norm": 3.9425644874572754, + "learning_rate": 1.0809665112335735e-07, + "loss": 0.6639, + "mean_token_accuracy": 0.801108717918396, + "num_tokens": 9782675.0, + "step": 256 + }, + { + "epoch": 0.0326930415977611, + "ewc_loss": 0.0005575395189225674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.575395334744826e-06, + "grad_norm": 5.486766338348389, + "learning_rate": 1.0852055955913522e-07, + "loss": 0.5648, + "mean_token_accuracy": 0.8263261318206787, + "num_tokens": 9820585.0, + "step": 257 + }, + { + "epoch": 0.03282025187635161, + "ewc_loss": 0.0005595479742623866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5954797062440775e-06, + "grad_norm": 4.557173728942871, + "learning_rate": 1.089444679949131e-07, + "loss": 0.6886, + "mean_token_accuracy": 0.7931230664253235, + "num_tokens": 9860693.0, + "step": 258 + }, + { + "epoch": 0.03294746215494212, + "ewc_loss": 0.0005574451643042266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5744517339917365e-06, + "grad_norm": 4.131824016571045, + "learning_rate": 1.0936837643069097e-07, + "loss": 0.6218, + "mean_token_accuracy": 0.8114891052246094, + "num_tokens": 9902629.0, + "step": 259 + }, + { + "epoch": 0.03307467243353263, + "ewc_loss": 0.0005533982766792178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.533982857741648e-06, + "grad_norm": 3.697699546813965, + "learning_rate": 1.0979228486646884e-07, + "loss": 0.5934, + "mean_token_accuracy": 0.8154913187026978, + "num_tokens": 9937304.0, + "step": 260 + }, + { + "epoch": 0.03320188271212314, + "ewc_loss": 0.000547426228877157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.474262252391782e-06, + "grad_norm": 5.376210689544678, + "learning_rate": 1.1021619330224671e-07, + "loss": 0.7268, + "mean_token_accuracy": 
0.7818509340286255, + "num_tokens": 9976911.0, + "step": 261 + }, + { + "epoch": 0.03332909299071365, + "ewc_loss": 0.0005530447233468294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.530447197088506e-06, + "grad_norm": 4.771268844604492, + "learning_rate": 1.1064010173802458e-07, + "loss": 0.6619, + "mean_token_accuracy": 0.8006026744842529, + "num_tokens": 10015740.0, + "step": 262 + }, + { + "epoch": 0.03345630326930416, + "ewc_loss": 0.0005535127129405737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.535127002076479e-06, + "grad_norm": 4.938675880432129, + "learning_rate": 1.1106401017380246e-07, + "loss": 0.5986, + "mean_token_accuracy": 0.8185814023017883, + "num_tokens": 10053054.0, + "step": 263 + }, + { + "epoch": 0.03358351354789467, + "ewc_loss": 0.0005555480602197349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.555480584007455e-06, + "grad_norm": 5.505648612976074, + "learning_rate": 1.1148791860958033e-07, + "loss": 0.6931, + "mean_token_accuracy": 0.786504864692688, + "num_tokens": 10082088.0, + "step": 264 + }, + { + "epoch": 0.03371072382648518, + "ewc_loss": 0.0005610584048554301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6105841395037714e-06, + "grad_norm": 3.436429977416992, + "learning_rate": 1.119118270453582e-07, + "loss": 0.6677, + "mean_token_accuracy": 0.796760082244873, + "num_tokens": 10121122.0, + "step": 265 + }, + { + "epoch": 0.03383793410507569, + "ewc_loss": 0.0005507505848072469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.507506102730986e-06, + "grad_norm": 4.355510711669922, + "learning_rate": 1.1233573548113607e-07, + "loss": 0.6619, + "mean_token_accuracy": 0.8010507225990295, + "num_tokens": 10161017.0, + "step": 266 + }, + { + "epoch": 0.0339651443836662, + "ewc_loss": 0.0005524583393707871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.524583684746176e-06, + "grad_norm": 3.2245233058929443, + "learning_rate": 1.1275964391691393e-07, + "loss": 0.6289, + "mean_token_accuracy": 0.8103833794593811, + 
"num_tokens": 10204913.0, + "step": 267 + }, + { + "epoch": 0.03409235466225671, + "ewc_loss": 0.0005451946635730565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.45194643564173e-06, + "grad_norm": 3.9666805267333984, + "learning_rate": 1.131835523526918e-07, + "loss": 0.6766, + "mean_token_accuracy": 0.7930315732955933, + "num_tokens": 10245154.0, + "step": 268 + }, + { + "epoch": 0.03421956494084722, + "ewc_loss": 0.0005487823509611189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.487823273142567e-06, + "grad_norm": 6.689341068267822, + "learning_rate": 1.1360746078846968e-07, + "loss": 0.684, + "mean_token_accuracy": 0.7927323579788208, + "num_tokens": 10283006.0, + "step": 269 + }, + { + "epoch": 0.03434677521943773, + "ewc_loss": 0.0005655147833749652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.655148015648592e-06, + "grad_norm": 4.759220600128174, + "learning_rate": 1.1403136922424755e-07, + "loss": 0.6422, + "mean_token_accuracy": 0.8030502796173096, + "num_tokens": 10316689.0, + "step": 270 + }, + { + "epoch": 0.03447398549802824, + "ewc_loss": 0.0005701862974092364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.701862846763106e-06, + "grad_norm": 4.0591864585876465, + "learning_rate": 1.1445527766002542e-07, + "loss": 0.6373, + "mean_token_accuracy": 0.8038213849067688, + "num_tokens": 10358145.0, + "step": 271 + }, + { + "epoch": 0.03460119577661875, + "ewc_loss": 0.0005667218356393278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.667218374583172e-06, + "grad_norm": 4.599855899810791, + "learning_rate": 1.148791860958033e-07, + "loss": 0.7377, + "mean_token_accuracy": 0.7781000137329102, + "num_tokens": 10393424.0, + "step": 272 + }, + { + "epoch": 0.034728406055209264, + "ewc_loss": 0.0005701656336896122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.701656391465804e-06, + "grad_norm": 4.6986188888549805, + "learning_rate": 1.1530309453158117e-07, + "loss": 0.6528, + "mean_token_accuracy": 0.8013260960578918, + "num_tokens": 
10433560.0, + "step": 273 + }, + { + "epoch": 0.03485561633379977, + "ewc_loss": 0.0005724346847273409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7243469200329855e-06, + "grad_norm": 4.346771717071533, + "learning_rate": 1.1572700296735904e-07, + "loss": 0.6488, + "mean_token_accuracy": 0.8017664551734924, + "num_tokens": 10472024.0, + "step": 274 + }, + { + "epoch": 0.03498282661239028, + "ewc_loss": 0.0005728566320613027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.728566520701861e-06, + "grad_norm": 4.608171463012695, + "learning_rate": 1.1615091140313691e-07, + "loss": 0.6871, + "mean_token_accuracy": 0.7920354604721069, + "num_tokens": 10510874.0, + "step": 275 + }, + { + "epoch": 0.03511003689098079, + "ewc_loss": 0.0005762394284829497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.762394266639603e-06, + "grad_norm": 5.353006839752197, + "learning_rate": 1.1657481983891479e-07, + "loss": 0.6256, + "mean_token_accuracy": 0.8095922470092773, + "num_tokens": 10552411.0, + "step": 276 + }, + { + "epoch": 0.0352372471695713, + "ewc_loss": 0.000583806075155735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.838060587848304e-06, + "grad_norm": 4.983737468719482, + "learning_rate": 1.1699872827469266e-07, + "loss": 0.6413, + "mean_token_accuracy": 0.8062621355056763, + "num_tokens": 10591537.0, + "step": 277 + }, + { + "epoch": 0.03536445744816181, + "ewc_loss": 0.0005888703162781894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.888703071832424e-06, + "grad_norm": 4.906955242156982, + "learning_rate": 1.1742263671047053e-07, + "loss": 0.6494, + "mean_token_accuracy": 0.8036720752716064, + "num_tokens": 10632228.0, + "step": 278 + }, + { + "epoch": 0.03549166772675232, + "ewc_loss": 0.0005918172537349164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9181725191592705e-06, + "grad_norm": 7.58941125869751, + "learning_rate": 1.178465451462484e-07, + "loss": 0.7051, + "mean_token_accuracy": 0.7852426171302795, + "num_tokens": 10672829.0, + "step": 
279 + }, + { + "epoch": 0.03561887800534283, + "ewc_loss": 0.0006047269562259316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.047269835107727e-06, + "grad_norm": 5.639432430267334, + "learning_rate": 1.1827045358202628e-07, + "loss": 0.6781, + "mean_token_accuracy": 0.7992276549339294, + "num_tokens": 10706146.0, + "step": 280 + }, + { + "epoch": 0.035746088283933344, + "ewc_loss": 0.0006066033965907991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.066034075047355e-06, + "grad_norm": 4.668436050415039, + "learning_rate": 1.1869436201780415e-07, + "loss": 0.596, + "mean_token_accuracy": 0.8189263939857483, + "num_tokens": 10744112.0, + "step": 281 + }, + { + "epoch": 0.03587329856252385, + "ewc_loss": 0.0006010616198182106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.010616289131576e-06, + "grad_norm": 4.996715545654297, + "learning_rate": 1.1911827045358202e-07, + "loss": 0.626, + "mean_token_accuracy": 0.8137978315353394, + "num_tokens": 10783726.0, + "step": 282 + }, + { + "epoch": 0.03600050884111436, + "ewc_loss": 0.0006003874004818499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.003873750159983e-06, + "grad_norm": 4.280725002288818, + "learning_rate": 1.195421788893599e-07, + "loss": 0.6904, + "mean_token_accuracy": 0.793612003326416, + "num_tokens": 10824476.0, + "step": 283 + }, + { + "epoch": 0.036127719119704874, + "ewc_loss": 0.0005962257855571806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.962258001090959e-06, + "grad_norm": 5.7379865646362305, + "learning_rate": 1.1996608732513778e-07, + "loss": 0.6497, + "mean_token_accuracy": 0.8024210929870605, + "num_tokens": 10859997.0, + "step": 284 + }, + { + "epoch": 0.03625492939829538, + "ewc_loss": 0.0006028096540831029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.028096322552301e-06, + "grad_norm": 4.911397933959961, + "learning_rate": 1.2038999576091563e-07, + "loss": 0.6208, + "mean_token_accuracy": 0.8115010261535645, + "num_tokens": 10898800.0, + "step": 285 + }, + { + 
"epoch": 0.036382139676885895, + "ewc_loss": 0.0006044839392416179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.044839210517239e-06, + "grad_norm": 4.850776195526123, + "learning_rate": 1.208139041966935e-07, + "loss": 0.5866, + "mean_token_accuracy": 0.8218711614608765, + "num_tokens": 10937462.0, + "step": 286 + }, + { + "epoch": 0.0365093499554764, + "ewc_loss": 0.000605048961006105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.050489446352003e-06, + "grad_norm": 4.734531879425049, + "learning_rate": 1.2123781263247137e-07, + "loss": 0.6469, + "mean_token_accuracy": 0.7999273538589478, + "num_tokens": 10970739.0, + "step": 287 + }, + { + "epoch": 0.03663656023406691, + "ewc_loss": 0.0006077190046198666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.077189937059302e-06, + "grad_norm": 4.999114513397217, + "learning_rate": 1.2166172106824924e-07, + "loss": 0.6996, + "mean_token_accuracy": 0.7947798371315002, + "num_tokens": 11003000.0, + "step": 288 + }, + { + "epoch": 0.036763770512657425, + "ewc_loss": 0.0006121606566011906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.1216064750624355e-06, + "grad_norm": 4.605237007141113, + "learning_rate": 1.2208562950402712e-07, + "loss": 0.6047, + "mean_token_accuracy": 0.8123353719711304, + "num_tokens": 11039665.0, + "step": 289 + }, + { + "epoch": 0.03689098079124793, + "ewc_loss": 0.0006151440902613103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.151441084512044e-06, + "grad_norm": 4.292795181274414, + "learning_rate": 1.22509537939805e-07, + "loss": 0.6321, + "mean_token_accuracy": 0.8027940988540649, + "num_tokens": 11078368.0, + "step": 290 + }, + { + "epoch": 0.03701819106983844, + "ewc_loss": 0.0006149284308776259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.149284217826789e-06, + "grad_norm": 3.6493115425109863, + "learning_rate": 1.2293344637558286e-07, + "loss": 0.6578, + "mean_token_accuracy": 0.8011413812637329, + "num_tokens": 11122654.0, + "step": 291 + }, + { + "epoch": 
0.037145401348428954, + "ewc_loss": 0.0006124831270426512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.124831543274922e-06, + "grad_norm": 4.931789398193359, + "learning_rate": 1.2335735481136073e-07, + "loss": 0.6123, + "mean_token_accuracy": 0.8076484203338623, + "num_tokens": 11162622.0, + "step": 292 + }, + { + "epoch": 0.03727261162701946, + "ewc_loss": 0.0006220381474122405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.220381237653783e-06, + "grad_norm": 5.188961982727051, + "learning_rate": 1.237812632471386e-07, + "loss": 0.6241, + "mean_token_accuracy": 0.8103985786437988, + "num_tokens": 11202564.0, + "step": 293 + }, + { + "epoch": 0.037399821905609976, + "ewc_loss": 0.0006316731451079249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.316731742117554e-06, + "grad_norm": 5.170628547668457, + "learning_rate": 1.2420517168291648e-07, + "loss": 0.6419, + "mean_token_accuracy": 0.8009041547775269, + "num_tokens": 11243422.0, + "step": 294 + }, + { + "epoch": 0.03752703218420048, + "ewc_loss": 0.0006402456783689559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.402456619980512e-06, + "grad_norm": 4.183228492736816, + "learning_rate": 1.2462908011869435e-07, + "loss": 0.6234, + "mean_token_accuracy": 0.8069759011268616, + "num_tokens": 11280867.0, + "step": 295 + }, + { + "epoch": 0.03765424246279099, + "ewc_loss": 0.0006393319345079362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.3933193814591505e-06, + "grad_norm": 8.113860130310059, + "learning_rate": 1.2505298855447223e-07, + "loss": 0.5733, + "mean_token_accuracy": 0.8226199150085449, + "num_tokens": 11318454.0, + "step": 296 + }, + { + "epoch": 0.037781452741381505, + "ewc_loss": 0.0006592168938368559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.592169029318029e-06, + "grad_norm": 4.905411720275879, + "learning_rate": 1.254768969902501e-07, + "loss": 0.7128, + "mean_token_accuracy": 0.7858046293258667, + "num_tokens": 11354721.0, + "step": 297 + }, + { + "epoch": 
0.03790866301997201, + "ewc_loss": 0.0006602643989026546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.602644134545699e-06, + "grad_norm": 4.75932502746582, + "learning_rate": 1.2590080542602797e-07, + "loss": 0.6696, + "mean_token_accuracy": 0.7970826625823975, + "num_tokens": 11385938.0, + "step": 298 + }, + { + "epoch": 0.03803587329856253, + "ewc_loss": 0.0006581639172509313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.581638899660902e-06, + "grad_norm": 4.580192565917969, + "learning_rate": 1.2632471386180584e-07, + "loss": 0.6496, + "mean_token_accuracy": 0.8015916347503662, + "num_tokens": 11424911.0, + "step": 299 + }, + { + "epoch": 0.038163083577153034, + "ewc_loss": 0.0006559122120961547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.559122084581759e-06, + "grad_norm": 3.831930160522461, + "learning_rate": 1.2674862229758372e-07, + "loss": 0.6187, + "mean_token_accuracy": 0.8118434548377991, + "num_tokens": 11460580.0, + "step": 300 + }, + { + "epoch": 0.03829029385574354, + "ewc_loss": 0.0006474680849350989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.474680958490353e-06, + "grad_norm": 4.148677349090576, + "learning_rate": 1.271725307333616e-07, + "loss": 0.6091, + "mean_token_accuracy": 0.8152616620063782, + "num_tokens": 11501031.0, + "step": 301 + }, + { + "epoch": 0.038417504134334056, + "ewc_loss": 0.0006476587150245905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.476587259385269e-06, + "grad_norm": 3.4504594802856445, + "learning_rate": 1.2759643916913946e-07, + "loss": 0.6183, + "mean_token_accuracy": 0.8107993006706238, + "num_tokens": 11544053.0, + "step": 302 + }, + { + "epoch": 0.03854471441292456, + "ewc_loss": 0.0006436100811697543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.436100647988496e-06, + "grad_norm": 4.711528301239014, + "learning_rate": 1.2802034760491733e-07, + "loss": 0.6339, + "mean_token_accuracy": 0.8051204085350037, + "num_tokens": 11576944.0, + "step": 303 + }, + { + "epoch": 
0.03867192469151508, + "ewc_loss": 0.0006566558731719851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.566559022758156e-06, + "grad_norm": 5.7587103843688965, + "learning_rate": 1.284442560406952e-07, + "loss": 0.6848, + "mean_token_accuracy": 0.7908235788345337, + "num_tokens": 11619735.0, + "step": 304 + }, + { + "epoch": 0.038799134970105585, + "ewc_loss": 0.0006758776144124568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.758776180504356e-06, + "grad_norm": 4.106053352355957, + "learning_rate": 1.2886816447647308e-07, + "loss": 0.7208, + "mean_token_accuracy": 0.7782257795333862, + "num_tokens": 11655025.0, + "step": 305 + }, + { + "epoch": 0.03892634524869609, + "ewc_loss": 0.0006767594022676349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.767594186385395e-06, + "grad_norm": 4.711560249328613, + "learning_rate": 1.2929207291225095e-07, + "loss": 0.6193, + "mean_token_accuracy": 0.8114961385726929, + "num_tokens": 11691970.0, + "step": 306 + }, + { + "epoch": 0.03905355552728661, + "ewc_loss": 0.0006825992604717612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.825992386438884e-06, + "grad_norm": 5.031098365783691, + "learning_rate": 1.2971598134802882e-07, + "loss": 0.6345, + "mean_token_accuracy": 0.8055046796798706, + "num_tokens": 11729561.0, + "step": 307 + }, + { + "epoch": 0.039180765805877114, + "ewc_loss": 0.0006899032741785049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.899032996443566e-06, + "grad_norm": 3.9665799140930176, + "learning_rate": 1.301398897838067e-07, + "loss": 0.6524, + "mean_token_accuracy": 0.7991321682929993, + "num_tokens": 11765707.0, + "step": 308 + }, + { + "epoch": 0.03930797608446762, + "ewc_loss": 0.0006851215148344636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.8512149482558016e-06, + "grad_norm": 5.067592620849609, + "learning_rate": 1.3056379821958457e-07, + "loss": 0.6616, + "mean_token_accuracy": 0.8028861880302429, + "num_tokens": 11806223.0, + "step": 309 + }, + { + "epoch": 
0.039435186363058136, + "ewc_loss": 0.0006931553944014013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.9315537984948605e-06, + "grad_norm": 4.391654014587402, + "learning_rate": 1.3098770665536244e-07, + "loss": 0.6171, + "mean_token_accuracy": 0.8129979372024536, + "num_tokens": 11845477.0, + "step": 310 + }, + { + "epoch": 0.039562396641648644, + "ewc_loss": 0.0006945692584849894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.9456928031286225e-06, + "grad_norm": 4.287742614746094, + "learning_rate": 1.3141161509114031e-07, + "loss": 0.6013, + "mean_token_accuracy": 0.8100271224975586, + "num_tokens": 11877973.0, + "step": 311 + }, + { + "epoch": 0.03968960692023916, + "ewc_loss": 0.0006939641898497939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.939641934877727e-06, + "grad_norm": 5.532313823699951, + "learning_rate": 1.3183552352691819e-07, + "loss": 0.6076, + "mean_token_accuracy": 0.8138797283172607, + "num_tokens": 11912093.0, + "step": 312 + }, + { + "epoch": 0.039816817198829665, + "ewc_loss": 0.0007073947926983237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.073947926983237e-06, + "grad_norm": 4.208630084991455, + "learning_rate": 1.3225943196269603e-07, + "loss": 0.6334, + "mean_token_accuracy": 0.8105136156082153, + "num_tokens": 11952541.0, + "step": 313 + }, + { + "epoch": 0.03994402747742017, + "ewc_loss": 0.0007047655526548624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0476557993970346e-06, + "grad_norm": 4.3284430503845215, + "learning_rate": 1.3268334039847393e-07, + "loss": 0.6101, + "mean_token_accuracy": 0.8109803795814514, + "num_tokens": 11990414.0, + "step": 314 + }, + { + "epoch": 0.04007123775601069, + "ewc_loss": 0.0007057961774989963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0579617386101745e-06, + "grad_norm": 4.030457973480225, + "learning_rate": 1.3310724883425178e-07, + "loss": 0.6771, + "mean_token_accuracy": 0.7905059456825256, + "num_tokens": 12025925.0, + "step": 315 + }, + { + "epoch": 
0.040198448034601195, + "ewc_loss": 0.000705430400557816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.054303750919644e-06, + "grad_norm": 5.897923946380615, + "learning_rate": 1.3353115727002968e-07, + "loss": 0.6457, + "mean_token_accuracy": 0.8015948534011841, + "num_tokens": 12068879.0, + "step": 316 + }, + { + "epoch": 0.04032565831319171, + "ewc_loss": 0.0007226517773233354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.2265179369424e-06, + "grad_norm": 5.230038642883301, + "learning_rate": 1.3395506570580752e-07, + "loss": 0.612, + "mean_token_accuracy": 0.8109591007232666, + "num_tokens": 12111931.0, + "step": 317 + }, + { + "epoch": 0.040452868591782216, + "ewc_loss": 0.0007312481757253408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.312482011911925e-06, + "grad_norm": 4.653091907501221, + "learning_rate": 1.3437897414158542e-07, + "loss": 0.6781, + "mean_token_accuracy": 0.790436327457428, + "num_tokens": 12140431.0, + "step": 318 + }, + { + "epoch": 0.040580078870372724, + "ewc_loss": 0.000734339642804116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.343396191572538e-06, + "grad_norm": 6.216714859008789, + "learning_rate": 1.3480288257736327e-07, + "loss": 0.61, + "mean_token_accuracy": 0.8114020824432373, + "num_tokens": 12173932.0, + "step": 319 + }, + { + "epoch": 0.04070728914896324, + "ewc_loss": 0.000745887344237417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.458873369614594e-06, + "grad_norm": 4.714237689971924, + "learning_rate": 1.3522679101314117e-07, + "loss": 0.5864, + "mean_token_accuracy": 0.8168299198150635, + "num_tokens": 12210040.0, + "step": 320 + }, + { + "epoch": 0.040834499427553746, + "ewc_loss": 0.0007410038961097598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.410038961097598e-06, + "grad_norm": 4.163800239562988, + "learning_rate": 1.35650699448919e-07, + "loss": 0.6197, + "mean_token_accuracy": 0.810889482498169, + "num_tokens": 12248556.0, + "step": 321 + }, + { + "epoch": 0.04096170970614425, + 
"ewc_loss": 0.0007321524899452925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.321524662984302e-06, + "grad_norm": 4.342552185058594, + "learning_rate": 1.360746078846969e-07, + "loss": 0.6086, + "mean_token_accuracy": 0.811197817325592, + "num_tokens": 12285664.0, + "step": 322 + }, + { + "epoch": 0.04108891998473477, + "ewc_loss": 0.000729729887098074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.297298907360528e-06, + "grad_norm": 4.328380584716797, + "learning_rate": 1.3649851632047476e-07, + "loss": 0.6618, + "mean_token_accuracy": 0.797269880771637, + "num_tokens": 12326964.0, + "step": 323 + }, + { + "epoch": 0.041216130263325275, + "ewc_loss": 0.0007299003191292286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.29900330043165e-06, + "grad_norm": 5.305307388305664, + "learning_rate": 1.3692242475625266e-07, + "loss": 0.5715, + "mean_token_accuracy": 0.825940728187561, + "num_tokens": 12366541.0, + "step": 324 + }, + { + "epoch": 0.04134334054191579, + "ewc_loss": 0.0007410452817566693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.410452781186905e-06, + "grad_norm": 5.127161026000977, + "learning_rate": 1.373463331920305e-07, + "loss": 0.6332, + "mean_token_accuracy": 0.810162365436554, + "num_tokens": 12405664.0, + "step": 325 + }, + { + "epoch": 0.0414705508205063, + "ewc_loss": 0.0007502293447032571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.502293556171935e-06, + "grad_norm": 5.5664167404174805, + "learning_rate": 1.377702416278084e-07, + "loss": 0.6968, + "mean_token_accuracy": 0.7879408597946167, + "num_tokens": 12445039.0, + "step": 326 + }, + { + "epoch": 0.041597761099096804, + "ewc_loss": 0.0007576700882054865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.576700681966031e-06, + "grad_norm": 3.726390838623047, + "learning_rate": 1.3819415006358625e-07, + "loss": 0.5666, + "mean_token_accuracy": 0.8245552778244019, + "num_tokens": 12485481.0, + "step": 327 + }, + { + "epoch": 0.04172497137768732, + "ewc_loss": 
0.0007413232233375311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.413232196995523e-06, + "grad_norm": 5.4396772384643555, + "learning_rate": 1.3861805849936415e-07, + "loss": 0.5917, + "mean_token_accuracy": 0.8200091123580933, + "num_tokens": 12530272.0, + "step": 328 + }, + { + "epoch": 0.041852181656277826, + "ewc_loss": 0.000748419261071831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.48419279261725e-06, + "grad_norm": 4.616644859313965, + "learning_rate": 1.39041966935142e-07, + "loss": 0.6368, + "mean_token_accuracy": 0.8035645484924316, + "num_tokens": 12565854.0, + "step": 329 + }, + { + "epoch": 0.04197939193486834, + "ewc_loss": 0.0007465999806299806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.4659997153503355e-06, + "grad_norm": 3.5712714195251465, + "learning_rate": 1.394658753709199e-07, + "loss": 0.6228, + "mean_token_accuracy": 0.8093496561050415, + "num_tokens": 12607313.0, + "step": 330 + }, + { + "epoch": 0.04210660221345885, + "ewc_loss": 0.000733179971575737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.331799679377582e-06, + "grad_norm": 4.474488735198975, + "learning_rate": 1.3988978380669774e-07, + "loss": 0.6609, + "mean_token_accuracy": 0.7978501319885254, + "num_tokens": 12642057.0, + "step": 331 + }, + { + "epoch": 0.042233812492049355, + "ewc_loss": 0.000738414004445076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.384140189969912e-06, + "grad_norm": 4.4877705574035645, + "learning_rate": 1.403136922424756e-07, + "loss": 0.6522, + "mean_token_accuracy": 0.7999012470245361, + "num_tokens": 12684118.0, + "step": 332 + }, + { + "epoch": 0.04236102277063987, + "ewc_loss": 0.0007440083427354693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.440083209075965e-06, + "grad_norm": 4.417016983032227, + "learning_rate": 1.4073760067825348e-07, + "loss": 0.617, + "mean_token_accuracy": 0.8101460933685303, + "num_tokens": 12721945.0, + "step": 333 + }, + { + "epoch": 0.04248823304923038, + "ewc_loss": 0.0007493427256122231, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.493427347071702e-06, + "grad_norm": 4.208912372589111, + "learning_rate": 1.4116150911403136e-07, + "loss": 0.6317, + "mean_token_accuracy": 0.8054217100143433, + "num_tokens": 12764175.0, + "step": 334 + }, + { + "epoch": 0.04261544332782089, + "ewc_loss": 0.0007507249829359353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.507249847549247e-06, + "grad_norm": 4.533174991607666, + "learning_rate": 1.4158541754980923e-07, + "loss": 0.5815, + "mean_token_accuracy": 0.822414219379425, + "num_tokens": 12801600.0, + "step": 335 + }, + { + "epoch": 0.0427426536064114, + "ewc_loss": 0.0007550962036475539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.550961981905857e-06, + "grad_norm": 5.343262195587158, + "learning_rate": 1.420093259855871e-07, + "loss": 0.614, + "mean_token_accuracy": 0.8166785836219788, + "num_tokens": 12843584.0, + "step": 336 + }, + { + "epoch": 0.042869863885001906, + "ewc_loss": 0.000765529868658632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.655298759345897e-06, + "grad_norm": 5.263065814971924, + "learning_rate": 1.4243323442136497e-07, + "loss": 0.6271, + "mean_token_accuracy": 0.8075623512268066, + "num_tokens": 12878105.0, + "step": 337 + }, + { + "epoch": 0.04299707416359242, + "ewc_loss": 0.0007722455193288624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.722454938630108e-06, + "grad_norm": 3.597064733505249, + "learning_rate": 1.4285714285714285e-07, + "loss": 0.6344, + "mean_token_accuracy": 0.806126594543457, + "num_tokens": 12911712.0, + "step": 338 + }, + { + "epoch": 0.04312428444218293, + "ewc_loss": 0.0007533842581324279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.533842563134385e-06, + "grad_norm": 4.391000747680664, + "learning_rate": 1.4328105129292072e-07, + "loss": 0.6161, + "mean_token_accuracy": 0.812279462814331, + "num_tokens": 12951303.0, + "step": 339 + }, + { + "epoch": 0.043251494720773435, + "ewc_loss": 0.0007532333256676793, + "ewc_loss_diag": 0.0, 
+ "ewc_loss_parallel": 7.532333256676793e-06, + "grad_norm": 6.050390243530273, + "learning_rate": 1.437049597286986e-07, + "loss": 0.6032, + "mean_token_accuracy": 0.8144460916519165, + "num_tokens": 12990544.0, + "step": 340 + }, + { + "epoch": 0.04337870499936395, + "ewc_loss": 0.0007710279314778745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.710279533057474e-06, + "grad_norm": 6.636691093444824, + "learning_rate": 1.4412886816447646e-07, + "loss": 0.6611, + "mean_token_accuracy": 0.7977594137191772, + "num_tokens": 13023766.0, + "step": 341 + }, + { + "epoch": 0.04350591527795446, + "ewc_loss": 0.000792174891103059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.921748874650802e-06, + "grad_norm": 4.344638824462891, + "learning_rate": 1.4455277660025434e-07, + "loss": 0.5751, + "mean_token_accuracy": 0.8219255208969116, + "num_tokens": 13056337.0, + "step": 342 + }, + { + "epoch": 0.04363312555654497, + "ewc_loss": 0.0007804663036949933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.804663255228661e-06, + "grad_norm": 4.713498115539551, + "learning_rate": 1.449766850360322e-07, + "loss": 0.6341, + "mean_token_accuracy": 0.8068342208862305, + "num_tokens": 13097345.0, + "step": 343 + }, + { + "epoch": 0.04376033583513548, + "ewc_loss": 0.0007716551190242171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.7165514085209e-06, + "grad_norm": 7.15571928024292, + "learning_rate": 1.4540059347181008e-07, + "loss": 0.6187, + "mean_token_accuracy": 0.8090389966964722, + "num_tokens": 13127332.0, + "step": 344 + }, + { + "epoch": 0.043887546113725986, + "ewc_loss": 0.0007878660690039396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.878660653659608e-06, + "grad_norm": 4.653478622436523, + "learning_rate": 1.4582450190758795e-07, + "loss": 0.6485, + "mean_token_accuracy": 0.7998195886611938, + "num_tokens": 13170760.0, + "step": 345 + }, + { + "epoch": 0.0440147563923165, + "ewc_loss": 0.0007778885192237794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.778885446896311e-06, + "grad_norm": 5.314611911773682, + "learning_rate": 1.4624841034336583e-07, + "loss": 0.6165, + "mean_token_accuracy": 0.8135287761688232, + "num_tokens": 13207061.0, + "step": 346 + }, + { + "epoch": 0.04414196667090701, + "ewc_loss": 0.0007766347844153643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.766348062432371e-06, + "grad_norm": 4.64633846282959, + "learning_rate": 1.466723187791437e-07, + "loss": 0.5865, + "mean_token_accuracy": 0.8195983171463013, + "num_tokens": 13247903.0, + "step": 347 + }, + { + "epoch": 0.04426917694949752, + "ewc_loss": 0.0007664671866223216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.664672011742368e-06, + "grad_norm": 4.272007465362549, + "learning_rate": 1.4709622721492157e-07, + "loss": 0.5962, + "mean_token_accuracy": 0.8123724460601807, + "num_tokens": 13287193.0, + "step": 348 + }, + { + "epoch": 0.04439638722808803, + "ewc_loss": 0.0007543150568380952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.54315078665968e-06, + "grad_norm": 4.098629951477051, + "learning_rate": 1.4752013565069942e-07, + "loss": 0.5999, + "mean_token_accuracy": 0.8160294890403748, + "num_tokens": 13323112.0, + "step": 349 + }, + { + "epoch": 0.04452359750667854, + "ewc_loss": 0.0007481299689970911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.4812996899709105e-06, + "grad_norm": 4.13569450378418, + "learning_rate": 1.4794404408647732e-07, + "loss": 0.5962, + "mean_token_accuracy": 0.8169996738433838, + "num_tokens": 13359653.0, + "step": 350 + }, + { + "epoch": 0.04465080778526905, + "ewc_loss": 0.0007467717514373362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.467717296094634e-06, + "grad_norm": 4.420858860015869, + "learning_rate": 1.4836795252225516e-07, + "loss": 0.5665, + "mean_token_accuracy": 0.82285076379776, + "num_tokens": 13400571.0, + "step": 351 + }, + { + "epoch": 0.04477801806385956, + "ewc_loss": 0.0007525444380007684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.525444289058214e-06, + 
"grad_norm": 4.625761032104492, + "learning_rate": 1.4879186095803306e-07, + "loss": 0.5445, + "mean_token_accuracy": 0.8264693021774292, + "num_tokens": 13437656.0, + "step": 352 + }, + { + "epoch": 0.04490522834245007, + "ewc_loss": 0.000760336231905967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6033620644011535e-06, + "grad_norm": 5.90394401550293, + "learning_rate": 1.492157693938109e-07, + "loss": 0.6032, + "mean_token_accuracy": 0.8157166242599487, + "num_tokens": 13474688.0, + "step": 353 + }, + { + "epoch": 0.04503243862104058, + "ewc_loss": 0.0007802853360772133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.802853360772133e-06, + "grad_norm": 4.186671257019043, + "learning_rate": 1.496396778295888e-07, + "loss": 0.5419, + "mean_token_accuracy": 0.8304131627082825, + "num_tokens": 13518256.0, + "step": 354 + }, + { + "epoch": 0.04515964889963109, + "ewc_loss": 0.0007682180730625987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.682180694246199e-06, + "grad_norm": 4.425918102264404, + "learning_rate": 1.5006358626536665e-07, + "loss": 0.6284, + "mean_token_accuracy": 0.8086034059524536, + "num_tokens": 13554315.0, + "step": 355 + }, + { + "epoch": 0.0452868591782216, + "ewc_loss": 0.0007622925331816077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.622925295436289e-06, + "grad_norm": 4.2656145095825195, + "learning_rate": 1.5048749470114455e-07, + "loss": 0.6493, + "mean_token_accuracy": 0.8020050525665283, + "num_tokens": 13594388.0, + "step": 356 + }, + { + "epoch": 0.04541406945681211, + "ewc_loss": 0.0007577815558761358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.577815267723054e-06, + "grad_norm": 3.9627468585968018, + "learning_rate": 1.509114031369224e-07, + "loss": 0.5751, + "mean_token_accuracy": 0.8191956281661987, + "num_tokens": 13633704.0, + "step": 357 + }, + { + "epoch": 0.04554127973540262, + "ewc_loss": 0.0007528851856477559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.528851710958406e-06, + "grad_norm": 
4.143768310546875, + "learning_rate": 1.513353115727003e-07, + "loss": 0.6243, + "mean_token_accuracy": 0.8042837977409363, + "num_tokens": 13671183.0, + "step": 358 + }, + { + "epoch": 0.04566849001399313, + "ewc_loss": 0.0007556508062407374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.556508080597268e-06, + "grad_norm": 4.107242107391357, + "learning_rate": 1.5175922000847814e-07, + "loss": 0.58, + "mean_token_accuracy": 0.821284294128418, + "num_tokens": 13709640.0, + "step": 359 + }, + { + "epoch": 0.04579570029258364, + "ewc_loss": 0.0007572124013677239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57212410462671e-06, + "grad_norm": 4.423256874084473, + "learning_rate": 1.5218312844425604e-07, + "loss": 0.6032, + "mean_token_accuracy": 0.8067677021026611, + "num_tokens": 13751114.0, + "step": 360 + }, + { + "epoch": 0.045922910571174154, + "ewc_loss": 0.0007644463912583888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.644463948963676e-06, + "grad_norm": 3.4026966094970703, + "learning_rate": 1.526070368800339e-07, + "loss": 0.5329, + "mean_token_accuracy": 0.8292497992515564, + "num_tokens": 13786448.0, + "step": 361 + }, + { + "epoch": 0.04605012084976466, + "ewc_loss": 0.0007497872575186193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.497872502426617e-06, + "grad_norm": 4.003698825836182, + "learning_rate": 1.530309453158118e-07, + "loss": 0.5561, + "mean_token_accuracy": 0.8310503959655762, + "num_tokens": 13827663.0, + "step": 362 + }, + { + "epoch": 0.04617733112835517, + "ewc_loss": 0.0007559184450656176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.559184268757235e-06, + "grad_norm": 4.155731201171875, + "learning_rate": 1.5345485375158963e-07, + "loss": 0.6351, + "mean_token_accuracy": 0.8061652183532715, + "num_tokens": 13866078.0, + "step": 363 + }, + { + "epoch": 0.04630454140694568, + "ewc_loss": 0.0007660409901291132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.66041011956986e-06, + "grad_norm": 3.392063617706299, + 
"learning_rate": 1.5387876218736753e-07, + "loss": 0.5125, + "mean_token_accuracy": 0.8384078741073608, + "num_tokens": 13909767.0, + "step": 364 + }, + { + "epoch": 0.04643175168553619, + "ewc_loss": 0.00075874401954934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.587439995404566e-06, + "grad_norm": 3.7524871826171875, + "learning_rate": 1.5430267062314538e-07, + "loss": 0.6201, + "mean_token_accuracy": 0.8106573820114136, + "num_tokens": 13948937.0, + "step": 365 + }, + { + "epoch": 0.0465589619641267, + "ewc_loss": 0.000761786475777626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.617864866915625e-06, + "grad_norm": 4.982729434967041, + "learning_rate": 1.5472657905892328e-07, + "loss": 0.6316, + "mean_token_accuracy": 0.8051564693450928, + "num_tokens": 13984039.0, + "step": 366 + }, + { + "epoch": 0.04668617224271721, + "ewc_loss": 0.0007866423111408949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.866423402447253e-06, + "grad_norm": 4.6365647315979, + "learning_rate": 1.5515048749470113e-07, + "loss": 0.6135, + "mean_token_accuracy": 0.8095606565475464, + "num_tokens": 14018162.0, + "step": 367 + }, + { + "epoch": 0.04681338252130772, + "ewc_loss": 0.0007976443157531321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976443157531321e-06, + "grad_norm": 3.6893203258514404, + "learning_rate": 1.55574395930479e-07, + "loss": 0.6128, + "mean_token_accuracy": 0.810403048992157, + "num_tokens": 14056493.0, + "step": 368 + }, + { + "epoch": 0.046940592799898234, + "ewc_loss": 0.0007850051624700427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.850051588320639e-06, + "grad_norm": 5.364877223968506, + "learning_rate": 1.5599830436625687e-07, + "loss": 0.6063, + "mean_token_accuracy": 0.8138444423675537, + "num_tokens": 14097530.0, + "step": 369 + }, + { + "epoch": 0.04706780307848874, + "ewc_loss": 0.0008052541525103152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.052541488723364e-06, + "grad_norm": 4.061832904815674, + "learning_rate": 
1.5642221280203474e-07, + "loss": 0.625, + "mean_token_accuracy": 0.8106066584587097, + "num_tokens": 14136240.0, + "step": 370 + }, + { + "epoch": 0.04719501335707925, + "ewc_loss": 0.0007979462388902903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.979462679941207e-06, + "grad_norm": 5.761065483093262, + "learning_rate": 1.5684612123781262e-07, + "loss": 0.5372, + "mean_token_accuracy": 0.8319053649902344, + "num_tokens": 14171010.0, + "step": 371 + }, + { + "epoch": 0.04732222363566976, + "ewc_loss": 0.0008169714128598571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.169714419636875e-06, + "grad_norm": 3.9620249271392822, + "learning_rate": 1.572700296735905e-07, + "loss": 0.6079, + "mean_token_accuracy": 0.818461537361145, + "num_tokens": 14206985.0, + "step": 372 + }, + { + "epoch": 0.04744943391426027, + "ewc_loss": 0.000798403169028461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.984031981322914e-06, + "grad_norm": 5.669106960296631, + "learning_rate": 1.576939381093684e-07, + "loss": 0.6059, + "mean_token_accuracy": 0.8098430037498474, + "num_tokens": 14236728.0, + "step": 373 + }, + { + "epoch": 0.047576644192850785, + "ewc_loss": 0.0008126696338877082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.126696229737718e-06, + "grad_norm": 3.1258904933929443, + "learning_rate": 1.5811784654514623e-07, + "loss": 0.5326, + "mean_token_accuracy": 0.8353351354598999, + "num_tokens": 14270758.0, + "step": 374 + }, + { + "epoch": 0.04770385447144129, + "ewc_loss": 0.0007764363544993103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.764363544993103e-06, + "grad_norm": 3.669438362121582, + "learning_rate": 1.5854175498092413e-07, + "loss": 0.6497, + "mean_token_accuracy": 0.7977445125579834, + "num_tokens": 14315002.0, + "step": 375 + }, + { + "epoch": 0.0478310647500318, + "ewc_loss": 0.0007730136276222765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.730136530881282e-06, + "grad_norm": 3.987334966659546, + "learning_rate": 
1.5896566341670198e-07, + "loss": 0.6197, + "mean_token_accuracy": 0.8056362271308899, + "num_tokens": 14353385.0, + "step": 376 + }, + { + "epoch": 0.047958275028622314, + "ewc_loss": 0.0007843193016014993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84319308877457e-06, + "grad_norm": 4.409621238708496, + "learning_rate": 1.5938957185247988e-07, + "loss": 0.653, + "mean_token_accuracy": 0.795025110244751, + "num_tokens": 14391702.0, + "step": 377 + }, + { + "epoch": 0.04808548530721282, + "ewc_loss": 0.000801729504019022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.017294931050856e-06, + "grad_norm": 4.620528697967529, + "learning_rate": 1.5981348028825772e-07, + "loss": 0.62, + "mean_token_accuracy": 0.8076490759849548, + "num_tokens": 14427876.0, + "step": 378 + }, + { + "epoch": 0.048212695585803336, + "ewc_loss": 0.0008148410124704242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.148410415742546e-06, + "grad_norm": 4.480224609375, + "learning_rate": 1.6023738872403562e-07, + "loss": 0.7059, + "mean_token_accuracy": 0.7821175456047058, + "num_tokens": 14460349.0, + "step": 379 + }, + { + "epoch": 0.048339905864393844, + "ewc_loss": 0.0008172386442311108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.172386515070684e-06, + "grad_norm": 3.954188346862793, + "learning_rate": 1.6066129715981347e-07, + "loss": 0.5574, + "mean_token_accuracy": 0.8282862305641174, + "num_tokens": 14497763.0, + "step": 380 + }, + { + "epoch": 0.04846711614298435, + "ewc_loss": 0.0008054669597186148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.054669706325512e-06, + "grad_norm": 3.549941301345825, + "learning_rate": 1.6108520559559137e-07, + "loss": 0.5513, + "mean_token_accuracy": 0.8264584541320801, + "num_tokens": 14537172.0, + "step": 381 + }, + { + "epoch": 0.048594326421574865, + "ewc_loss": 0.0007894158479757607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.894158443377819e-06, + "grad_norm": 4.080392360687256, + "learning_rate": 1.6150911403136921e-07, 
+ "loss": 0.6368, + "mean_token_accuracy": 0.8087947964668274, + "num_tokens": 14565515.0, + "step": 382 + }, + { + "epoch": 0.04872153670016537, + "ewc_loss": 0.0007979811052791774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.979811016411986e-06, + "grad_norm": 3.1440634727478027, + "learning_rate": 1.619330224671471e-07, + "loss": 0.6283, + "mean_token_accuracy": 0.807813286781311, + "num_tokens": 14608104.0, + "step": 383 + }, + { + "epoch": 0.04884874697875588, + "ewc_loss": 0.000784250907599926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.842509148758836e-06, + "grad_norm": 4.6076178550720215, + "learning_rate": 1.6235693090292496e-07, + "loss": 0.6366, + "mean_token_accuracy": 0.8087155818939209, + "num_tokens": 14645328.0, + "step": 384 + }, + { + "epoch": 0.048975957257346395, + "ewc_loss": 0.0008134230738505721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.134230483847205e-06, + "grad_norm": 4.931049346923828, + "learning_rate": 1.6278083933870286e-07, + "loss": 0.6687, + "mean_token_accuracy": 0.7903692722320557, + "num_tokens": 14678792.0, + "step": 385 + }, + { + "epoch": 0.0491031675359369, + "ewc_loss": 0.0008366238325834274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36623803479597e-06, + "grad_norm": 3.6085119247436523, + "learning_rate": 1.632047477744807e-07, + "loss": 0.5929, + "mean_token_accuracy": 0.8171325922012329, + "num_tokens": 14715095.0, + "step": 386 + }, + { + "epoch": 0.049230377814527417, + "ewc_loss": 0.0008190095541067421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.190095286408905e-06, + "grad_norm": 3.866312265396118, + "learning_rate": 1.6362865621025858e-07, + "loss": 0.6124, + "mean_token_accuracy": 0.8110594749450684, + "num_tokens": 14753641.0, + "step": 387 + }, + { + "epoch": 0.049357588093117924, + "ewc_loss": 0.0008168114582076669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.168114618456457e-06, + "grad_norm": 4.350931167602539, + "learning_rate": 1.6405256464603645e-07, + "loss": 0.5838, + 
"mean_token_accuracy": 0.8172895908355713, + "num_tokens": 14788816.0, + "step": 388 + }, + { + "epoch": 0.04948479837170843, + "ewc_loss": 0.0008258611196652055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.258611160272267e-06, + "grad_norm": 4.141021728515625, + "learning_rate": 1.6447647308181432e-07, + "loss": 0.6271, + "mean_token_accuracy": 0.8050895929336548, + "num_tokens": 14821607.0, + "step": 389 + }, + { + "epoch": 0.049612008650298946, + "ewc_loss": 0.0008298950851894915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298950888274703e-06, + "grad_norm": 3.7963061332702637, + "learning_rate": 1.649003815175922e-07, + "loss": 0.6454, + "mean_token_accuracy": 0.802573561668396, + "num_tokens": 14861298.0, + "step": 390 + }, + { + "epoch": 0.04973921892888945, + "ewc_loss": 0.000821989553514868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.219895789807197e-06, + "grad_norm": 3.774150848388672, + "learning_rate": 1.6532428995337007e-07, + "loss": 0.5622, + "mean_token_accuracy": 0.8241697549819946, + "num_tokens": 14899951.0, + "step": 391 + }, + { + "epoch": 0.04986642920747997, + "ewc_loss": 0.0008183269528672099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.183269528672099e-06, + "grad_norm": 3.0858802795410156, + "learning_rate": 1.6574819838914794e-07, + "loss": 0.5433, + "mean_token_accuracy": 0.8317368626594543, + "num_tokens": 14937169.0, + "step": 392 + }, + { + "epoch": 0.049993639486070475, + "ewc_loss": 0.0008011701283976436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.011701538634952e-06, + "grad_norm": 3.9581661224365234, + "learning_rate": 1.661721068249258e-07, + "loss": 0.5954, + "mean_token_accuracy": 0.8148902654647827, + "num_tokens": 14975549.0, + "step": 393 + }, + { + "epoch": 0.05012084976466098, + "ewc_loss": 0.0008202967583201826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.202967364923097e-06, + "grad_norm": 4.191145420074463, + "learning_rate": 1.6659601526070368e-07, + "loss": 0.6227, + 
"mean_token_accuracy": 0.8085660338401794, + "num_tokens": 15016630.0, + "step": 394 + }, + { + "epoch": 0.0502480600432515, + "ewc_loss": 0.000840034568682313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400345905101858e-06, + "grad_norm": 3.9163897037506104, + "learning_rate": 1.6701992369648156e-07, + "loss": 0.6053, + "mean_token_accuracy": 0.8110105991363525, + "num_tokens": 15057822.0, + "step": 395 + }, + { + "epoch": 0.050375270321842004, + "ewc_loss": 0.0008431330788880587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.431330570601858e-06, + "grad_norm": 4.426570892333984, + "learning_rate": 1.6744383213225943e-07, + "loss": 0.5779, + "mean_token_accuracy": 0.8156678080558777, + "num_tokens": 15094385.0, + "step": 396 + }, + { + "epoch": 0.05050248060043251, + "ewc_loss": 0.0008519011316820979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519011316820979e-06, + "grad_norm": 3.532395601272583, + "learning_rate": 1.678677405680373e-07, + "loss": 0.5368, + "mean_token_accuracy": 0.8291409015655518, + "num_tokens": 15130969.0, + "step": 397 + }, + { + "epoch": 0.050629690879023026, + "ewc_loss": 0.0008328164694830775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328164767590351e-06, + "grad_norm": 4.734771728515625, + "learning_rate": 1.6829164900381518e-07, + "loss": 0.5585, + "mean_token_accuracy": 0.8214268088340759, + "num_tokens": 15166586.0, + "step": 398 + }, + { + "epoch": 0.05075690115761353, + "ewc_loss": 0.0008512162021361291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512161912221927e-06, + "grad_norm": 3.969331741333008, + "learning_rate": 1.6871555743959305e-07, + "loss": 0.549, + "mean_token_accuracy": 0.8251354694366455, + "num_tokens": 15209603.0, + "step": 399 + }, + { + "epoch": 0.05088411143620405, + "ewc_loss": 0.0008431124733760953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.431125024799258e-06, + "grad_norm": 4.067301273345947, + "learning_rate": 1.6913946587537092e-07, + "loss": 0.5664, + 
"mean_token_accuracy": 0.8206632137298584, + "num_tokens": 15251542.0, + "step": 400 + }, + { + "epoch": 0.051011321714794555, + "ewc_loss": 0.0008399727521464229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399727448704652e-06, + "grad_norm": 5.319714069366455, + "learning_rate": 1.695633743111488e-07, + "loss": 0.5803, + "mean_token_accuracy": 0.8205448985099792, + "num_tokens": 15288940.0, + "step": 401 + }, + { + "epoch": 0.05113853199338506, + "ewc_loss": 0.0008617464336566627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.617464118287899e-06, + "grad_norm": 3.783015012741089, + "learning_rate": 1.6998728274692667e-07, + "loss": 0.5474, + "mean_token_accuracy": 0.8324145078659058, + "num_tokens": 15321965.0, + "step": 402 + }, + { + "epoch": 0.05126574227197558, + "ewc_loss": 0.0008400804363191128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400804290431552e-06, + "grad_norm": 3.7700865268707275, + "learning_rate": 1.7041119118270454e-07, + "loss": 0.5563, + "mean_token_accuracy": 0.8280221223831177, + "num_tokens": 15362895.0, + "step": 403 + }, + { + "epoch": 0.051392952550566084, + "ewc_loss": 0.0008282491471618414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.282491762656718e-06, + "grad_norm": 4.940451622009277, + "learning_rate": 1.7083509961848238e-07, + "loss": 0.6129, + "mean_token_accuracy": 0.8089977502822876, + "num_tokens": 15396675.0, + "step": 404 + }, + { + "epoch": 0.0515201628291566, + "ewc_loss": 0.0008454524213448167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454524504486471e-06, + "grad_norm": 4.022578239440918, + "learning_rate": 1.7125900805426028e-07, + "loss": 0.6346, + "mean_token_accuracy": 0.803713858127594, + "num_tokens": 15443070.0, + "step": 405 + }, + { + "epoch": 0.051647373107747106, + "ewc_loss": 0.0008356374455615878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356374564755242e-06, + "grad_norm": 3.5707647800445557, + "learning_rate": 1.7168291649003813e-07, + "loss": 0.5617, + 
"mean_token_accuracy": 0.823595404624939, + "num_tokens": 15476747.0, + "step": 406 + }, + { + "epoch": 0.051774583386337614, + "ewc_loss": 0.0008179154829122126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.179154974641278e-06, + "grad_norm": 3.49234676361084, + "learning_rate": 1.7210682492581603e-07, + "loss": 0.5873, + "mean_token_accuracy": 0.8181326389312744, + "num_tokens": 15514866.0, + "step": 407 + }, + { + "epoch": 0.05190179366492813, + "ewc_loss": 0.0008093406795524061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093406904663425e-06, + "grad_norm": 3.1803908348083496, + "learning_rate": 1.7253073336159387e-07, + "loss": 0.5643, + "mean_token_accuracy": 0.8219194412231445, + "num_tokens": 15554245.0, + "step": 408 + }, + { + "epoch": 0.052029003943518635, + "ewc_loss": 0.0008017031359486282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.017031177587342e-06, + "grad_norm": 3.454695224761963, + "learning_rate": 1.7295464179737177e-07, + "loss": 0.5938, + "mean_token_accuracy": 0.8171184062957764, + "num_tokens": 15593958.0, + "step": 409 + }, + { + "epoch": 0.05215621422210915, + "ewc_loss": 0.0008135710959322751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.135711141221691e-06, + "grad_norm": 3.708268880844116, + "learning_rate": 1.7337855023314962e-07, + "loss": 0.5387, + "mean_token_accuracy": 0.8306050300598145, + "num_tokens": 15632567.0, + "step": 410 + }, + { + "epoch": 0.05228342450069966, + "ewc_loss": 0.0008269288809970021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.269288628071081e-06, + "grad_norm": 4.143406867980957, + "learning_rate": 1.7380245866892752e-07, + "loss": 0.5472, + "mean_token_accuracy": 0.8272514343261719, + "num_tokens": 15668255.0, + "step": 411 + }, + { + "epoch": 0.052410634779290165, + "ewc_loss": 0.0008445075945928693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.445075764029752e-06, + "grad_norm": 3.688753366470337, + "learning_rate": 1.7422636710470536e-07, + "loss": 0.6, + 
"mean_token_accuracy": 0.8126099705696106, + "num_tokens": 15705435.0, + "step": 412 + }, + { + "epoch": 0.05253784505788068, + "ewc_loss": 0.0008402019157074392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40201937535312e-06, + "grad_norm": 3.4703428745269775, + "learning_rate": 1.7465027554048326e-07, + "loss": 0.6198, + "mean_token_accuracy": 0.8068659901618958, + "num_tokens": 15744657.0, + "step": 413 + }, + { + "epoch": 0.05266505533647119, + "ewc_loss": 0.0008314467850141227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314467777381651e-06, + "grad_norm": 3.8322644233703613, + "learning_rate": 1.750741839762611e-07, + "loss": 0.5641, + "mean_token_accuracy": 0.8254433870315552, + "num_tokens": 15777374.0, + "step": 414 + }, + { + "epoch": 0.052792265615061694, + "ewc_loss": 0.0008382829837501049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382829946640413e-06, + "grad_norm": 3.4472339153289795, + "learning_rate": 1.75498092412039e-07, + "loss": 0.5361, + "mean_token_accuracy": 0.8323286175727844, + "num_tokens": 15814047.0, + "step": 415 + }, + { + "epoch": 0.05291947589365221, + "ewc_loss": 0.000832165707834065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321657332999166e-06, + "grad_norm": 3.4934241771698, + "learning_rate": 1.7592200084781686e-07, + "loss": 0.6168, + "mean_token_accuracy": 0.8075742721557617, + "num_tokens": 15849110.0, + "step": 416 + }, + { + "epoch": 0.053046686172242716, + "ewc_loss": 0.0008326194947585464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326194802066311e-06, + "grad_norm": 3.545766830444336, + "learning_rate": 1.7634590928359475e-07, + "loss": 0.5878, + "mean_token_accuracy": 0.8194212317466736, + "num_tokens": 15888910.0, + "step": 417 + }, + { + "epoch": 0.05317389645083323, + "ewc_loss": 0.0008371046278625727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371046533284243e-06, + "grad_norm": 3.5263423919677734, + "learning_rate": 1.767698177193726e-07, + "loss": 0.5738, + 
"mean_token_accuracy": 0.8205206394195557, + "num_tokens": 15925604.0, + "step": 418 + }, + { + "epoch": 0.05330110672942374, + "ewc_loss": 0.0008420265512540936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.420265658060089e-06, + "grad_norm": 3.68768572807312, + "learning_rate": 1.771937261551505e-07, + "loss": 0.548, + "mean_token_accuracy": 0.8337869644165039, + "num_tokens": 15961401.0, + "step": 419 + }, + { + "epoch": 0.053428317008014245, + "ewc_loss": 0.0008514440851286054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51444110594457e-06, + "grad_norm": 3.3474152088165283, + "learning_rate": 1.7761763459092835e-07, + "loss": 0.5786, + "mean_token_accuracy": 0.8203591704368591, + "num_tokens": 16001029.0, + "step": 420 + }, + { + "epoch": 0.05355552728660476, + "ewc_loss": 0.000843608460854739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.436084499408025e-06, + "grad_norm": 3.9866268634796143, + "learning_rate": 1.7804154302670624e-07, + "loss": 0.56, + "mean_token_accuracy": 0.8253527283668518, + "num_tokens": 16036014.0, + "step": 421 + }, + { + "epoch": 0.05368273756519527, + "ewc_loss": 0.0008604806498624384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604806680523325e-06, + "grad_norm": 3.443892478942871, + "learning_rate": 1.784654514624841e-07, + "loss": 0.5414, + "mean_token_accuracy": 0.8274781107902527, + "num_tokens": 16075373.0, + "step": 422 + }, + { + "epoch": 0.05380994784378578, + "ewc_loss": 0.0008514455403201282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514455657859799e-06, + "grad_norm": 3.4662599563598633, + "learning_rate": 1.7888935989826196e-07, + "loss": 0.5841, + "mean_token_accuracy": 0.8156543970108032, + "num_tokens": 16111352.0, + "step": 423 + }, + { + "epoch": 0.05393715812237629, + "ewc_loss": 0.0008506212616339326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50621290737763e-06, + "grad_norm": 3.15509033203125, + "learning_rate": 1.7931326833403984e-07, + "loss": 0.5999, + "mean_token_accuracy": 
0.8101694583892822, + "num_tokens": 16148826.0, + "step": 424 + }, + { + "epoch": 0.054064368400966796, + "ewc_loss": 0.0008411089656874537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.411089766013902e-06, + "grad_norm": 3.0750303268432617, + "learning_rate": 1.797371767698177e-07, + "loss": 0.5089, + "mean_token_accuracy": 0.8404339551925659, + "num_tokens": 16193491.0, + "step": 425 + }, + { + "epoch": 0.05419157867955731, + "ewc_loss": 0.0008409140864387155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409140718868002e-06, + "grad_norm": 3.5616366863250732, + "learning_rate": 1.8016108520559558e-07, + "loss": 0.5565, + "mean_token_accuracy": 0.8258898854255676, + "num_tokens": 16235595.0, + "step": 426 + }, + { + "epoch": 0.05431878895814782, + "ewc_loss": 0.0008632055833004415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.632056051283143e-06, + "grad_norm": 4.077852725982666, + "learning_rate": 1.8058499364137345e-07, + "loss": 0.6104, + "mean_token_accuracy": 0.814926266670227, + "num_tokens": 16274105.0, + "step": 427 + }, + { + "epoch": 0.054445999236738325, + "ewc_loss": 0.0008902635890990496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.9026361820288e-06, + "grad_norm": 3.2111802101135254, + "learning_rate": 1.8100890207715133e-07, + "loss": 0.6539, + "mean_token_accuracy": 0.7934185266494751, + "num_tokens": 16315886.0, + "step": 428 + }, + { + "epoch": 0.05457320951532884, + "ewc_loss": 0.0008714954019524157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.71495376486564e-06, + "grad_norm": 3.1457481384277344, + "learning_rate": 1.814328105129292e-07, + "loss": 0.595, + "mean_token_accuracy": 0.8130544424057007, + "num_tokens": 16353021.0, + "step": 429 + }, + { + "epoch": 0.05470041979391935, + "ewc_loss": 0.0008614221005700529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614220860181376e-06, + "grad_norm": 3.6989545822143555, + "learning_rate": 1.8185671894870707e-07, + "loss": 0.5703, + "mean_token_accuracy": 0.8187055587768555, 
+ "num_tokens": 16380887.0, + "step": 430 + }, + { + "epoch": 0.05482763007250986, + "ewc_loss": 0.0008816675981506705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.816676199785434e-06, + "grad_norm": 3.0517771244049072, + "learning_rate": 1.8228062738448494e-07, + "loss": 0.6258, + "mean_token_accuracy": 0.8071064949035645, + "num_tokens": 16421066.0, + "step": 431 + }, + { + "epoch": 0.05495484035110037, + "ewc_loss": 0.0008728140383027494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.728140528546646e-06, + "grad_norm": 3.9932491779327393, + "learning_rate": 1.8270453582026282e-07, + "loss": 0.5939, + "mean_token_accuracy": 0.8141324520111084, + "num_tokens": 16457197.0, + "step": 432 + }, + { + "epoch": 0.055082050629690876, + "ewc_loss": 0.0009049862273968756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.049862455867697e-06, + "grad_norm": 3.1286370754241943, + "learning_rate": 1.831284442560407e-07, + "loss": 0.5501, + "mean_token_accuracy": 0.8263768553733826, + "num_tokens": 16497385.0, + "step": 433 + }, + { + "epoch": 0.05520926090828139, + "ewc_loss": 0.0008869143784977496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.869144039636012e-06, + "grad_norm": 3.0051610469818115, + "learning_rate": 1.8355235269181856e-07, + "loss": 0.5649, + "mean_token_accuracy": 0.8234003186225891, + "num_tokens": 16540099.0, + "step": 434 + }, + { + "epoch": 0.0553364711868719, + "ewc_loss": 0.0008771795546635985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.771795364737045e-06, + "grad_norm": 3.6197054386138916, + "learning_rate": 1.8397626112759643e-07, + "loss": 0.5613, + "mean_token_accuracy": 0.8240504264831543, + "num_tokens": 16572760.0, + "step": 435 + }, + { + "epoch": 0.05546368146546241, + "ewc_loss": 0.0009014105889946222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.014105671667494e-06, + "grad_norm": 3.2149128913879395, + "learning_rate": 1.844001695633743e-07, + "loss": 0.5209, + "mean_token_accuracy": 0.8350264430046082, + "num_tokens": 
16608549.0, + "step": 436 + }, + { + "epoch": 0.05559089174405292, + "ewc_loss": 0.0008994320523925126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.994320523925126e-06, + "grad_norm": 3.920259952545166, + "learning_rate": 1.8482407799915218e-07, + "loss": 0.5579, + "mean_token_accuracy": 0.8253370523452759, + "num_tokens": 16643800.0, + "step": 437 + }, + { + "epoch": 0.05571810202264343, + "ewc_loss": 0.000924015766941011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.24015785130905e-06, + "grad_norm": 3.868375301361084, + "learning_rate": 1.8524798643493005e-07, + "loss": 0.631, + "mean_token_accuracy": 0.8004575967788696, + "num_tokens": 16677948.0, + "step": 438 + }, + { + "epoch": 0.05584531230123394, + "ewc_loss": 0.0009324614657089114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.324614438810386e-06, + "grad_norm": 3.003857374191284, + "learning_rate": 1.8567189487070792e-07, + "loss": 0.605, + "mean_token_accuracy": 0.8119800090789795, + "num_tokens": 16716905.0, + "step": 439 + }, + { + "epoch": 0.05597252257982445, + "ewc_loss": 0.0009012189111672342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.012189366330858e-06, + "grad_norm": 3.310128927230835, + "learning_rate": 1.8609580330648577e-07, + "loss": 0.5167, + "mean_token_accuracy": 0.8357115983963013, + "num_tokens": 16756158.0, + "step": 440 + }, + { + "epoch": 0.05609973285841496, + "ewc_loss": 0.0009034143877215683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.034143658936955e-06, + "grad_norm": 4.075357437133789, + "learning_rate": 1.8651971174226367e-07, + "loss": 0.6074, + "mean_token_accuracy": 0.8133885264396667, + "num_tokens": 16794261.0, + "step": 441 + }, + { + "epoch": 0.05622694313700547, + "ewc_loss": 0.0009336445946246386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.33644605538575e-06, + "grad_norm": 2.941096544265747, + "learning_rate": 1.8694362017804152e-07, + "loss": 0.5641, + "mean_token_accuracy": 0.8215110301971436, + "num_tokens": 16834953.0, + "step": 
442 + }, + { + "epoch": 0.05635415341559598, + "ewc_loss": 0.0009038419229909778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.03841919352999e-06, + "grad_norm": 4.592951774597168, + "learning_rate": 1.8736752861381941e-07, + "loss": 0.5234, + "mean_token_accuracy": 0.8357095122337341, + "num_tokens": 16873954.0, + "step": 443 + }, + { + "epoch": 0.05648136369418649, + "ewc_loss": 0.0009492267272435129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.4922670541564e-06, + "grad_norm": 3.71821928024292, + "learning_rate": 1.8779143704959726e-07, + "loss": 0.5447, + "mean_token_accuracy": 0.8253798484802246, + "num_tokens": 16906018.0, + "step": 444 + }, + { + "epoch": 0.056608573972777, + "ewc_loss": 0.0009420251590199769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.420251444680616e-06, + "grad_norm": 3.7712855339050293, + "learning_rate": 1.8821534548537516e-07, + "loss": 0.5978, + "mean_token_accuracy": 0.8145521283149719, + "num_tokens": 16938945.0, + "step": 445 + }, + { + "epoch": 0.05673578425136751, + "ewc_loss": 0.0009375798981636763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.375798981636763e-06, + "grad_norm": 3.2472829818725586, + "learning_rate": 1.88639253921153e-07, + "loss": 0.5568, + "mean_token_accuracy": 0.8271791338920593, + "num_tokens": 16982214.0, + "step": 446 + }, + { + "epoch": 0.05686299452995802, + "ewc_loss": 0.0009143998613581061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.143998795480002e-06, + "grad_norm": 3.162325859069824, + "learning_rate": 1.890631623569309e-07, + "loss": 0.5655, + "mean_token_accuracy": 0.8221235871315002, + "num_tokens": 17024367.0, + "step": 447 + }, + { + "epoch": 0.05699020480854853, + "ewc_loss": 0.0009043985046446323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.04398530110484e-06, + "grad_norm": 4.031844615936279, + "learning_rate": 1.8948707079270875e-07, + "loss": 0.5876, + "mean_token_accuracy": 0.8115262985229492, + "num_tokens": 17057451.0, + "step": 448 + }, + { + "epoch": 
0.057117415087139044, + "ewc_loss": 0.0009406183380633593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.406183380633593e-06, + "grad_norm": 4.210582256317139, + "learning_rate": 1.8991097922848665e-07, + "loss": 0.5391, + "mean_token_accuracy": 0.8292074203491211, + "num_tokens": 17094633.0, + "step": 449 + }, + { + "epoch": 0.05724462536572955, + "ewc_loss": 0.0009631384164094925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.631384273234289e-06, + "grad_norm": 3.3379409313201904, + "learning_rate": 1.903348876642645e-07, + "loss": 0.5929, + "mean_token_accuracy": 0.8114311099052429, + "num_tokens": 17135892.0, + "step": 450 + }, + { + "epoch": 0.05737183564432006, + "ewc_loss": 0.0009339308016933501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.33930823521223e-06, + "grad_norm": 3.9348182678222656, + "learning_rate": 1.907587961000424e-07, + "loss": 0.5367, + "mean_token_accuracy": 0.8314437866210938, + "num_tokens": 17173881.0, + "step": 451 + }, + { + "epoch": 0.05749904592291057, + "ewc_loss": 0.0009382679127156734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.382679309055675e-06, + "grad_norm": 3.5474395751953125, + "learning_rate": 1.9118270453582024e-07, + "loss": 0.5852, + "mean_token_accuracy": 0.8175249695777893, + "num_tokens": 17215668.0, + "step": 452 + }, + { + "epoch": 0.05762625620150108, + "ewc_loss": 0.0009288432192988694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.288432011089753e-06, + "grad_norm": 3.734053373336792, + "learning_rate": 1.9160661297159814e-07, + "loss": 0.6084, + "mean_token_accuracy": 0.8117145299911499, + "num_tokens": 17254339.0, + "step": 453 + }, + { + "epoch": 0.057753466480091595, + "ewc_loss": 0.0009341047261841595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.341047189082019e-06, + "grad_norm": 3.4795949459075928, + "learning_rate": 1.9203052140737599e-07, + "loss": 0.5942, + "mean_token_accuracy": 0.8118317723274231, + "num_tokens": 17288858.0, + "step": 454 + }, + { + "epoch": 
0.0578806767586821, + "ewc_loss": 0.0009268171270377934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.268171197618358e-06, + "grad_norm": 3.5597996711730957, + "learning_rate": 1.9245442984315389e-07, + "loss": 0.5704, + "mean_token_accuracy": 0.8236197829246521, + "num_tokens": 17330601.0, + "step": 455 + }, + { + "epoch": 0.05800788703727261, + "ewc_loss": 0.0009288484579883516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.288484761782456e-06, + "grad_norm": 3.5127575397491455, + "learning_rate": 1.9287833827893173e-07, + "loss": 0.5904, + "mean_token_accuracy": 0.8144149780273438, + "num_tokens": 17367797.0, + "step": 456 + }, + { + "epoch": 0.058135097315863124, + "ewc_loss": 0.0009326047729700804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.32604780246038e-06, + "grad_norm": 3.3458499908447266, + "learning_rate": 1.9330224671470963e-07, + "loss": 0.5754, + "mean_token_accuracy": 0.8193576335906982, + "num_tokens": 17403206.0, + "step": 457 + }, + { + "epoch": 0.05826230759445363, + "ewc_loss": 0.0009271929156966507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.271929229726084e-06, + "grad_norm": 3.8993756771087646, + "learning_rate": 1.9372615515048748e-07, + "loss": 0.6216, + "mean_token_accuracy": 0.8085041642189026, + "num_tokens": 17437207.0, + "step": 458 + }, + { + "epoch": 0.05838951787304414, + "ewc_loss": 0.0009458804852329195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.45880492508877e-06, + "grad_norm": 3.652946710586548, + "learning_rate": 1.9415006358626535e-07, + "loss": 0.5578, + "mean_token_accuracy": 0.8267952799797058, + "num_tokens": 17473335.0, + "step": 459 + }, + { + "epoch": 0.05851672815163465, + "ewc_loss": 0.0009472022647969425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.472022611589637e-06, + "grad_norm": 4.558284282684326, + "learning_rate": 1.9457397202204322e-07, + "loss": 0.5485, + "mean_token_accuracy": 0.8236469030380249, + "num_tokens": 17505104.0, + "step": 460 + }, + { + "epoch": 
0.05864393843022516, + "ewc_loss": 0.0009754030616022646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.75403054326307e-06, + "grad_norm": 3.8752145767211914, + "learning_rate": 1.949978804578211e-07, + "loss": 0.5313, + "mean_token_accuracy": 0.8340885043144226, + "num_tokens": 17539052.0, + "step": 461 + }, + { + "epoch": 0.058771148708815675, + "ewc_loss": 0.0009631594293750823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.631594366510399e-06, + "grad_norm": 2.9093339443206787, + "learning_rate": 1.9542178889359897e-07, + "loss": 0.6052, + "mean_token_accuracy": 0.8093667030334473, + "num_tokens": 17578942.0, + "step": 462 + }, + { + "epoch": 0.05889835898740618, + "ewc_loss": 0.0009181685745716095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.181685527437367e-06, + "grad_norm": 3.4500672817230225, + "learning_rate": 1.9584569732937684e-07, + "loss": 0.5828, + "mean_token_accuracy": 0.8213487267494202, + "num_tokens": 17611931.0, + "step": 463 + }, + { + "epoch": 0.05902556926599669, + "ewc_loss": 0.0009320268873125315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.320268873125315e-06, + "grad_norm": 4.096554279327393, + "learning_rate": 1.962696057651547e-07, + "loss": 0.4895, + "mean_token_accuracy": 0.8451437950134277, + "num_tokens": 17648764.0, + "step": 464 + }, + { + "epoch": 0.059152779544587204, + "ewc_loss": 0.0009671744192019105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.671744010120165e-06, + "grad_norm": 3.5536153316497803, + "learning_rate": 1.9669351420093258e-07, + "loss": 0.5996, + "mean_token_accuracy": 0.812279224395752, + "num_tokens": 17683099.0, + "step": 465 + }, + { + "epoch": 0.05927998982317771, + "ewc_loss": 0.0009652996668592095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.65299659583252e-06, + "grad_norm": 3.1328678131103516, + "learning_rate": 1.9711742263671046e-07, + "loss": 0.5855, + "mean_token_accuracy": 0.8207023739814758, + "num_tokens": 17727120.0, + "step": 466 + }, + { + "epoch": 
0.059407200101768226, + "ewc_loss": 0.0009382017306052148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.382017196912784e-06, + "grad_norm": 4.000699520111084, + "learning_rate": 1.9754133107248833e-07, + "loss": 0.539, + "mean_token_accuracy": 0.8241766691207886, + "num_tokens": 17761676.0, + "step": 467 + }, + { + "epoch": 0.059534410380358734, + "ewc_loss": 0.0009656741167418659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.656740985519718e-06, + "grad_norm": 3.310607433319092, + "learning_rate": 1.979652395082662e-07, + "loss": 0.6072, + "mean_token_accuracy": 0.8118778467178345, + "num_tokens": 17802311.0, + "step": 468 + }, + { + "epoch": 0.05966162065894924, + "ewc_loss": 0.0009531382820568979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.53138260229025e-06, + "grad_norm": 2.867687463760376, + "learning_rate": 1.9838914794404408e-07, + "loss": 0.4986, + "mean_token_accuracy": 0.8395534157752991, + "num_tokens": 17839048.0, + "step": 469 + }, + { + "epoch": 0.059788830937539755, + "ewc_loss": 0.0009296723874285817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.296723874285817e-06, + "grad_norm": 3.091507911682129, + "learning_rate": 1.9881305637982195e-07, + "loss": 0.5698, + "mean_token_accuracy": 0.8212317228317261, + "num_tokens": 17881510.0, + "step": 470 + }, + { + "epoch": 0.05991604121613026, + "ewc_loss": 0.0009378719259984791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.378719369124155e-06, + "grad_norm": 3.6685760021209717, + "learning_rate": 1.9923696481559982e-07, + "loss": 0.5936, + "mean_token_accuracy": 0.8159617185592651, + "num_tokens": 17915080.0, + "step": 471 + }, + { + "epoch": 0.06004325149472077, + "ewc_loss": 0.0009727795259095728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.727795259095728e-06, + "grad_norm": 3.6003732681274414, + "learning_rate": 1.996608732513777e-07, + "loss": 0.6271, + "mean_token_accuracy": 0.8062000870704651, + "num_tokens": 17957972.0, + "step": 472 + }, + { + "epoch": 
0.060170461773311285, + "ewc_loss": 0.0009818696416914463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.818696526053827e-06, + "grad_norm": 3.132619857788086, + "learning_rate": 2.0008478168715557e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8390794992446899, + "num_tokens": 17998008.0, + "step": 473 + }, + { + "epoch": 0.06029767205190179, + "ewc_loss": 0.0009582675411365926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.582675374986138e-06, + "grad_norm": 3.040884494781494, + "learning_rate": 2.0050869012293344e-07, + "loss": 0.5053, + "mean_token_accuracy": 0.8362827301025391, + "num_tokens": 18032427.0, + "step": 474 + }, + { + "epoch": 0.060424882330492306, + "ewc_loss": 0.0009482402820140123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.482402674620971e-06, + "grad_norm": 2.9025259017944336, + "learning_rate": 2.009325985587113e-07, + "loss": 0.536, + "mean_token_accuracy": 0.8303592205047607, + "num_tokens": 18069848.0, + "step": 475 + }, + { + "epoch": 0.060552092609082814, + "ewc_loss": 0.0009459874709136784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.459874490858056e-06, + "grad_norm": 3.622262477874756, + "learning_rate": 2.0135650699448918e-07, + "loss": 0.6282, + "mean_token_accuracy": 0.8032381534576416, + "num_tokens": 18109240.0, + "step": 476 + }, + { + "epoch": 0.06067930288767332, + "ewc_loss": 0.0009870490757748485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.870490430330392e-06, + "grad_norm": 3.25325083732605, + "learning_rate": 2.0178041543026706e-07, + "loss": 0.5987, + "mean_token_accuracy": 0.8143975734710693, + "num_tokens": 18146849.0, + "step": 477 + }, + { + "epoch": 0.060806513166263836, + "ewc_loss": 0.0009834114462137222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.834114280238282e-06, + "grad_norm": 3.7454752922058105, + "learning_rate": 2.022043238660449e-07, + "loss": 0.6289, + "mean_token_accuracy": 0.8032098412513733, + "num_tokens": 18186872.0, + "step": 478 + }, + { + "epoch": 
0.06093372344485434, + "ewc_loss": 0.0010017749154940248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0017749445978552e-05, + "grad_norm": 3.3304004669189453, + "learning_rate": 2.026282323018228e-07, + "loss": 0.6039, + "mean_token_accuracy": 0.813483476638794, + "num_tokens": 18222299.0, + "step": 479 + }, + { + "epoch": 0.06106093372344486, + "ewc_loss": 0.0009871934307739139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.871933798422106e-06, + "grad_norm": 2.824009418487549, + "learning_rate": 2.0305214073760065e-07, + "loss": 0.6081, + "mean_token_accuracy": 0.8134537935256958, + "num_tokens": 18261645.0, + "step": 480 + }, + { + "epoch": 0.061188144002035365, + "ewc_loss": 0.0009589114342816174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.58911459747469e-06, + "grad_norm": 3.414790153503418, + "learning_rate": 2.0347604917337855e-07, + "loss": 0.5786, + "mean_token_accuracy": 0.8183363676071167, + "num_tokens": 18292970.0, + "step": 481 + }, + { + "epoch": 0.06131535428062587, + "ewc_loss": 0.0009890218498185277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.890218279906549e-06, + "grad_norm": 2.9215924739837646, + "learning_rate": 2.038999576091564e-07, + "loss": 0.527, + "mean_token_accuracy": 0.8361397981643677, + "num_tokens": 18331133.0, + "step": 482 + }, + { + "epoch": 0.06144256455921639, + "ewc_loss": 0.000976729323156178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.767292795004323e-06, + "grad_norm": 2.8871023654937744, + "learning_rate": 2.043238660449343e-07, + "loss": 0.6122, + "mean_token_accuracy": 0.8090972900390625, + "num_tokens": 18368905.0, + "step": 483 + }, + { + "epoch": 0.061569774837806894, + "ewc_loss": 0.0009749594028107822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.749594028107822e-06, + "grad_norm": 3.1278955936431885, + "learning_rate": 2.0474777448071214e-07, + "loss": 0.538, + "mean_token_accuracy": 0.8324052095413208, + "num_tokens": 18406362.0, + "step": 484 + }, + { + "epoch": 0.0616969851163974, 
+ "ewc_loss": 0.0009920505108311772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.920505362970289e-06, + "grad_norm": 3.1051573753356934, + "learning_rate": 2.0517168291649004e-07, + "loss": 0.5734, + "mean_token_accuracy": 0.8215954899787903, + "num_tokens": 18439246.0, + "step": 485 + }, + { + "epoch": 0.061824195394987916, + "ewc_loss": 0.0010043960064649582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0043960173788946e-05, + "grad_norm": 3.302572727203369, + "learning_rate": 2.0559559135226788e-07, + "loss": 0.5782, + "mean_token_accuracy": 0.8168659806251526, + "num_tokens": 18473408.0, + "step": 486 + }, + { + "epoch": 0.06195140567357842, + "ewc_loss": 0.0010191386099904776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0191386536462232e-05, + "grad_norm": 2.9638171195983887, + "learning_rate": 2.0601949978804578e-07, + "loss": 0.5319, + "mean_token_accuracy": 0.8311502933502197, + "num_tokens": 18513246.0, + "step": 487 + }, + { + "epoch": 0.06207861595216894, + "ewc_loss": 0.0010049535194411874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0049535376310814e-05, + "grad_norm": 2.8746178150177, + "learning_rate": 2.0644340822382363e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8390238285064697, + "num_tokens": 18549125.0, + "step": 488 + }, + { + "epoch": 0.062205826230759445, + "ewc_loss": 0.001000619726255536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0006197044276632e-05, + "grad_norm": 3.1859006881713867, + "learning_rate": 2.0686731665960153e-07, + "loss": 0.5545, + "mean_token_accuracy": 0.8307241201400757, + "num_tokens": 18585306.0, + "step": 489 + }, + { + "epoch": 0.06233303650934995, + "ewc_loss": 0.0010266798781231046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0266799108649138e-05, + "grad_norm": 3.171896457672119, + "learning_rate": 2.0729122509537937e-07, + "loss": 0.5133, + "mean_token_accuracy": 0.838667094707489, + "num_tokens": 18624378.0, + "step": 490 + }, + { + "epoch": 0.06246024678794047, + "ewc_loss": 
0.001034808112308383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0348081559641287e-05, + "grad_norm": 3.4027106761932373, + "learning_rate": 2.0771513353115727e-07, + "loss": 0.5565, + "mean_token_accuracy": 0.8239606618881226, + "num_tokens": 18660814.0, + "step": 491 + }, + { + "epoch": 0.06258745706653097, + "ewc_loss": 0.0010503326775506139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0503326848265715e-05, + "grad_norm": 2.7931370735168457, + "learning_rate": 2.0813904196693512e-07, + "loss": 0.5265, + "mean_token_accuracy": 0.8344353437423706, + "num_tokens": 18702129.0, + "step": 492 + }, + { + "epoch": 0.06271466734512149, + "ewc_loss": 0.0010190093889832497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0190094144491013e-05, + "grad_norm": 3.55887770652771, + "learning_rate": 2.0856295040271302e-07, + "loss": 0.5958, + "mean_token_accuracy": 0.8129519820213318, + "num_tokens": 18737496.0, + "step": 493 + }, + { + "epoch": 0.06284187762371199, + "ewc_loss": 0.0010573051404207945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0573051440587733e-05, + "grad_norm": 2.7164318561553955, + "learning_rate": 2.0898685883849086e-07, + "loss": 0.5547, + "mean_token_accuracy": 0.8284092545509338, + "num_tokens": 18779332.0, + "step": 494 + }, + { + "epoch": 0.0629690879023025, + "ewc_loss": 0.0010247629834339023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.024762968881987e-05, + "grad_norm": 2.705479860305786, + "learning_rate": 2.0941076727426874e-07, + "loss": 0.5402, + "mean_token_accuracy": 0.8301683068275452, + "num_tokens": 18817976.0, + "step": 495 + }, + { + "epoch": 0.06309629818089302, + "ewc_loss": 0.0010247960453853011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0247960744891316e-05, + "grad_norm": 2.4568419456481934, + "learning_rate": 2.098346757100466e-07, + "loss": 0.531, + "mean_token_accuracy": 0.8320217132568359, + "num_tokens": 18862860.0, + "step": 496 + }, + { + "epoch": 0.06322350845948353, + "ewc_loss": 
0.0010155069176107645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0155069503525738e-05, + "grad_norm": 3.048403739929199, + "learning_rate": 2.1025858414582448e-07, + "loss": 0.5713, + "mean_token_accuracy": 0.8192679286003113, + "num_tokens": 18899025.0, + "step": 497 + }, + { + "epoch": 0.06335071873807403, + "ewc_loss": 0.0010658386163413525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0658386599970981e-05, + "grad_norm": 2.814354419708252, + "learning_rate": 2.1068249258160238e-07, + "loss": 0.5365, + "mean_token_accuracy": 0.8238769173622131, + "num_tokens": 18933118.0, + "step": 498 + }, + { + "epoch": 0.06347792901666455, + "ewc_loss": 0.0010639647953212261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0639648280630354e-05, + "grad_norm": 2.9269373416900635, + "learning_rate": 2.1110640101738023e-07, + "loss": 0.538, + "mean_token_accuracy": 0.8319272994995117, + "num_tokens": 18969165.0, + "step": 499 + }, + { + "epoch": 0.06360513929525506, + "ewc_loss": 0.0010720662539824843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.072066243068548e-05, + "grad_norm": 2.9578583240509033, + "learning_rate": 2.1153030945315813e-07, + "loss": 0.5473, + "mean_token_accuracy": 0.8261195421218872, + "num_tokens": 19003882.0, + "step": 500 + }, + { + "epoch": 0.06373234957384556, + "ewc_loss": 0.001080782967619598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0807829312398098e-05, + "grad_norm": 3.0219452381134033, + "learning_rate": 2.1195421788893597e-07, + "loss": 0.5471, + "mean_token_accuracy": 0.8296693563461304, + "num_tokens": 19037540.0, + "step": 501 + }, + { + "epoch": 0.06385955985243608, + "ewc_loss": 0.0010901197092607617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0901197128987405e-05, + "grad_norm": 2.792937755584717, + "learning_rate": 2.1237812632471387e-07, + "loss": 0.5583, + "mean_token_accuracy": 0.8202411532402039, + "num_tokens": 19077270.0, + "step": 502 + }, + { + "epoch": 0.06398677013102659, + "ewc_loss": 
0.0010778132127597928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0778132491395809e-05, + "grad_norm": 2.7964909076690674, + "learning_rate": 2.1280203476049172e-07, + "loss": 0.6447, + "mean_token_accuracy": 0.7949936389923096, + "num_tokens": 19117579.0, + "step": 503 + }, + { + "epoch": 0.06411398040961709, + "ewc_loss": 0.001079805544577539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0798054972838145e-05, + "grad_norm": 2.6864774227142334, + "learning_rate": 2.1322594319626962e-07, + "loss": 0.5847, + "mean_token_accuracy": 0.8154870271682739, + "num_tokens": 19156599.0, + "step": 504 + }, + { + "epoch": 0.0642411906882076, + "ewc_loss": 0.0010792625835165381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.079262528946856e-05, + "grad_norm": 3.3635361194610596, + "learning_rate": 2.1364985163204746e-07, + "loss": 0.5753, + "mean_token_accuracy": 0.8189011812210083, + "num_tokens": 19187367.0, + "step": 505 + }, + { + "epoch": 0.06436840096679812, + "ewc_loss": 0.00112835131585598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1283513231319375e-05, + "grad_norm": 3.1610500812530518, + "learning_rate": 2.1407376006782536e-07, + "loss": 0.5116, + "mean_token_accuracy": 0.8361823558807373, + "num_tokens": 19223916.0, + "step": 506 + }, + { + "epoch": 0.06449561124538863, + "ewc_loss": 0.0011296771699562669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1296771845081821e-05, + "grad_norm": 2.9013118743896484, + "learning_rate": 2.144976685036032e-07, + "loss": 0.5256, + "mean_token_accuracy": 0.8347365260124207, + "num_tokens": 19260336.0, + "step": 507 + }, + { + "epoch": 0.06462282152397913, + "ewc_loss": 0.0011112167267128825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1112167157989461e-05, + "grad_norm": 2.7135157585144043, + "learning_rate": 2.149215769393811e-07, + "loss": 0.587, + "mean_token_accuracy": 0.8170906901359558, + "num_tokens": 19299895.0, + "step": 508 + }, + { + "epoch": 0.06475003180256965, + "ewc_loss": 
0.0010941217187792063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.094121762434952e-05, + "grad_norm": 3.1385836601257324, + "learning_rate": 2.1534548537515895e-07, + "loss": 0.5299, + "mean_token_accuracy": 0.8316280841827393, + "num_tokens": 19336614.0, + "step": 509 + }, + { + "epoch": 0.06487724208116016, + "ewc_loss": 0.0011224759509786963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1224758964090142e-05, + "grad_norm": 2.6874618530273438, + "learning_rate": 2.1576939381093685e-07, + "loss": 0.5806, + "mean_token_accuracy": 0.8190348744392395, + "num_tokens": 19378683.0, + "step": 510 + }, + { + "epoch": 0.06500445235975066, + "ewc_loss": 0.0011060141259804368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1060141332563944e-05, + "grad_norm": 2.5919394493103027, + "learning_rate": 2.161933022467147e-07, + "loss": 0.564, + "mean_token_accuracy": 0.819223165512085, + "num_tokens": 19422598.0, + "step": 511 + }, + { + "epoch": 0.06513166263834118, + "ewc_loss": 0.0010979275684803724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0979275430145208e-05, + "grad_norm": 2.518256664276123, + "learning_rate": 2.166172106824926e-07, + "loss": 0.5548, + "mean_token_accuracy": 0.8281524181365967, + "num_tokens": 19466972.0, + "step": 512 + }, + { + "epoch": 0.06525887291693169, + "ewc_loss": 0.0010983216343447566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.098321627068799e-05, + "grad_norm": 3.325937032699585, + "learning_rate": 2.1704111911827044e-07, + "loss": 0.5449, + "mean_token_accuracy": 0.8233153820037842, + "num_tokens": 19503809.0, + "step": 513 + }, + { + "epoch": 0.0653860831955222, + "ewc_loss": 0.001167227397672832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1672274013108108e-05, + "grad_norm": 2.927309513092041, + "learning_rate": 2.1746502755404831e-07, + "loss": 0.6006, + "mean_token_accuracy": 0.8180954456329346, + "num_tokens": 19545713.0, + "step": 514 + }, + { + "epoch": 0.06551329347411271, + "ewc_loss": 
0.001153238001279533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1532380085554905e-05, + "grad_norm": 2.746910572052002, + "learning_rate": 2.178889359898262e-07, + "loss": 0.5866, + "mean_token_accuracy": 0.8191039562225342, + "num_tokens": 19582272.0, + "step": 515 + }, + { + "epoch": 0.06564050375270322, + "ewc_loss": 0.0011329661356285214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1329660992487334e-05, + "grad_norm": 2.639636754989624, + "learning_rate": 2.1831284442560406e-07, + "loss": 0.602, + "mean_token_accuracy": 0.8131691217422485, + "num_tokens": 19625574.0, + "step": 516 + }, + { + "epoch": 0.06576771403129372, + "ewc_loss": 0.0011214240221306682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1214239748369437e-05, + "grad_norm": 3.0002195835113525, + "learning_rate": 2.1873675286138193e-07, + "loss": 0.5896, + "mean_token_accuracy": 0.8126276135444641, + "num_tokens": 19667791.0, + "step": 517 + }, + { + "epoch": 0.06589492430988424, + "ewc_loss": 0.0011531499912962317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1531499694683589e-05, + "grad_norm": 2.753387928009033, + "learning_rate": 2.191606612971598e-07, + "loss": 0.6178, + "mean_token_accuracy": 0.8071136474609375, + "num_tokens": 19708289.0, + "step": 518 + }, + { + "epoch": 0.06602213458847475, + "ewc_loss": 0.0011416497873142362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.141649772762321e-05, + "grad_norm": 2.8854405879974365, + "learning_rate": 2.1958456973293768e-07, + "loss": 0.5968, + "mean_token_accuracy": 0.8088433742523193, + "num_tokens": 19747444.0, + "step": 519 + }, + { + "epoch": 0.06614934486706527, + "ewc_loss": 0.0011500177206471562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1500177606649231e-05, + "grad_norm": 2.838224172592163, + "learning_rate": 2.2000847816871555e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.8409953117370605, + "num_tokens": 19787061.0, + "step": 520 + }, + { + "epoch": 0.06627655514565577, + "ewc_loss": 
0.0011523921275511384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1523920875333715e-05, + "grad_norm": 2.5374772548675537, + "learning_rate": 2.2043238660449342e-07, + "loss": 0.5275, + "mean_token_accuracy": 0.834182858467102, + "num_tokens": 19826893.0, + "step": 521 + }, + { + "epoch": 0.06640376542424628, + "ewc_loss": 0.0011305122170597315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1305121915938798e-05, + "grad_norm": 2.7214431762695312, + "learning_rate": 2.208562950402713e-07, + "loss": 0.5205, + "mean_token_accuracy": 0.8322402834892273, + "num_tokens": 19864285.0, + "step": 522 + }, + { + "epoch": 0.0665309757028368, + "ewc_loss": 0.0011433983454480767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1433983672759496e-05, + "grad_norm": 2.5963852405548096, + "learning_rate": 2.2128020347604917e-07, + "loss": 0.5564, + "mean_token_accuracy": 0.825290322303772, + "num_tokens": 19903806.0, + "step": 523 + }, + { + "epoch": 0.0666581859814273, + "ewc_loss": 0.0011435950873419642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.143595090979943e-05, + "grad_norm": 2.682539939880371, + "learning_rate": 2.2170411191182704e-07, + "loss": 0.5336, + "mean_token_accuracy": 0.829940915107727, + "num_tokens": 19950382.0, + "step": 524 + }, + { + "epoch": 0.06678539626001781, + "ewc_loss": 0.0011530007468536496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.153000721387798e-05, + "grad_norm": 2.8394031524658203, + "learning_rate": 2.221280203476049e-07, + "loss": 0.5575, + "mean_token_accuracy": 0.8222602605819702, + "num_tokens": 19984765.0, + "step": 525 + }, + { + "epoch": 0.06691260653860832, + "ewc_loss": 0.0011713204439729452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1713204003171995e-05, + "grad_norm": 2.699871063232422, + "learning_rate": 2.2255192878338279e-07, + "loss": 0.5967, + "mean_token_accuracy": 0.8149552941322327, + "num_tokens": 20021081.0, + "step": 526 + }, + { + "epoch": 0.06703981681719882, + "ewc_loss": 
0.001163011766038835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.163011711469153e-05, + "grad_norm": 2.840449094772339, + "learning_rate": 2.2297583721916066e-07, + "loss": 0.5678, + "mean_token_accuracy": 0.820286750793457, + "num_tokens": 20055860.0, + "step": 527 + }, + { + "epoch": 0.06716702709578934, + "ewc_loss": 0.001174159930087626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1741599337256048e-05, + "grad_norm": 2.766587734222412, + "learning_rate": 2.2339974565493853e-07, + "loss": 0.5248, + "mean_token_accuracy": 0.8359790444374084, + "num_tokens": 20095023.0, + "step": 528 + }, + { + "epoch": 0.06729423737437985, + "ewc_loss": 0.0011714542051777244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1714541869878303e-05, + "grad_norm": 2.7831873893737793, + "learning_rate": 2.238236540907164e-07, + "loss": 0.5787, + "mean_token_accuracy": 0.8187743425369263, + "num_tokens": 20134620.0, + "step": 529 + }, + { + "epoch": 0.06742144765297035, + "ewc_loss": 0.0011747275711968541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1747275493689813e-05, + "grad_norm": 2.8303656578063965, + "learning_rate": 2.2424756252649428e-07, + "loss": 0.5477, + "mean_token_accuracy": 0.824466347694397, + "num_tokens": 20169690.0, + "step": 530 + }, + { + "epoch": 0.06754865793156087, + "ewc_loss": 0.0011775755556300282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.177575541078113e-05, + "grad_norm": 2.9029414653778076, + "learning_rate": 2.2467147096227215e-07, + "loss": 0.5405, + "mean_token_accuracy": 0.8294446468353271, + "num_tokens": 20203609.0, + "step": 531 + }, + { + "epoch": 0.06767586821015138, + "ewc_loss": 0.0011890161549672484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1890161658811849e-05, + "grad_norm": 2.929957151412964, + "learning_rate": 2.2509537939805002e-07, + "loss": 0.5465, + "mean_token_accuracy": 0.8251497745513916, + "num_tokens": 20234616.0, + "step": 532 + }, + { + "epoch": 0.0678030784887419, + "ewc_loss": 
0.0011967032914981246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1967033060500398e-05, + "grad_norm": 2.613223075866699, + "learning_rate": 2.2551928783382787e-07, + "loss": 0.5387, + "mean_token_accuracy": 0.8258353471755981, + "num_tokens": 20271634.0, + "step": 533 + }, + { + "epoch": 0.0679302887673324, + "ewc_loss": 0.0011722719063982368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1722719136741944e-05, + "grad_norm": 2.7982051372528076, + "learning_rate": 2.2594319626960577e-07, + "loss": 0.5664, + "mean_token_accuracy": 0.8246477842330933, + "num_tokens": 20309683.0, + "step": 534 + }, + { + "epoch": 0.06805749904592291, + "ewc_loss": 0.001184792723506689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1847927453345619e-05, + "grad_norm": 2.5557756423950195, + "learning_rate": 2.263671047053836e-07, + "loss": 0.4803, + "mean_token_accuracy": 0.8419005274772644, + "num_tokens": 20346013.0, + "step": 535 + }, + { + "epoch": 0.06818470932451343, + "ewc_loss": 0.001172671327367425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1726712727977429e-05, + "grad_norm": 2.778421640396118, + "learning_rate": 2.267910131411615e-07, + "loss": 0.5372, + "mean_token_accuracy": 0.8322839140892029, + "num_tokens": 20387613.0, + "step": 536 + }, + { + "epoch": 0.06831191960310393, + "ewc_loss": 0.0011938668321818113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1938668649236206e-05, + "grad_norm": 2.9578161239624023, + "learning_rate": 2.2721492157693936e-07, + "loss": 0.537, + "mean_token_accuracy": 0.8292654752731323, + "num_tokens": 20417579.0, + "step": 537 + }, + { + "epoch": 0.06843912988169444, + "ewc_loss": 0.0012162927305325866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2162927305325866e-05, + "grad_norm": 2.5350124835968018, + "learning_rate": 2.2763883001271726e-07, + "loss": 0.5093, + "mean_token_accuracy": 0.8352062702178955, + "num_tokens": 20455100.0, + "step": 538 + }, + { + "epoch": 0.06856634016028496, + "ewc_loss": 
0.001186623703688383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1866237400681712e-05, + "grad_norm": 2.4754817485809326, + "learning_rate": 2.280627384484951e-07, + "loss": 0.5706, + "mean_token_accuracy": 0.8186227679252625, + "num_tokens": 20496458.0, + "step": 539 + }, + { + "epoch": 0.06869355043887546, + "ewc_loss": 0.0011774278245866299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1774278391385451e-05, + "grad_norm": 3.2189502716064453, + "learning_rate": 2.28486646884273e-07, + "loss": 0.546, + "mean_token_accuracy": 0.8250206708908081, + "num_tokens": 20533758.0, + "step": 540 + }, + { + "epoch": 0.06882076071746597, + "ewc_loss": 0.0012458735145628452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.24587350001093e-05, + "grad_norm": 2.963425636291504, + "learning_rate": 2.2891055532005085e-07, + "loss": 0.5903, + "mean_token_accuracy": 0.8163201808929443, + "num_tokens": 20567259.0, + "step": 541 + }, + { + "epoch": 0.06894797099605648, + "ewc_loss": 0.001244809478521347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2448094821593259e-05, + "grad_norm": 3.0505523681640625, + "learning_rate": 2.2933446375582875e-07, + "loss": 0.5713, + "mean_token_accuracy": 0.8223739862442017, + "num_tokens": 20599333.0, + "step": 542 + }, + { + "epoch": 0.06907518127464699, + "ewc_loss": 0.0012430248316377401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2430247807060368e-05, + "grad_norm": 2.7614924907684326, + "learning_rate": 2.297583721916066e-07, + "loss": 0.5094, + "mean_token_accuracy": 0.8382680416107178, + "num_tokens": 20640195.0, + "step": 543 + }, + { + "epoch": 0.0692023915532375, + "ewc_loss": 0.0012134932912886143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2134933058405295e-05, + "grad_norm": 2.6032931804656982, + "learning_rate": 2.301822806273845e-07, + "loss": 0.5297, + "mean_token_accuracy": 0.8312073945999146, + "num_tokens": 20682617.0, + "step": 544 + }, + { + "epoch": 0.06932960183182801, + "ewc_loss": 
0.0011976452078670263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1976451787631959e-05, + "grad_norm": 2.9132277965545654, + "learning_rate": 2.3060618906316234e-07, + "loss": 0.5806, + "mean_token_accuracy": 0.8209168314933777, + "num_tokens": 20718032.0, + "step": 545 + }, + { + "epoch": 0.06945681211041853, + "ewc_loss": 0.0012291063321754336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2291063285374548e-05, + "grad_norm": 2.9120583534240723, + "learning_rate": 2.3103009749894024e-07, + "loss": 0.5463, + "mean_token_accuracy": 0.8278170824050903, + "num_tokens": 20755190.0, + "step": 546 + }, + { + "epoch": 0.06958402238900903, + "ewc_loss": 0.001234894385561347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.234894443769008e-05, + "grad_norm": 2.449087619781494, + "learning_rate": 2.3145400593471808e-07, + "loss": 0.57, + "mean_token_accuracy": 0.817772626876831, + "num_tokens": 20799088.0, + "step": 547 + }, + { + "epoch": 0.06971123266759954, + "ewc_loss": 0.0011945682344958186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1945682672376279e-05, + "grad_norm": 3.039289951324463, + "learning_rate": 2.3187791437049598e-07, + "loss": 0.5993, + "mean_token_accuracy": 0.8099796175956726, + "num_tokens": 20831819.0, + "step": 548 + }, + { + "epoch": 0.06983844294619006, + "ewc_loss": 0.0012424108572304249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2424108717823401e-05, + "grad_norm": 3.222358465194702, + "learning_rate": 2.3230182280627383e-07, + "loss": 0.529, + "mean_token_accuracy": 0.8333526849746704, + "num_tokens": 20861291.0, + "step": 549 + }, + { + "epoch": 0.06996565322478056, + "ewc_loss": 0.0012763524428009987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2763524864567444e-05, + "grad_norm": 2.645143985748291, + "learning_rate": 2.327257312420517e-07, + "loss": 0.5066, + "mean_token_accuracy": 0.840904176235199, + "num_tokens": 20897446.0, + "step": 550 + }, + { + "epoch": 0.07009286350337107, + "ewc_loss": 
0.0012287896825000644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2287897334317677e-05, + "grad_norm": 2.9278080463409424, + "learning_rate": 2.3314963967782957e-07, + "loss": 0.5298, + "mean_token_accuracy": 0.8273531198501587, + "num_tokens": 20934423.0, + "step": 551 + }, + { + "epoch": 0.07022007378196159, + "ewc_loss": 0.0012409393675625324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2409394003043417e-05, + "grad_norm": 2.691016435623169, + "learning_rate": 2.3357354811360745e-07, + "loss": 0.5492, + "mean_token_accuracy": 0.8289259672164917, + "num_tokens": 20978394.0, + "step": 552 + }, + { + "epoch": 0.07034728406055209, + "ewc_loss": 0.001226406660862267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2264066754141822e-05, + "grad_norm": 3.0572028160095215, + "learning_rate": 2.3399745654938532e-07, + "loss": 0.5736, + "mean_token_accuracy": 0.8201910257339478, + "num_tokens": 21010669.0, + "step": 553 + }, + { + "epoch": 0.0704744943391426, + "ewc_loss": 0.0012546735815703869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2546735888463445e-05, + "grad_norm": 2.6133248805999756, + "learning_rate": 2.344213649851632e-07, + "loss": 0.5433, + "mean_token_accuracy": 0.8300507068634033, + "num_tokens": 21047696.0, + "step": 554 + }, + { + "epoch": 0.07060170461773312, + "ewc_loss": 0.0012252925662323833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2252925444045104e-05, + "grad_norm": 2.489053964614868, + "learning_rate": 2.3484527342094106e-07, + "loss": 0.6241, + "mean_token_accuracy": 0.8076022863388062, + "num_tokens": 21091437.0, + "step": 555 + }, + { + "epoch": 0.07072891489632362, + "ewc_loss": 0.0012136447476223111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2136447367083747e-05, + "grad_norm": 2.5994503498077393, + "learning_rate": 2.3526918185671894e-07, + "loss": 0.5849, + "mean_token_accuracy": 0.816792905330658, + "num_tokens": 21128880.0, + "step": 556 + }, + { + "epoch": 0.07085612517491413, + "ewc_loss": 
0.0012310949387028813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2310949387028813e-05, + "grad_norm": 2.7388663291931152, + "learning_rate": 2.356930902924968e-07, + "loss": 0.5359, + "mean_token_accuracy": 0.8309779167175293, + "num_tokens": 21168523.0, + "step": 557 + }, + { + "epoch": 0.07098333545350465, + "ewc_loss": 0.0012524432968348265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.252443325938657e-05, + "grad_norm": 2.692720890045166, + "learning_rate": 2.3611699872827468e-07, + "loss": 0.6212, + "mean_token_accuracy": 0.8040759563446045, + "num_tokens": 21211376.0, + "step": 558 + }, + { + "epoch": 0.07111054573209516, + "ewc_loss": 0.0012515898561105132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2515898561105132e-05, + "grad_norm": 2.6083786487579346, + "learning_rate": 2.3654090716405255e-07, + "loss": 0.5557, + "mean_token_accuracy": 0.8242285251617432, + "num_tokens": 21249162.0, + "step": 559 + }, + { + "epoch": 0.07123775601068566, + "ewc_loss": 0.0012462573358789086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2462573067750782e-05, + "grad_norm": 2.896536111831665, + "learning_rate": 2.3696481559983043e-07, + "loss": 0.5807, + "mean_token_accuracy": 0.8163899183273315, + "num_tokens": 21282351.0, + "step": 560 + }, + { + "epoch": 0.07136496628927617, + "ewc_loss": 0.0012747800210490823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.274779970117379e-05, + "grad_norm": 2.585752248764038, + "learning_rate": 2.373887240356083e-07, + "loss": 0.5722, + "mean_token_accuracy": 0.8195812702178955, + "num_tokens": 21319508.0, + "step": 561 + }, + { + "epoch": 0.07149217656786669, + "ewc_loss": 0.0012533878907561302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2533879271359183e-05, + "grad_norm": 2.6084699630737305, + "learning_rate": 2.3781263247138617e-07, + "loss": 0.5374, + "mean_token_accuracy": 0.8313350081443787, + "num_tokens": 21357111.0, + "step": 562 + }, + { + "epoch": 0.07161938684645719, + "ewc_loss": 
0.0012530435342341661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.253043592441827e-05, + "grad_norm": 2.859823226928711, + "learning_rate": 2.3823654090716404e-07, + "loss": 0.5388, + "mean_token_accuracy": 0.8258234262466431, + "num_tokens": 21393222.0, + "step": 563 + }, + { + "epoch": 0.0717465971250477, + "ewc_loss": 0.0012838776456192136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2838776456192136e-05, + "grad_norm": 2.5805699825286865, + "learning_rate": 2.386604493429419e-07, + "loss": 0.4942, + "mean_token_accuracy": 0.8395442366600037, + "num_tokens": 21429595.0, + "step": 564 + }, + { + "epoch": 0.07187380740363822, + "ewc_loss": 0.0012686079135164618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2686079571722075e-05, + "grad_norm": 2.8011114597320557, + "learning_rate": 2.390843577787198e-07, + "loss": 0.589, + "mean_token_accuracy": 0.8101036548614502, + "num_tokens": 21467341.0, + "step": 565 + }, + { + "epoch": 0.07200101768222872, + "ewc_loss": 0.0012868078192695975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2868078556493856e-05, + "grad_norm": 2.392224073410034, + "learning_rate": 2.3950826621449766e-07, + "loss": 0.4689, + "mean_token_accuracy": 0.850894570350647, + "num_tokens": 21510927.0, + "step": 566 + }, + { + "epoch": 0.07212822796081923, + "ewc_loss": 0.0012567852390930057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2567852536449209e-05, + "grad_norm": 2.6164608001708984, + "learning_rate": 2.3993217465027556e-07, + "loss": 0.543, + "mean_token_accuracy": 0.828840970993042, + "num_tokens": 21551604.0, + "step": 567 + }, + { + "epoch": 0.07225543823940975, + "ewc_loss": 0.0012793181231245399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2793180758308154e-05, + "grad_norm": 3.099656105041504, + "learning_rate": 2.403560830860534e-07, + "loss": 0.5931, + "mean_token_accuracy": 0.8166974782943726, + "num_tokens": 21591871.0, + "step": 568 + }, + { + "epoch": 0.07238264851800025, + "ewc_loss": 
0.0013354859547689557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3354859220271464e-05, + "grad_norm": 2.9759557247161865, + "learning_rate": 2.4077999152183125e-07, + "loss": 0.5909, + "mean_token_accuracy": 0.8134573101997375, + "num_tokens": 21629810.0, + "step": 569 + }, + { + "epoch": 0.07250985879659076, + "ewc_loss": 0.0013320271391421556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.332027113676304e-05, + "grad_norm": 2.5458738803863525, + "learning_rate": 2.4120389995760915e-07, + "loss": 0.4897, + "mean_token_accuracy": 0.8466179370880127, + "num_tokens": 21668706.0, + "step": 570 + }, + { + "epoch": 0.07263706907518128, + "ewc_loss": 0.0012783249840140343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2783249985659495e-05, + "grad_norm": 2.5814054012298584, + "learning_rate": 2.41627808393387e-07, + "loss": 0.5324, + "mean_token_accuracy": 0.8286839127540588, + "num_tokens": 21712725.0, + "step": 571 + }, + { + "epoch": 0.07276427935377179, + "ewc_loss": 0.0012764346320182085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2764346138283145e-05, + "grad_norm": 2.6755483150482178, + "learning_rate": 2.420517168291649e-07, + "loss": 0.5436, + "mean_token_accuracy": 0.8308561444282532, + "num_tokens": 21746086.0, + "step": 572 + }, + { + "epoch": 0.07289148963236229, + "ewc_loss": 0.0012974456185474992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2974455785297323e-05, + "grad_norm": 2.728407859802246, + "learning_rate": 2.4247562526494274e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8365875482559204, + "num_tokens": 21779573.0, + "step": 573 + }, + { + "epoch": 0.0730186999109528, + "ewc_loss": 0.0013110419968143106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3110419786244165e-05, + "grad_norm": 2.541550874710083, + "learning_rate": 2.4289953370072064e-07, + "loss": 0.5749, + "mean_token_accuracy": 0.8173781633377075, + "num_tokens": 21818766.0, + "step": 574 + }, + { + "epoch": 0.07314591018954332, + "ewc_loss": 
0.001293811947107315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2938119652972091e-05, + "grad_norm": 2.4769442081451416, + "learning_rate": 2.433234421364985e-07, + "loss": 0.5305, + "mean_token_accuracy": 0.8343767523765564, + "num_tokens": 21855539.0, + "step": 575 + }, + { + "epoch": 0.07327312046813382, + "ewc_loss": 0.0012917316053062677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2917315871163737e-05, + "grad_norm": 2.450955629348755, + "learning_rate": 2.437473505722764e-07, + "loss": 0.4979, + "mean_token_accuracy": 0.8390215039253235, + "num_tokens": 21892567.0, + "step": 576 + }, + { + "epoch": 0.07340033074672433, + "ewc_loss": 0.00129495479632169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.294954836339457e-05, + "grad_norm": 2.640383243560791, + "learning_rate": 2.4417125900805423e-07, + "loss": 0.5152, + "mean_token_accuracy": 0.8285588026046753, + "num_tokens": 21928607.0, + "step": 577 + }, + { + "epoch": 0.07352754102531485, + "ewc_loss": 0.0013196864165365696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3196864529163577e-05, + "grad_norm": 2.678494453430176, + "learning_rate": 2.4459516744383213e-07, + "loss": 0.5623, + "mean_token_accuracy": 0.822858452796936, + "num_tokens": 21963781.0, + "step": 578 + }, + { + "epoch": 0.07365475130390535, + "ewc_loss": 0.0013283930020406842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3283930456964299e-05, + "grad_norm": 2.6278843879699707, + "learning_rate": 2.4501907587961e-07, + "loss": 0.5227, + "mean_token_accuracy": 0.8343148231506348, + "num_tokens": 21999140.0, + "step": 579 + }, + { + "epoch": 0.07378196158249586, + "ewc_loss": 0.0013272454962134361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3272454452817328e-05, + "grad_norm": 2.401930093765259, + "learning_rate": 2.454429843153879e-07, + "loss": 0.6212, + "mean_token_accuracy": 0.8029935359954834, + "num_tokens": 22038703.0, + "step": 580 + }, + { + "epoch": 0.07390917186108638, + "ewc_loss": 0.0013032429851591587, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3032429706072435e-05, + "grad_norm": 2.609412670135498, + "learning_rate": 2.458668927511657e-07, + "loss": 0.5363, + "mean_token_accuracy": 0.8223373889923096, + "num_tokens": 22071052.0, + "step": 581 + }, + { + "epoch": 0.07403638213967688, + "ewc_loss": 0.0013286963803693652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3286963621794712e-05, + "grad_norm": 2.468928098678589, + "learning_rate": 2.462908011869436e-07, + "loss": 0.5014, + "mean_token_accuracy": 0.8404667377471924, + "num_tokens": 22109629.0, + "step": 582 + }, + { + "epoch": 0.0741635924182674, + "ewc_loss": 0.001321609248407185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3216092156653758e-05, + "grad_norm": 2.435603618621826, + "learning_rate": 2.4671470962272147e-07, + "loss": 0.5135, + "mean_token_accuracy": 0.8375140428543091, + "num_tokens": 22151544.0, + "step": 583 + }, + { + "epoch": 0.07429080269685791, + "ewc_loss": 0.001318479422479868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3184793715481646e-05, + "grad_norm": 2.5559275150299072, + "learning_rate": 2.4713861805849937e-07, + "loss": 0.4911, + "mean_token_accuracy": 0.8405725955963135, + "num_tokens": 22189241.0, + "step": 584 + }, + { + "epoch": 0.07441801297544842, + "ewc_loss": 0.0013355884002521634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3355884220800363e-05, + "grad_norm": 2.418593406677246, + "learning_rate": 2.475625264942772e-07, + "loss": 0.5068, + "mean_token_accuracy": 0.8369249701499939, + "num_tokens": 22229138.0, + "step": 585 + }, + { + "epoch": 0.07454522325403892, + "ewc_loss": 0.0013273502700030804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.327350219071377e-05, + "grad_norm": 2.615556478500366, + "learning_rate": 2.479864349300551e-07, + "loss": 0.5443, + "mean_token_accuracy": 0.8286646604537964, + "num_tokens": 22264930.0, + "step": 586 + }, + { + "epoch": 0.07467243353262944, + "ewc_loss": 0.0013493993319571018, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 1.3493993719748687e-05, + "grad_norm": 2.675445556640625, + "learning_rate": 2.4841034336583296e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.837760329246521, + "num_tokens": 22297162.0, + "step": 587 + }, + { + "epoch": 0.07479964381121995, + "ewc_loss": 0.0013594480697065592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3594481060863473e-05, + "grad_norm": 2.610403060913086, + "learning_rate": 2.488342518016108e-07, + "loss": 0.5695, + "mean_token_accuracy": 0.8161824941635132, + "num_tokens": 22334779.0, + "step": 588 + }, + { + "epoch": 0.07492685408981045, + "ewc_loss": 0.0013555146288126707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3555146324506495e-05, + "grad_norm": 2.510127544403076, + "learning_rate": 2.492581602373887e-07, + "loss": 0.5281, + "mean_token_accuracy": 0.8332091569900513, + "num_tokens": 22372820.0, + "step": 589 + }, + { + "epoch": 0.07505406436840097, + "ewc_loss": 0.001342455972917378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3424560165731236e-05, + "grad_norm": 2.463989019393921, + "learning_rate": 2.4968206867316655e-07, + "loss": 0.548, + "mean_token_accuracy": 0.825435996055603, + "num_tokens": 22417841.0, + "step": 590 + }, + { + "epoch": 0.07518127464699148, + "ewc_loss": 0.0013433460844680667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3433461390377488e-05, + "grad_norm": 2.5424301624298096, + "learning_rate": 2.5010597710894445e-07, + "loss": 0.4633, + "mean_token_accuracy": 0.8522456884384155, + "num_tokens": 22449133.0, + "step": 591 + }, + { + "epoch": 0.07530848492558198, + "ewc_loss": 0.0013519887579604983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3519887033908162e-05, + "grad_norm": 2.458961009979248, + "learning_rate": 2.505298855447223e-07, + "loss": 0.5282, + "mean_token_accuracy": 0.8328754305839539, + "num_tokens": 22491145.0, + "step": 592 + }, + { + "epoch": 0.0754356952041725, + "ewc_loss": 0.0013468684628605843, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.3468685210682452e-05, + "grad_norm": 2.472369432449341, + "learning_rate": 2.509537939805002e-07, + "loss": 0.5402, + "mean_token_accuracy": 0.8303582668304443, + "num_tokens": 22531738.0, + "step": 593 + }, + { + "epoch": 0.07556290548276301, + "ewc_loss": 0.001349332625977695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3493326150637586e-05, + "grad_norm": 2.871126413345337, + "learning_rate": 2.513777024162781e-07, + "loss": 0.5199, + "mean_token_accuracy": 0.8366304636001587, + "num_tokens": 22563005.0, + "step": 594 + }, + { + "epoch": 0.07569011576135352, + "ewc_loss": 0.0013927801046520472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3927800864621531e-05, + "grad_norm": 2.320620059967041, + "learning_rate": 2.5180161085205594e-07, + "loss": 0.5021, + "mean_token_accuracy": 0.8405348658561707, + "num_tokens": 22601333.0, + "step": 595 + }, + { + "epoch": 0.07581732603994402, + "ewc_loss": 0.0013413206906989217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3413206943369005e-05, + "grad_norm": 2.4942095279693604, + "learning_rate": 2.522255192878338e-07, + "loss": 0.5529, + "mean_token_accuracy": 0.8238440155982971, + "num_tokens": 22639528.0, + "step": 596 + }, + { + "epoch": 0.07594453631853454, + "ewc_loss": 0.0013556807534769177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3556807971326634e-05, + "grad_norm": 2.4583988189697266, + "learning_rate": 2.526494277236117e-07, + "loss": 0.5363, + "mean_token_accuracy": 0.8293888568878174, + "num_tokens": 22675290.0, + "step": 597 + }, + { + "epoch": 0.07607174659712505, + "ewc_loss": 0.0013640422839671373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.364042327622883e-05, + "grad_norm": 2.616544246673584, + "learning_rate": 2.530733361593896e-07, + "loss": 0.5127, + "mean_token_accuracy": 0.834816038608551, + "num_tokens": 22712823.0, + "step": 598 + }, + { + "epoch": 0.07619895687571555, + "ewc_loss": 0.0013878755271434784, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.3878755453333724e-05, + "grad_norm": 2.5493781566619873, + "learning_rate": 2.5349724459516743e-07, + "loss": 0.5351, + "mean_token_accuracy": 0.8303008079528809, + "num_tokens": 22750916.0, + "step": 599 + }, + { + "epoch": 0.07632616715430607, + "ewc_loss": 0.001380794681608677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3807946743327193e-05, + "grad_norm": 2.64862322807312, + "learning_rate": 2.539211530309453e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.8417129516601562, + "num_tokens": 22782537.0, + "step": 600 + }, + { + "epoch": 0.07645337743289658, + "ewc_loss": 0.0013880442129448056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3880442566005513e-05, + "grad_norm": 2.8363471031188965, + "learning_rate": 2.543450614667232e-07, + "loss": 0.5762, + "mean_token_accuracy": 0.8188474774360657, + "num_tokens": 22814805.0, + "step": 601 + }, + { + "epoch": 0.07658058771148708, + "ewc_loss": 0.00141177698969841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4117769751464948e-05, + "grad_norm": 2.6409952640533447, + "learning_rate": 2.547689699025011e-07, + "loss": 0.5752, + "mean_token_accuracy": 0.8171258568763733, + "num_tokens": 22852654.0, + "step": 602 + }, + { + "epoch": 0.0767077979900776, + "ewc_loss": 0.001391201512888074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3912014765082859e-05, + "grad_norm": 2.510443925857544, + "learning_rate": 2.551928783382789e-07, + "loss": 0.5283, + "mean_token_accuracy": 0.8291792869567871, + "num_tokens": 22894205.0, + "step": 603 + }, + { + "epoch": 0.07683500826866811, + "ewc_loss": 0.0013694263761863112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3694263543584384e-05, + "grad_norm": 2.5695486068725586, + "learning_rate": 2.5561678677405677e-07, + "loss": 0.4951, + "mean_token_accuracy": 0.8401399254798889, + "num_tokens": 22924095.0, + "step": 604 + }, + { + "epoch": 0.07696221854725861, + "ewc_loss": 0.0013799798907712102, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.379979858029401e-05, + "grad_norm": 2.42254376411438, + "learning_rate": 2.5604069520983467e-07, + "loss": 0.5766, + "mean_token_accuracy": 0.8144761919975281, + "num_tokens": 22960857.0, + "step": 605 + }, + { + "epoch": 0.07708942882584913, + "ewc_loss": 0.0013748586643487215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.374858675262658e-05, + "grad_norm": 2.4063634872436523, + "learning_rate": 2.564646036456125e-07, + "loss": 0.5307, + "mean_token_accuracy": 0.8280526399612427, + "num_tokens": 23004670.0, + "step": 606 + }, + { + "epoch": 0.07721663910443964, + "ewc_loss": 0.00137111428193748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3711142855754588e-05, + "grad_norm": 2.6131277084350586, + "learning_rate": 2.568885120813904e-07, + "loss": 0.5298, + "mean_token_accuracy": 0.8285760879516602, + "num_tokens": 23041999.0, + "step": 607 + }, + { + "epoch": 0.07734384938303016, + "ewc_loss": 0.0014015276683494449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4015276974532753e-05, + "grad_norm": 2.5093491077423096, + "learning_rate": 2.5731242051716826e-07, + "loss": 0.5188, + "mean_token_accuracy": 0.8362894058227539, + "num_tokens": 23080704.0, + "step": 608 + }, + { + "epoch": 0.07747105966162066, + "ewc_loss": 0.0013918582117184997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.391858222632436e-05, + "grad_norm": 2.418802499771118, + "learning_rate": 2.5773632895294616e-07, + "loss": 0.5537, + "mean_token_accuracy": 0.8255266547203064, + "num_tokens": 23118662.0, + "step": 609 + }, + { + "epoch": 0.07759826994021117, + "ewc_loss": 0.0013773064129054546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3773063983535394e-05, + "grad_norm": 2.5398030281066895, + "learning_rate": 2.58160237388724e-07, + "loss": 0.4766, + "mean_token_accuracy": 0.8434144854545593, + "num_tokens": 23150367.0, + "step": 610 + }, + { + "epoch": 0.07772548021880168, + "ewc_loss": 0.001402224414050579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4022243703948334e-05, + "grad_norm": 2.5684704780578613, + "learning_rate": 2.585841458245019e-07, + "loss": 0.5347, + "mean_token_accuracy": 0.8284273147583008, + "num_tokens": 23185836.0, + "step": 611 + }, + { + "epoch": 0.07785269049739219, + "ewc_loss": 0.0014145087916404009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.41450873343274e-05, + "grad_norm": 2.4651901721954346, + "learning_rate": 2.5900805426027975e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.8464263677597046, + "num_tokens": 23219346.0, + "step": 612 + }, + { + "epoch": 0.0779799007759827, + "ewc_loss": 0.0014043112751096487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4043112969375215e-05, + "grad_norm": 2.4872548580169678, + "learning_rate": 2.5943196269605765e-07, + "loss": 0.546, + "mean_token_accuracy": 0.8315510153770447, + "num_tokens": 23257569.0, + "step": 613 + }, + { + "epoch": 0.07810711105457321, + "ewc_loss": 0.0014103353023529053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4103352441452444e-05, + "grad_norm": 2.2942276000976562, + "learning_rate": 2.598558711318355e-07, + "loss": 0.5384, + "mean_token_accuracy": 0.8308298587799072, + "num_tokens": 23302944.0, + "step": 614 + }, + { + "epoch": 0.07823432133316371, + "ewc_loss": 0.0013922930229455233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3922930520493537e-05, + "grad_norm": 2.5442118644714355, + "learning_rate": 2.602797795676134e-07, + "loss": 0.5446, + "mean_token_accuracy": 0.827418327331543, + "num_tokens": 23341660.0, + "step": 615 + }, + { + "epoch": 0.07836153161175423, + "ewc_loss": 0.0014290014514699578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4290014405560214e-05, + "grad_norm": 2.5680124759674072, + "learning_rate": 2.6070368800339124e-07, + "loss": 0.5395, + "mean_token_accuracy": 0.8236299753189087, + "num_tokens": 23375956.0, + "step": 616 + }, + { + "epoch": 0.07848874189034474, + "ewc_loss": 0.0014393386663869023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4393386663869023e-05, + "grad_norm": 2.5253520011901855, + "learning_rate": 2.6112759643916914e-07, + "loss": 0.4932, + "mean_token_accuracy": 0.8406787514686584, + "num_tokens": 23407568.0, + "step": 617 + }, + { + "epoch": 0.07861595216893524, + "ewc_loss": 0.0014325021766126156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4325021766126156e-05, + "grad_norm": 2.6157469749450684, + "learning_rate": 2.61551504874947e-07, + "loss": 0.5241, + "mean_token_accuracy": 0.8323985934257507, + "num_tokens": 23440261.0, + "step": 618 + }, + { + "epoch": 0.07874316244752576, + "ewc_loss": 0.0014427552232518792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4427552741835825e-05, + "grad_norm": 2.4770545959472656, + "learning_rate": 2.619754133107249e-07, + "loss": 0.5084, + "mean_token_accuracy": 0.8384293913841248, + "num_tokens": 23481706.0, + "step": 619 + }, + { + "epoch": 0.07887037272611627, + "ewc_loss": 0.001427097013220191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4270969586505089e-05, + "grad_norm": 2.6537251472473145, + "learning_rate": 2.623993217465028e-07, + "loss": 0.5679, + "mean_token_accuracy": 0.8163504004478455, + "num_tokens": 23515410.0, + "step": 620 + }, + { + "epoch": 0.07899758300470679, + "ewc_loss": 0.0014498431701213121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4498431482934393e-05, + "grad_norm": 2.4627177715301514, + "learning_rate": 2.6282323018228063e-07, + "loss": 0.5511, + "mean_token_accuracy": 0.828664243221283, + "num_tokens": 23554627.0, + "step": 621 + }, + { + "epoch": 0.07912479328329729, + "ewc_loss": 0.001431173994205892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4311740414996166e-05, + "grad_norm": 2.4460880756378174, + "learning_rate": 2.632471386180585e-07, + "loss": 0.5751, + "mean_token_accuracy": 0.8167243003845215, + "num_tokens": 23593400.0, + "step": 622 + }, + { + "epoch": 0.0792520035618878, + "ewc_loss": 0.001426883041858673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4268830454966519e-05, + "grad_norm": 2.551389455795288, + "learning_rate": 2.6367104705383637e-07, + "loss": 0.5718, + "mean_token_accuracy": 0.8174570202827454, + "num_tokens": 23628242.0, + "step": 623 + }, + { + "epoch": 0.07937921384047832, + "ewc_loss": 0.001445966656319797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4459666090260725e-05, + "grad_norm": 2.341034412384033, + "learning_rate": 2.6409495548961427e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.8259826898574829, + "num_tokens": 23671041.0, + "step": 624 + }, + { + "epoch": 0.07950642411906882, + "ewc_loss": 0.00142633484210819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4263348930398934e-05, + "grad_norm": 2.5885655879974365, + "learning_rate": 2.6451886392539206e-07, + "loss": 0.5356, + "mean_token_accuracy": 0.8323543667793274, + "num_tokens": 23701904.0, + "step": 625 + }, + { + "epoch": 0.07963363439765933, + "ewc_loss": 0.0014548979233950377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4548979379469529e-05, + "grad_norm": 2.384702444076538, + "learning_rate": 2.6494277236116996e-07, + "loss": 0.501, + "mean_token_accuracy": 0.8391281366348267, + "num_tokens": 23742374.0, + "step": 626 + }, + { + "epoch": 0.07976084467624985, + "ewc_loss": 0.0014367018593475223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4367018593475223e-05, + "grad_norm": 2.5587871074676514, + "learning_rate": 2.6536668079694786e-07, + "loss": 0.6049, + "mean_token_accuracy": 0.8099923729896545, + "num_tokens": 23780216.0, + "step": 627 + }, + { + "epoch": 0.07988805495484035, + "ewc_loss": 0.0014553496148437262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4553495930158533e-05, + "grad_norm": 2.5737643241882324, + "learning_rate": 2.6579058923272576e-07, + "loss": 0.4604, + "mean_token_accuracy": 0.8521363735198975, + "num_tokens": 23813641.0, + "step": 628 + }, + { + "epoch": 0.08001526523343086, + "ewc_loss": 0.0014652721583843231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.465272180212196e-05, + "grad_norm": 2.5034780502319336, + "learning_rate": 2.6621449766850356e-07, + "loss": 0.5447, + "mean_token_accuracy": 0.8272180557250977, + "num_tokens": 23850189.0, + "step": 629 + }, + { + "epoch": 0.08014247551202137, + "ewc_loss": 0.0014539837138727307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4539837138727307e-05, + "grad_norm": 2.4137556552886963, + "learning_rate": 2.6663840610428145e-07, + "loss": 0.5005, + "mean_token_accuracy": 0.8389406800270081, + "num_tokens": 23884410.0, + "step": 630 + }, + { + "epoch": 0.08026968579061187, + "ewc_loss": 0.0014446551213040948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.444655117666116e-05, + "grad_norm": 2.308623790740967, + "learning_rate": 2.6706231454005935e-07, + "loss": 0.5084, + "mean_token_accuracy": 0.8376753330230713, + "num_tokens": 23930139.0, + "step": 631 + }, + { + "epoch": 0.08039689606920239, + "ewc_loss": 0.0014378969790413976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4378969353856519e-05, + "grad_norm": 2.2724244594573975, + "learning_rate": 2.6748622297583725e-07, + "loss": 0.5078, + "mean_token_accuracy": 0.8376502990722656, + "num_tokens": 23973124.0, + "step": 632 + }, + { + "epoch": 0.0805241063477929, + "ewc_loss": 0.0014412745367735624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.441274525859626e-05, + "grad_norm": 2.4110710620880127, + "learning_rate": 2.6791013141161505e-07, + "loss": 0.5154, + "mean_token_accuracy": 0.8373228311538696, + "num_tokens": 24008676.0, + "step": 633 + }, + { + "epoch": 0.08065131662638342, + "ewc_loss": 0.001466399640776217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4663996807939839e-05, + "grad_norm": 2.580592155456543, + "learning_rate": 2.6833403984739294e-07, + "loss": 0.5158, + "mean_token_accuracy": 0.8346357941627502, + "num_tokens": 24044880.0, + "step": 634 + }, + { + "epoch": 0.08077852690497392, + "ewc_loss": 0.001491381088271737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4913810446159914e-05, + "grad_norm": 2.2741241455078125, + "learning_rate": 2.6875794828317084e-07, + "loss": 0.5056, + "mean_token_accuracy": 0.8415565490722656, + "num_tokens": 24090140.0, + "step": 635 + }, + { + "epoch": 0.08090573718356443, + "ewc_loss": 0.0014469919260591269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4469918824033812e-05, + "grad_norm": 2.5341975688934326, + "learning_rate": 2.6918185671894874e-07, + "loss": 0.5882, + "mean_token_accuracy": 0.8170017004013062, + "num_tokens": 24123311.0, + "step": 636 + }, + { + "epoch": 0.08103294746215495, + "ewc_loss": 0.001484755310229957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4847552847641055e-05, + "grad_norm": 2.5142712593078613, + "learning_rate": 2.6960576515472654e-07, + "loss": 0.6226, + "mean_token_accuracy": 0.8027646541595459, + "num_tokens": 24164019.0, + "step": 637 + }, + { + "epoch": 0.08116015774074545, + "ewc_loss": 0.0014900052919983864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4900052519806195e-05, + "grad_norm": 2.4194495677948, + "learning_rate": 2.7002967359050443e-07, + "loss": 0.49, + "mean_token_accuracy": 0.843230128288269, + "num_tokens": 24205986.0, + "step": 638 + }, + { + "epoch": 0.08128736801933596, + "ewc_loss": 0.0014738943427801132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4738943718839437e-05, + "grad_norm": 2.354686737060547, + "learning_rate": 2.7045358202628233e-07, + "loss": 0.5466, + "mean_token_accuracy": 0.8224931955337524, + "num_tokens": 24251956.0, + "step": 639 + }, + { + "epoch": 0.08141457829792648, + "ewc_loss": 0.001464642584323883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4646426279796287e-05, + "grad_norm": 2.4434573650360107, + "learning_rate": 2.7087749046206023e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8391869068145752, + "num_tokens": 24284990.0, + "step": 640 + }, + { + "epoch": 0.08154178857651698, + "ewc_loss": 0.0014833102468401194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4833101886324584e-05, + "grad_norm": 2.468658685684204, + "learning_rate": 2.71301398897838e-07, + "loss": 0.5501, + "mean_token_accuracy": 0.8243846893310547, + "num_tokens": 24323305.0, + "step": 641 + }, + { + "epoch": 0.08166899885510749, + "ewc_loss": 0.001487196423113346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.487196368543664e-05, + "grad_norm": 2.2703051567077637, + "learning_rate": 2.717253073336159e-07, + "loss": 0.4624, + "mean_token_accuracy": 0.8495409488677979, + "num_tokens": 24365847.0, + "step": 642 + }, + { + "epoch": 0.081796209133698, + "ewc_loss": 0.001457729609683156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4577295587514527e-05, + "grad_norm": 2.3960063457489014, + "learning_rate": 2.721492157693938e-07, + "loss": 0.4686, + "mean_token_accuracy": 0.8478146195411682, + "num_tokens": 24407269.0, + "step": 643 + }, + { + "epoch": 0.0819234194122885, + "ewc_loss": 0.0014800699427723885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.480069931858452e-05, + "grad_norm": 2.429086923599243, + "learning_rate": 2.7257312420517167e-07, + "loss": 0.5241, + "mean_token_accuracy": 0.8320445418357849, + "num_tokens": 24444990.0, + "step": 644 + }, + { + "epoch": 0.08205062969087902, + "ewc_loss": 0.0014896293869242072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4896293578203768e-05, + "grad_norm": 2.3448326587677, + "learning_rate": 2.729970326409495e-07, + "loss": 0.5535, + "mean_token_accuracy": 0.8264370560646057, + "num_tokens": 24487293.0, + "step": 645 + }, + { + "epoch": 0.08217783996946953, + "ewc_loss": 0.0014779680641368032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4779680896026548e-05, + "grad_norm": 2.532034397125244, + "learning_rate": 2.734209410767274e-07, + "loss": 0.5162, + "mean_token_accuracy": 0.8356043696403503, + "num_tokens": 24521403.0, + "step": 646 + }, + { + "epoch": 0.08230505024806005, + "ewc_loss": 0.0015039005083963275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5039005120343063e-05, + 
"grad_norm": 2.4176435470581055, + "learning_rate": 2.738448495125053e-07, + "loss": 0.527, + "mean_token_accuracy": 0.8343867063522339, + "num_tokens": 24562740.0, + "step": 647 + }, + { + "epoch": 0.08243226052665055, + "ewc_loss": 0.0014938567765057087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4938567801436875e-05, + "grad_norm": 2.53631591796875, + "learning_rate": 2.7426875794828316e-07, + "loss": 0.5329, + "mean_token_accuracy": 0.8263692855834961, + "num_tokens": 24597507.0, + "step": 648 + }, + { + "epoch": 0.08255947080524106, + "ewc_loss": 0.0015089198714122176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5089198313944507e-05, + "grad_norm": 2.439741849899292, + "learning_rate": 2.74692666384061e-07, + "loss": 0.4954, + "mean_token_accuracy": 0.8383785486221313, + "num_tokens": 24635629.0, + "step": 649 + }, + { + "epoch": 0.08268668108383158, + "ewc_loss": 0.0014945196453481913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4945196198823396e-05, + "grad_norm": 2.3907580375671387, + "learning_rate": 2.751165748198389e-07, + "loss": 0.5211, + "mean_token_accuracy": 0.8318799138069153, + "num_tokens": 24676087.0, + "step": 650 + }, + { + "epoch": 0.08281389136242208, + "ewc_loss": 0.001491634757257998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.491634793637786e-05, + "grad_norm": 2.575327157974243, + "learning_rate": 2.755404832556168e-07, + "loss": 0.5375, + "mean_token_accuracy": 0.8276834487915039, + "num_tokens": 24713652.0, + "step": 651 + }, + { + "epoch": 0.0829411016410126, + "ewc_loss": 0.00152318284381181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5231828001560643e-05, + "grad_norm": 2.4719574451446533, + "learning_rate": 2.7596439169139465e-07, + "loss": 0.5208, + "mean_token_accuracy": 0.8344995379447937, + "num_tokens": 24750130.0, + "step": 652 + }, + { + "epoch": 0.08306831191960311, + "ewc_loss": 0.001512598362751305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5125983736652415e-05, + "grad_norm": 
2.4407660961151123, + "learning_rate": 2.763883001271725e-07, + "loss": 0.5699, + "mean_token_accuracy": 0.8184343576431274, + "num_tokens": 24787337.0, + "step": 653 + }, + { + "epoch": 0.08319552219819361, + "ewc_loss": 0.001503422623500228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5034226635179948e-05, + "grad_norm": 2.4928274154663086, + "learning_rate": 2.768122085629504e-07, + "loss": 0.562, + "mean_token_accuracy": 0.8151668310165405, + "num_tokens": 24822736.0, + "step": 654 + }, + { + "epoch": 0.08332273247678412, + "ewc_loss": 0.001513009425252676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5130093743209727e-05, + "grad_norm": 2.4512722492218018, + "learning_rate": 2.772361169987283e-07, + "loss": 0.505, + "mean_token_accuracy": 0.8411670923233032, + "num_tokens": 24858743.0, + "step": 655 + }, + { + "epoch": 0.08344994275537464, + "ewc_loss": 0.001510715577751398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5107155377336312e-05, + "grad_norm": 2.372929334640503, + "learning_rate": 2.7766002543450614e-07, + "loss": 0.513, + "mean_token_accuracy": 0.8327528238296509, + "num_tokens": 24901286.0, + "step": 656 + }, + { + "epoch": 0.08357715303396514, + "ewc_loss": 0.00149902468547225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4990247109381016e-05, + "grad_norm": 2.3562121391296387, + "learning_rate": 2.78083933870284e-07, + "loss": 0.5798, + "mean_token_accuracy": 0.8170446753501892, + "num_tokens": 24943457.0, + "step": 657 + }, + { + "epoch": 0.08370436331255565, + "ewc_loss": 0.001500551006756723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.500551024946617e-05, + "grad_norm": 2.437864065170288, + "learning_rate": 2.785078423060619e-07, + "loss": 0.5205, + "mean_token_accuracy": 0.8304308652877808, + "num_tokens": 24979247.0, + "step": 658 + }, + { + "epoch": 0.08383157359114617, + "ewc_loss": 0.0015174071304500103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5174071450019255e-05, + "grad_norm": 2.4205493927001953, + 
"learning_rate": 2.789317507418398e-07, + "loss": 0.5343, + "mean_token_accuracy": 0.8288154006004333, + "num_tokens": 25017456.0, + "step": 659 + }, + { + "epoch": 0.08395878386973668, + "ewc_loss": 0.001520494814030826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5204947885649744e-05, + "grad_norm": 2.4555020332336426, + "learning_rate": 2.7935565917761763e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.8402173519134521, + "num_tokens": 25054682.0, + "step": 660 + }, + { + "epoch": 0.08408599414832718, + "ewc_loss": 0.0015246177790686488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5246178008965217e-05, + "grad_norm": 2.6297836303710938, + "learning_rate": 2.797795676133955e-07, + "loss": 0.5624, + "mean_token_accuracy": 0.8190152645111084, + "num_tokens": 25092501.0, + "step": 661 + }, + { + "epoch": 0.0842132044269177, + "ewc_loss": 0.001549042877741158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5490428268094547e-05, + "grad_norm": 2.420660972595215, + "learning_rate": 2.802034760491734e-07, + "loss": 0.4336, + "mean_token_accuracy": 0.8595223426818848, + "num_tokens": 25129253.0, + "step": 662 + }, + { + "epoch": 0.08434041470550821, + "ewc_loss": 0.001523573650047183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5235736100294162e-05, + "grad_norm": 2.2799148559570312, + "learning_rate": 2.806273844849512e-07, + "loss": 0.4719, + "mean_token_accuracy": 0.8482311964035034, + "num_tokens": 25169119.0, + "step": 663 + }, + { + "epoch": 0.08446762498409871, + "ewc_loss": 0.0015032293740659952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.503229395893868e-05, + "grad_norm": 2.443657636642456, + "learning_rate": 2.810512929207291e-07, + "loss": 0.5413, + "mean_token_accuracy": 0.8216131925582886, + "num_tokens": 25207654.0, + "step": 664 + }, + { + "epoch": 0.08459483526268922, + "ewc_loss": 0.0015314797637984157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.531479756522458e-05, + "grad_norm": 2.3453786373138428, + "learning_rate": 
2.8147520135650697e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.8396363854408264, + "num_tokens": 25243258.0, + "step": 665 + }, + { + "epoch": 0.08472204554127974, + "ewc_loss": 0.0015250276774168015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5250276192091405e-05, + "grad_norm": 2.3435301780700684, + "learning_rate": 2.8189910979228487e-07, + "loss": 0.5265, + "mean_token_accuracy": 0.832619309425354, + "num_tokens": 25281301.0, + "step": 666 + }, + { + "epoch": 0.08484925581987024, + "ewc_loss": 0.0015288694994524121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5288695067283697e-05, + "grad_norm": 2.492267608642578, + "learning_rate": 2.823230182280627e-07, + "loss": 0.4797, + "mean_token_accuracy": 0.8458033800125122, + "num_tokens": 25313500.0, + "step": 667 + }, + { + "epoch": 0.08497646609846075, + "ewc_loss": 0.0015514902770519257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5514902770519257e-05, + "grad_norm": 2.3608572483062744, + "learning_rate": 2.827469266638406e-07, + "loss": 0.5152, + "mean_token_accuracy": 0.8341962099075317, + "num_tokens": 25355121.0, + "step": 668 + }, + { + "epoch": 0.08510367637705127, + "ewc_loss": 0.0015381762059405446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5381761841126718e-05, + "grad_norm": 2.3373160362243652, + "learning_rate": 2.8317083509961846e-07, + "loss": 0.5307, + "mean_token_accuracy": 0.8310428261756897, + "num_tokens": 25398868.0, + "step": 669 + }, + { + "epoch": 0.08523088665564178, + "ewc_loss": 0.0015310060698539019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5310060916817747e-05, + "grad_norm": 2.3651089668273926, + "learning_rate": 2.8359474353539636e-07, + "loss": 0.5103, + "mean_token_accuracy": 0.8346929550170898, + "num_tokens": 25438670.0, + "step": 670 + }, + { + "epoch": 0.08535809693423228, + "ewc_loss": 0.0015452749794349074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5452749721589498e-05, + "grad_norm": 2.44888973236084, + "learning_rate": 
2.840186519711742e-07, + "loss": 0.5557, + "mean_token_accuracy": 0.8257195353507996, + "num_tokens": 25479243.0, + "step": 671 + }, + { + "epoch": 0.0854853072128228, + "ewc_loss": 0.0015623692888766527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5623692888766527e-05, + "grad_norm": 2.4739444255828857, + "learning_rate": 2.844425604069521e-07, + "loss": 0.5406, + "mean_token_accuracy": 0.8267446160316467, + "num_tokens": 25514615.0, + "step": 672 + }, + { + "epoch": 0.08561251749141331, + "ewc_loss": 0.0015674388268962502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.567438812344335e-05, + "grad_norm": 2.51203989982605, + "learning_rate": 2.8486646884272995e-07, + "loss": 0.5167, + "mean_token_accuracy": 0.8334324359893799, + "num_tokens": 25552753.0, + "step": 673 + }, + { + "epoch": 0.08573972777000381, + "ewc_loss": 0.0015707362908869982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.570736276335083e-05, + "grad_norm": 2.442335367202759, + "learning_rate": 2.8529037727850785e-07, + "loss": 0.4553, + "mean_token_accuracy": 0.850121259689331, + "num_tokens": 25587415.0, + "step": 674 + }, + { + "epoch": 0.08586693804859433, + "ewc_loss": 0.001561689656227827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.561689714435488e-05, + "grad_norm": 2.3495237827301025, + "learning_rate": 2.857142857142857e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.8461006879806519, + "num_tokens": 25625962.0, + "step": 675 + }, + { + "epoch": 0.08599414832718484, + "ewc_loss": 0.0015462457668036222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5462457668036222e-05, + "grad_norm": 2.427021026611328, + "learning_rate": 2.861381941500636e-07, + "loss": 0.532, + "mean_token_accuracy": 0.8289679288864136, + "num_tokens": 25669680.0, + "step": 676 + }, + { + "epoch": 0.08612135860577534, + "ewc_loss": 0.0015632468275725842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5632467693649232e-05, + "grad_norm": 2.4309043884277344, + "learning_rate": 
2.8656210258584144e-07, + "loss": 0.566, + "mean_token_accuracy": 0.8161135911941528, + "num_tokens": 25709221.0, + "step": 677 + }, + { + "epoch": 0.08624856888436586, + "ewc_loss": 0.0015668647829443216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.566864739288576e-05, + "grad_norm": 2.5718271732330322, + "learning_rate": 2.869860110216193e-07, + "loss": 0.5679, + "mean_token_accuracy": 0.8158044219017029, + "num_tokens": 25741880.0, + "step": 678 + }, + { + "epoch": 0.08637577916295637, + "ewc_loss": 0.0015852446667850018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.585244717716705e-05, + "grad_norm": 2.2865304946899414, + "learning_rate": 2.874099194573972e-07, + "loss": 0.5014, + "mean_token_accuracy": 0.8386208415031433, + "num_tokens": 25786282.0, + "step": 679 + }, + { + "epoch": 0.08650298944154687, + "ewc_loss": 0.0015426786849275231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5426787285832688e-05, + "grad_norm": 2.4930357933044434, + "learning_rate": 2.878338278931751e-07, + "loss": 0.5459, + "mean_token_accuracy": 0.8275343179702759, + "num_tokens": 25820229.0, + "step": 680 + }, + { + "epoch": 0.08663019972013739, + "ewc_loss": 0.0015764099080115557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5764098861836828e-05, + "grad_norm": 2.391033887863159, + "learning_rate": 2.8825773632895293e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8473746180534363, + "num_tokens": 25856465.0, + "step": 681 + }, + { + "epoch": 0.0867574099987279, + "ewc_loss": 0.0015665313694626093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.566531318530906e-05, + "grad_norm": 2.3243682384490967, + "learning_rate": 2.886816447647308e-07, + "loss": 0.4797, + "mean_token_accuracy": 0.8442554473876953, + "num_tokens": 25897407.0, + "step": 682 + }, + { + "epoch": 0.08688462027731841, + "ewc_loss": 0.0015562784392386675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5562784028588794e-05, + "grad_norm": 2.406550407409668, + "learning_rate": 
2.891055532005087e-07, + "loss": 0.5046, + "mean_token_accuracy": 0.8372468948364258, + "num_tokens": 25938825.0, + "step": 683 + }, + { + "epoch": 0.08701183055590891, + "ewc_loss": 0.0015748519217595458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5748519217595458e-05, + "grad_norm": 2.6085386276245117, + "learning_rate": 2.8952946163628657e-07, + "loss": 0.5337, + "mean_token_accuracy": 0.8261644840240479, + "num_tokens": 25974944.0, + "step": 684 + }, + { + "epoch": 0.08713904083449943, + "ewc_loss": 0.0016117445193231106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6117444829433225e-05, + "grad_norm": 2.6487321853637695, + "learning_rate": 2.899533700720644e-07, + "loss": 0.4927, + "mean_token_accuracy": 0.8394838571548462, + "num_tokens": 26012864.0, + "step": 685 + }, + { + "epoch": 0.08726625111308994, + "ewc_loss": 0.001609722152352333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6097221305244602e-05, + "grad_norm": 2.377540111541748, + "learning_rate": 2.9037727850784227e-07, + "loss": 0.5381, + "mean_token_accuracy": 0.8267819285392761, + "num_tokens": 26049779.0, + "step": 686 + }, + { + "epoch": 0.08739346139168044, + "ewc_loss": 0.0015622152714058757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5622152204741724e-05, + "grad_norm": 2.3498404026031494, + "learning_rate": 2.9080118694362016e-07, + "loss": 0.5095, + "mean_token_accuracy": 0.8369306325912476, + "num_tokens": 26089095.0, + "step": 687 + }, + { + "epoch": 0.08752067167027096, + "ewc_loss": 0.0015588350361213088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.558834992465563e-05, + "grad_norm": 2.241575002670288, + "learning_rate": 2.9122509537939806e-07, + "loss": 0.5082, + "mean_token_accuracy": 0.8373426198959351, + "num_tokens": 26132553.0, + "step": 688 + }, + { + "epoch": 0.08764788194886147, + "ewc_loss": 0.0015545147471129894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5545147107332014e-05, + "grad_norm": 2.3851888179779053, + "learning_rate": 
2.916490038151759e-07, + "loss": 0.5598, + "mean_token_accuracy": 0.8174666166305542, + "num_tokens": 26171301.0, + "step": 689 + }, + { + "epoch": 0.08777509222745197, + "ewc_loss": 0.001583307282999158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5833073121029884e-05, + "grad_norm": 2.4237496852874756, + "learning_rate": 2.9207291225095376e-07, + "loss": 0.5462, + "mean_token_accuracy": 0.8233294486999512, + "num_tokens": 26217871.0, + "step": 690 + }, + { + "epoch": 0.08790230250604249, + "ewc_loss": 0.0015873933443799615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5873933080001734e-05, + "grad_norm": 2.3259029388427734, + "learning_rate": 2.9249682068673166e-07, + "loss": 0.5107, + "mean_token_accuracy": 0.834297239780426, + "num_tokens": 26255927.0, + "step": 691 + }, + { + "epoch": 0.088029512784633, + "ewc_loss": 0.0015693677123636007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.569367668707855e-05, + "grad_norm": 2.3194565773010254, + "learning_rate": 2.9292072912250955e-07, + "loss": 0.4932, + "mean_token_accuracy": 0.8428878784179688, + "num_tokens": 26296041.0, + "step": 692 + }, + { + "epoch": 0.0881567230632235, + "ewc_loss": 0.0015739355003461242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5739355148980394e-05, + "grad_norm": 2.5485808849334717, + "learning_rate": 2.933446375582874e-07, + "loss": 0.574, + "mean_token_accuracy": 0.8176521062850952, + "num_tokens": 26333785.0, + "step": 693 + }, + { + "epoch": 0.08828393334181402, + "ewc_loss": 0.0016123125096783042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.61231255333405e-05, + "grad_norm": 2.4224820137023926, + "learning_rate": 2.9376854599406525e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.8412057161331177, + "num_tokens": 26368837.0, + "step": 694 + }, + { + "epoch": 0.08841114362040453, + "ewc_loss": 0.0015900597209110856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5900597645668313e-05, + "grad_norm": 2.5404908657073975, + "learning_rate": 
2.9419245442984315e-07, + "loss": 0.5554, + "mean_token_accuracy": 0.8219907879829407, + "num_tokens": 26402151.0, + "step": 695 + }, + { + "epoch": 0.08853835389899505, + "ewc_loss": 0.0016101481160148978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6101481378427707e-05, + "grad_norm": 2.4439659118652344, + "learning_rate": 2.9461636286562104e-07, + "loss": 0.51, + "mean_token_accuracy": 0.8336606621742249, + "num_tokens": 26435550.0, + "step": 696 + }, + { + "epoch": 0.08866556417758555, + "ewc_loss": 0.0015974801499396563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5974801499396563e-05, + "grad_norm": 2.2296392917633057, + "learning_rate": 2.9504027130139884e-07, + "loss": 0.5005, + "mean_token_accuracy": 0.8389901518821716, + "num_tokens": 26479299.0, + "step": 697 + }, + { + "epoch": 0.08879277445617606, + "ewc_loss": 0.0015631010755896568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5631010683136992e-05, + "grad_norm": 2.3856289386749268, + "learning_rate": 2.9546417973717674e-07, + "loss": 0.5372, + "mean_token_accuracy": 0.8275549411773682, + "num_tokens": 26519380.0, + "step": 698 + }, + { + "epoch": 0.08891998473476657, + "ewc_loss": 0.0016009421087801456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6009420505724847e-05, + "grad_norm": 2.3446831703186035, + "learning_rate": 2.9588808817295464e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.848150372505188, + "num_tokens": 26557306.0, + "step": 699 + }, + { + "epoch": 0.08904719501335707, + "ewc_loss": 0.0015974999405443668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.597499976924155e-05, + "grad_norm": 2.499009132385254, + "learning_rate": 2.9631199660873253e-07, + "loss": 0.5322, + "mean_token_accuracy": 0.8303310871124268, + "num_tokens": 26593549.0, + "step": 700 + }, + { + "epoch": 0.08917440529194759, + "ewc_loss": 0.001621853094547987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.621853152755648e-05, + "grad_norm": 2.35076642036438, + "learning_rate": 
2.9673590504451033e-07, + "loss": 0.4576, + "mean_token_accuracy": 0.8524047136306763, + "num_tokens": 26629926.0, + "step": 701 + }, + { + "epoch": 0.0893016155705381, + "ewc_loss": 0.0016019213944673538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6019213944673538e-05, + "grad_norm": 2.2833092212677, + "learning_rate": 2.9715981348028823e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.8437118530273438, + "num_tokens": 26668652.0, + "step": 702 + }, + { + "epoch": 0.0894288258491286, + "ewc_loss": 0.0015937206335365772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5937206626404077e-05, + "grad_norm": 2.444796085357666, + "learning_rate": 2.975837219160661e-07, + "loss": 0.5501, + "mean_token_accuracy": 0.8232187032699585, + "num_tokens": 26708084.0, + "step": 703 + }, + { + "epoch": 0.08955603612771912, + "ewc_loss": 0.0016256952658295631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6256952221738175e-05, + "grad_norm": 2.3593943119049072, + "learning_rate": 2.98007630351844e-07, + "loss": 0.5175, + "mean_token_accuracy": 0.8334959745407104, + "num_tokens": 26750134.0, + "step": 704 + }, + { + "epoch": 0.08968324640630963, + "ewc_loss": 0.0016156351193785667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6156351193785667e-05, + "grad_norm": 2.4586257934570312, + "learning_rate": 2.984315387876218e-07, + "loss": 0.5258, + "mean_token_accuracy": 0.8310179710388184, + "num_tokens": 26785410.0, + "step": 705 + }, + { + "epoch": 0.08981045668490013, + "ewc_loss": 0.0016305515309795737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6305515600834042e-05, + "grad_norm": 2.5233399868011475, + "learning_rate": 2.988554472233997e-07, + "loss": 0.531, + "mean_token_accuracy": 0.8312810659408569, + "num_tokens": 26825226.0, + "step": 706 + }, + { + "epoch": 0.08993766696349065, + "ewc_loss": 0.0016395588172599673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6395588318118826e-05, + "grad_norm": 2.309037208557129, + "learning_rate": 
2.992793556591776e-07, + "loss": 0.5227, + "mean_token_accuracy": 0.8316068649291992, + "num_tokens": 26865401.0, + "step": 707 + }, + { + "epoch": 0.09006487724208116, + "ewc_loss": 0.0016047775279730558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6047775716288015e-05, + "grad_norm": 2.3992509841918945, + "learning_rate": 2.997032640949555e-07, + "loss": 0.5338, + "mean_token_accuracy": 0.8302549719810486, + "num_tokens": 26905365.0, + "step": 708 + }, + { + "epoch": 0.09019208752067168, + "ewc_loss": 0.001625661039724946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6256610251730308e-05, + "grad_norm": 2.356490135192871, + "learning_rate": 3.001271725307333e-07, + "loss": 0.4737, + "mean_token_accuracy": 0.8451337218284607, + "num_tokens": 26941348.0, + "step": 709 + }, + { + "epoch": 0.09031929779926218, + "ewc_loss": 0.001625745790079236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.625745790079236e-05, + "grad_norm": 2.464895725250244, + "learning_rate": 3.005510809665112e-07, + "loss": 0.5105, + "mean_token_accuracy": 0.8359881043434143, + "num_tokens": 26975803.0, + "step": 710 + }, + { + "epoch": 0.09044650807785269, + "ewc_loss": 0.0016468383837491274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.646838427404873e-05, + "grad_norm": 2.399945020675659, + "learning_rate": 3.009749894022891e-07, + "loss": 0.4569, + "mean_token_accuracy": 0.8513563275337219, + "num_tokens": 27014931.0, + "step": 711 + }, + { + "epoch": 0.0905737183564432, + "ewc_loss": 0.0016335916006937623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6335916370735504e-05, + "grad_norm": 2.3882546424865723, + "learning_rate": 3.01398897838067e-07, + "loss": 0.5112, + "mean_token_accuracy": 0.8376801013946533, + "num_tokens": 27055505.0, + "step": 712 + }, + { + "epoch": 0.0907009286350337, + "ewc_loss": 0.0016324924072250724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6324924217769876e-05, + "grad_norm": 2.429501533508301, + "learning_rate": 3.018228062738448e-07, + 
"loss": 0.5398, + "mean_token_accuracy": 0.8301019668579102, + "num_tokens": 27092121.0, + "step": 713 + }, + { + "epoch": 0.09082813891362422, + "ewc_loss": 0.0016412257682532072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.641225753701292e-05, + "grad_norm": 2.369710683822632, + "learning_rate": 3.022467147096227e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.8457863926887512, + "num_tokens": 27129827.0, + "step": 714 + }, + { + "epoch": 0.09095534919221474, + "ewc_loss": 0.0016323885647580028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.632388557482045e-05, + "grad_norm": 2.289473056793213, + "learning_rate": 3.026706231454006e-07, + "loss": 0.4928, + "mean_token_accuracy": 0.8398165702819824, + "num_tokens": 27174977.0, + "step": 715 + }, + { + "epoch": 0.09108255947080524, + "ewc_loss": 0.0016219977987930179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6219977624132298e-05, + "grad_norm": 2.5245039463043213, + "learning_rate": 3.0309453158117844e-07, + "loss": 0.499, + "mean_token_accuracy": 0.8391695022583008, + "num_tokens": 27211472.0, + "step": 716 + }, + { + "epoch": 0.09120976974939575, + "ewc_loss": 0.0016671507619321346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.667150718276389e-05, + "grad_norm": 2.4212236404418945, + "learning_rate": 3.035184400169563e-07, + "loss": 0.5776, + "mean_token_accuracy": 0.8178520202636719, + "num_tokens": 27249160.0, + "step": 717 + }, + { + "epoch": 0.09133698002798626, + "ewc_loss": 0.0016517149051651359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.651714956096839e-05, + "grad_norm": 2.4906022548675537, + "learning_rate": 3.039423484527342e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.8404220938682556, + "num_tokens": 27287946.0, + "step": 718 + }, + { + "epoch": 0.09146419030657676, + "ewc_loss": 0.0016604579286649823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6604579286649823e-05, + "grad_norm": 2.4931488037109375, + "learning_rate": 3.043662568885121e-07, + "loss": 0.5221, + 
"mean_token_accuracy": 0.8285219669342041, + "num_tokens": 27323388.0, + "step": 719 + }, + { + "epoch": 0.09159140058516728, + "ewc_loss": 0.001662340248003602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6623402188997716e-05, + "grad_norm": 2.583494186401367, + "learning_rate": 3.0479016532428993e-07, + "loss": 0.5714, + "mean_token_accuracy": 0.8159554600715637, + "num_tokens": 27357524.0, + "step": 720 + }, + { + "epoch": 0.0917186108637578, + "ewc_loss": 0.001676844316534698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.676844294706825e-05, + "grad_norm": 2.418858289718628, + "learning_rate": 3.052140737600678e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8418834209442139, + "num_tokens": 27392037.0, + "step": 721 + }, + { + "epoch": 0.09184582114234831, + "ewc_loss": 0.0016532865120097995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6532865629415028e-05, + "grad_norm": 2.4328551292419434, + "learning_rate": 3.056379821958457e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8406948447227478, + "num_tokens": 27429149.0, + "step": 722 + }, + { + "epoch": 0.09197303142093881, + "ewc_loss": 0.0016605814453214407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.660581438045483e-05, + "grad_norm": 2.474045991897583, + "learning_rate": 3.060618906316236e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.8421726226806641, + "num_tokens": 27465023.0, + "step": 723 + }, + { + "epoch": 0.09210024169952932, + "ewc_loss": 0.0016676309751346707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.667630931478925e-05, + "grad_norm": 2.382158041000366, + "learning_rate": 3.064857990674014e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.842201828956604, + "num_tokens": 27501268.0, + "step": 724 + }, + { + "epoch": 0.09222745197811984, + "ewc_loss": 0.0016542491503059864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6542491721338592e-05, + "grad_norm": 2.357604503631592, + "learning_rate": 3.0690970750317927e-07, + "loss": 0.5383, + "mean_token_accuracy": 
0.828134298324585, + "num_tokens": 27539937.0, + "step": 725 + }, + { + "epoch": 0.09235466225671034, + "ewc_loss": 0.0016518315533176064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6518315533176064e-05, + "grad_norm": 2.4412333965301514, + "learning_rate": 3.0733361593895717e-07, + "loss": 0.501, + "mean_token_accuracy": 0.838054895401001, + "num_tokens": 27574576.0, + "step": 726 + }, + { + "epoch": 0.09248187253530085, + "ewc_loss": 0.0016697982791811228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.66979825735325e-05, + "grad_norm": 2.3916585445404053, + "learning_rate": 3.0775752437473507e-07, + "loss": 0.5094, + "mean_token_accuracy": 0.8315789699554443, + "num_tokens": 27616036.0, + "step": 727 + }, + { + "epoch": 0.09260908281389137, + "ewc_loss": 0.0016614043852314353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6614043488516472e-05, + "grad_norm": 2.3339059352874756, + "learning_rate": 3.081814328105129e-07, + "loss": 0.532, + "mean_token_accuracy": 0.8295480608940125, + "num_tokens": 27662040.0, + "step": 728 + }, + { + "epoch": 0.09273629309248187, + "ewc_loss": 0.0016542504308745265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6542504454264417e-05, + "grad_norm": 2.404599189758301, + "learning_rate": 3.0860534124629076e-07, + "loss": 0.5614, + "mean_token_accuracy": 0.8198573589324951, + "num_tokens": 27705136.0, + "step": 729 + }, + { + "epoch": 0.09286350337107238, + "ewc_loss": 0.0016679477412253618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6679477994330227e-05, + "grad_norm": 2.4468061923980713, + "learning_rate": 3.0902924968206866e-07, + "loss": 0.5106, + "mean_token_accuracy": 0.8373355269432068, + "num_tokens": 27740564.0, + "step": 730 + }, + { + "epoch": 0.0929907136496629, + "ewc_loss": 0.0016828555380925536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.682855508988723e-05, + "grad_norm": 2.513162612915039, + "learning_rate": 3.0945315811784656e-07, + "loss": 0.4852, + "mean_token_accuracy": 
0.8401931524276733, + "num_tokens": 27774788.0, + "step": 731 + }, + { + "epoch": 0.0931179239282534, + "ewc_loss": 0.0016920911148190498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6920910638873465e-05, + "grad_norm": 2.339493989944458, + "learning_rate": 3.098770665536244e-07, + "loss": 0.6284, + "mean_token_accuracy": 0.7981569170951843, + "num_tokens": 27818300.0, + "step": 732 + }, + { + "epoch": 0.09324513420684391, + "ewc_loss": 0.0016615865752100945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6615866115898825e-05, + "grad_norm": 2.3923470973968506, + "learning_rate": 3.1030097498940225e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.8457051515579224, + "num_tokens": 27857739.0, + "step": 733 + }, + { + "epoch": 0.09337234448543442, + "ewc_loss": 0.0016709760529920459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.670976052992046e-05, + "grad_norm": 2.4203388690948486, + "learning_rate": 3.1072488342518015e-07, + "loss": 0.5095, + "mean_token_accuracy": 0.8295755386352539, + "num_tokens": 27893897.0, + "step": 734 + }, + { + "epoch": 0.09349955476402494, + "ewc_loss": 0.0016879617469385266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.687961776042357e-05, + "grad_norm": 2.4470911026000977, + "learning_rate": 3.11148791860958e-07, + "loss": 0.4989, + "mean_token_accuracy": 0.8363272547721863, + "num_tokens": 27930511.0, + "step": 735 + }, + { + "epoch": 0.09362676504261544, + "ewc_loss": 0.0016910278936848044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.691027864580974e-05, + "grad_norm": 2.5723721981048584, + "learning_rate": 3.115727002967359e-07, + "loss": 0.515, + "mean_token_accuracy": 0.8339095711708069, + "num_tokens": 27961128.0, + "step": 736 + }, + { + "epoch": 0.09375397532120595, + "ewc_loss": 0.0017121469136327505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7121468772529624e-05, + "grad_norm": 2.3812873363494873, + "learning_rate": 3.1199660873251374e-07, + "loss": 0.5159, + "mean_token_accuracy": 
0.8342710137367249, + "num_tokens": 28000748.0, + "step": 737 + }, + { + "epoch": 0.09388118559979647, + "ewc_loss": 0.0016806708881631494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6806709027150646e-05, + "grad_norm": 2.282965660095215, + "learning_rate": 3.1242051716829164e-07, + "loss": 0.5502, + "mean_token_accuracy": 0.82206130027771, + "num_tokens": 28046455.0, + "step": 738 + }, + { + "epoch": 0.09400839587838697, + "ewc_loss": 0.001667578355409205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6675783626851626e-05, + "grad_norm": 2.4403841495513916, + "learning_rate": 3.128444256040695e-07, + "loss": 0.5251, + "mean_token_accuracy": 0.8321841955184937, + "num_tokens": 28087026.0, + "step": 739 + }, + { + "epoch": 0.09413560615697748, + "ewc_loss": 0.0017021915409713984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.702191548247356e-05, + "grad_norm": 2.342730760574341, + "learning_rate": 3.132683340398474e-07, + "loss": 0.4501, + "mean_token_accuracy": 0.8509177565574646, + "num_tokens": 28126382.0, + "step": 740 + }, + { + "epoch": 0.094262816435568, + "ewc_loss": 0.0016898101894184947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6898102330742404e-05, + "grad_norm": 2.378127336502075, + "learning_rate": 3.1369224247562523e-07, + "loss": 0.4755, + "mean_token_accuracy": 0.847231388092041, + "num_tokens": 28161198.0, + "step": 741 + }, + { + "epoch": 0.0943900267141585, + "ewc_loss": 0.0016973608871921897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6973608580883592e-05, + "grad_norm": 2.260073661804199, + "learning_rate": 3.1411615091140313e-07, + "loss": 0.4455, + "mean_token_accuracy": 0.8550169467926025, + "num_tokens": 28204612.0, + "step": 742 + }, + { + "epoch": 0.09451723699274901, + "ewc_loss": 0.0016829018713906407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6829018932185136e-05, + "grad_norm": 2.3701329231262207, + "learning_rate": 3.14540059347181e-07, + "loss": 0.4992, + "mean_token_accuracy": 0.8380113244056702, + 
"num_tokens": 28241050.0, + "step": 743 + }, + { + "epoch": 0.09464444727133953, + "ewc_loss": 0.0017106200102716684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.710620017547626e-05, + "grad_norm": 2.35951566696167, + "learning_rate": 3.149639677829589e-07, + "loss": 0.4603, + "mean_token_accuracy": 0.8497326374053955, + "num_tokens": 28278032.0, + "step": 744 + }, + { + "epoch": 0.09477165754993004, + "ewc_loss": 0.001708950032480061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7089500033762306e-05, + "grad_norm": 2.444175958633423, + "learning_rate": 3.153878762187368e-07, + "loss": 0.5465, + "mean_token_accuracy": 0.8286182880401611, + "num_tokens": 28315599.0, + "step": 745 + }, + { + "epoch": 0.09489886782852054, + "ewc_loss": 0.001721433363854885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.721433363854885e-05, + "grad_norm": 2.3683674335479736, + "learning_rate": 3.158117846545146e-07, + "loss": 0.5709, + "mean_token_accuracy": 0.819551944732666, + "num_tokens": 28358504.0, + "step": 746 + }, + { + "epoch": 0.09502607810711106, + "ewc_loss": 0.0017102280398830771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.710228025331162e-05, + "grad_norm": 2.6290647983551025, + "learning_rate": 3.1623569309029247e-07, + "loss": 0.4884, + "mean_token_accuracy": 0.8421734571456909, + "num_tokens": 28389626.0, + "step": 747 + }, + { + "epoch": 0.09515328838570157, + "ewc_loss": 0.0017496371874585748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7496371583547443e-05, + "grad_norm": 2.302344560623169, + "learning_rate": 3.1665960152607037e-07, + "loss": 0.4683, + "mean_token_accuracy": 0.8462472558021545, + "num_tokens": 28430361.0, + "step": 748 + }, + { + "epoch": 0.09528049866429207, + "ewc_loss": 0.0017006095731630921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.700609573163092e-05, + "grad_norm": 2.3381707668304443, + "learning_rate": 3.1708350996184826e-07, + "loss": 0.5064, + "mean_token_accuracy": 0.8349008560180664, + "num_tokens": 
28469792.0, + "step": 749 + }, + { + "epoch": 0.09540770894288259, + "ewc_loss": 0.0017045276472344995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.70452767633833e-05, + "grad_norm": 2.4392919540405273, + "learning_rate": 3.175074183976261e-07, + "loss": 0.4839, + "mean_token_accuracy": 0.8451109528541565, + "num_tokens": 28507569.0, + "step": 750 + }, + { + "epoch": 0.0955349192214731, + "ewc_loss": 0.0017290362156927586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7290361938648857e-05, + "grad_norm": 2.485943078994751, + "learning_rate": 3.1793132683340396e-07, + "loss": 0.5104, + "mean_token_accuracy": 0.836467444896698, + "num_tokens": 28542336.0, + "step": 751 + }, + { + "epoch": 0.0956621295000636, + "ewc_loss": 0.001734336605295539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.734336547087878e-05, + "grad_norm": 2.4595978260040283, + "learning_rate": 3.1835523526918186e-07, + "loss": 0.4612, + "mean_token_accuracy": 0.8493151664733887, + "num_tokens": 28578587.0, + "step": 752 + }, + { + "epoch": 0.09578933977865411, + "ewc_loss": 0.0017214759718626738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.721475928206928e-05, + "grad_norm": 2.3829944133758545, + "learning_rate": 3.1877914370495975e-07, + "loss": 0.5403, + "mean_token_accuracy": 0.8263670206069946, + "num_tokens": 28617505.0, + "step": 753 + }, + { + "epoch": 0.09591655005724463, + "ewc_loss": 0.0017075339565053582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.70753392012557e-05, + "grad_norm": 2.3944449424743652, + "learning_rate": 3.1920305214073755e-07, + "loss": 0.5447, + "mean_token_accuracy": 0.8259007930755615, + "num_tokens": 28658526.0, + "step": 754 + }, + { + "epoch": 0.09604376033583513, + "ewc_loss": 0.0017136095557361841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7136095266323537e-05, + "grad_norm": 2.363992929458618, + "learning_rate": 3.1962696057651545e-07, + "loss": 0.4815, + "mean_token_accuracy": 0.8436373472213745, + "num_tokens": 28698825.0, + 
"step": 755 + }, + { + "epoch": 0.09617097061442564, + "ewc_loss": 0.0017149243503808975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7149242921732366e-05, + "grad_norm": 2.496734619140625, + "learning_rate": 3.2005086901229335e-07, + "loss": 0.5245, + "mean_token_accuracy": 0.8285937905311584, + "num_tokens": 28733238.0, + "step": 756 + }, + { + "epoch": 0.09629818089301616, + "ewc_loss": 0.0017414125613868237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7414125977666117e-05, + "grad_norm": 2.3561809062957764, + "learning_rate": 3.2047477744807125e-07, + "loss": 0.5239, + "mean_token_accuracy": 0.8336278200149536, + "num_tokens": 28771552.0, + "step": 757 + }, + { + "epoch": 0.09642539117160667, + "ewc_loss": 0.0017171046929433942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.717104714771267e-05, + "grad_norm": 2.4036688804626465, + "learning_rate": 3.2089868588384904e-07, + "loss": 0.5539, + "mean_token_accuracy": 0.819291353225708, + "num_tokens": 28810827.0, + "step": 758 + }, + { + "epoch": 0.09655260145019717, + "ewc_loss": 0.0017317377496510744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.731737756927032e-05, + "grad_norm": 2.3977551460266113, + "learning_rate": 3.2132259431962694e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.8481142520904541, + "num_tokens": 28849017.0, + "step": 759 + }, + { + "epoch": 0.09667981172878769, + "ewc_loss": 0.0017384167294949293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7384167222189717e-05, + "grad_norm": 2.42790150642395, + "learning_rate": 3.2174650275540484e-07, + "loss": 0.5202, + "mean_token_accuracy": 0.8380146026611328, + "num_tokens": 28887659.0, + "step": 760 + }, + { + "epoch": 0.0968070220073782, + "ewc_loss": 0.0017418977804481983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7418977222405374e-05, + "grad_norm": 2.3010261058807373, + "learning_rate": 3.2217041119118274e-07, + "loss": 0.4854, + "mean_token_accuracy": 0.8419116735458374, + "num_tokens": 28928983.0, + "step": 761 + 
}, + { + "epoch": 0.0969342322859687, + "ewc_loss": 0.0017174389213323593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7174388631246984e-05, + "grad_norm": 2.497368574142456, + "learning_rate": 3.2259431962696053e-07, + "loss": 0.5172, + "mean_token_accuracy": 0.8344167470932007, + "num_tokens": 28962195.0, + "step": 762 + }, + { + "epoch": 0.09706144256455922, + "ewc_loss": 0.001766091212630272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.766091190802399e-05, + "grad_norm": 2.4493417739868164, + "learning_rate": 3.2301822806273843e-07, + "loss": 0.5006, + "mean_token_accuracy": 0.8364449739456177, + "num_tokens": 28997351.0, + "step": 763 + }, + { + "epoch": 0.09718865284314973, + "ewc_loss": 0.0017601436702534556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7601436411496252e-05, + "grad_norm": 2.3691744804382324, + "learning_rate": 3.2344213649851633e-07, + "loss": 0.526, + "mean_token_accuracy": 0.8305438160896301, + "num_tokens": 29040988.0, + "step": 764 + }, + { + "epoch": 0.09731586312174023, + "ewc_loss": 0.0017408462008461356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7408461644663475e-05, + "grad_norm": 2.3549606800079346, + "learning_rate": 3.238660449342942e-07, + "loss": 0.4897, + "mean_token_accuracy": 0.8395329713821411, + "num_tokens": 29081873.0, + "step": 765 + }, + { + "epoch": 0.09744307340033075, + "ewc_loss": 0.0017411414301022887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.741141386446543e-05, + "grad_norm": 2.457716464996338, + "learning_rate": 3.24289953370072e-07, + "loss": 0.4637, + "mean_token_accuracy": 0.8455235958099365, + "num_tokens": 29115360.0, + "step": 766 + }, + { + "epoch": 0.09757028367892126, + "ewc_loss": 0.0017683665500953794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7683665646472946e-05, + "grad_norm": 2.2917463779449463, + "learning_rate": 3.247138618058499e-07, + "loss": 0.5068, + "mean_token_accuracy": 0.8315966129302979, + "num_tokens": 29160139.0, + "step": 767 + }, + { + "epoch": 
0.09769749395751176, + "ewc_loss": 0.0017396077746525407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7396077964804135e-05, + "grad_norm": 2.3064119815826416, + "learning_rate": 3.251377702416278e-07, + "loss": 0.5358, + "mean_token_accuracy": 0.8292088508605957, + "num_tokens": 29202872.0, + "step": 768 + }, + { + "epoch": 0.09782470423610228, + "ewc_loss": 0.0017438885988667607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7438886061427183e-05, + "grad_norm": 2.288649797439575, + "learning_rate": 3.255616786774057e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.8427351713180542, + "num_tokens": 29246965.0, + "step": 769 + }, + { + "epoch": 0.09795191451469279, + "ewc_loss": 0.0017490186728537083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7490187019575387e-05, + "grad_norm": 2.2583067417144775, + "learning_rate": 3.259855871131835e-07, + "loss": 0.4239, + "mean_token_accuracy": 0.8601208329200745, + "num_tokens": 29289531.0, + "step": 770 + }, + { + "epoch": 0.0980791247932833, + "ewc_loss": 0.0017463135300204158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7463135009165853e-05, + "grad_norm": 2.335386276245117, + "learning_rate": 3.264094955489614e-07, + "loss": 0.4796, + "mean_token_accuracy": 0.8434207439422607, + "num_tokens": 29330484.0, + "step": 771 + }, + { + "epoch": 0.0982063350718738, + "ewc_loss": 0.0017613789532333612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7613789168535732e-05, + "grad_norm": 2.307096242904663, + "learning_rate": 3.268334039847393e-07, + "loss": 0.505, + "mean_token_accuracy": 0.8353496193885803, + "num_tokens": 29369595.0, + "step": 772 + }, + { + "epoch": 0.09833354535046432, + "ewc_loss": 0.001753504853695631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.753504875523504e-05, + "grad_norm": 2.368176221847534, + "learning_rate": 3.2725731242051715e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.8424972295761108, + "num_tokens": 29411856.0, + "step": 773 + }, + { + "epoch": 0.09846075562905483, 
+ "ewc_loss": 0.001767601352185011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.767601315805223e-05, + "grad_norm": 2.34671688079834, + "learning_rate": 3.27681220856295e-07, + "loss": 0.4768, + "mean_token_accuracy": 0.8458845615386963, + "num_tokens": 29454754.0, + "step": 774 + }, + { + "epoch": 0.09858796590764533, + "ewc_loss": 0.0017619332065805793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7619331629248336e-05, + "grad_norm": 2.437957286834717, + "learning_rate": 3.281051292920729e-07, + "loss": 0.4734, + "mean_token_accuracy": 0.8433421850204468, + "num_tokens": 29489131.0, + "step": 775 + }, + { + "epoch": 0.09871517618623585, + "ewc_loss": 0.0017801306676119566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.780130696715787e-05, + "grad_norm": 2.375288963317871, + "learning_rate": 3.285290377278508e-07, + "loss": 0.5272, + "mean_token_accuracy": 0.8291339874267578, + "num_tokens": 29528292.0, + "step": 776 + }, + { + "epoch": 0.09884238646482636, + "ewc_loss": 0.0017684891354292631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7684891645330936e-05, + "grad_norm": 2.4375040531158447, + "learning_rate": 3.2895294616362864e-07, + "loss": 0.4624, + "mean_token_accuracy": 0.851624608039856, + "num_tokens": 29564878.0, + "step": 777 + }, + { + "epoch": 0.09896959674341686, + "ewc_loss": 0.0017771286657080054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.777128636604175e-05, + "grad_norm": 2.356196165084839, + "learning_rate": 3.293768545994065e-07, + "loss": 0.4806, + "mean_token_accuracy": 0.8412512540817261, + "num_tokens": 29602826.0, + "step": 778 + }, + { + "epoch": 0.09909680702200738, + "ewc_loss": 0.0017613283125683665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7613283489481546e-05, + "grad_norm": 2.418990135192871, + "learning_rate": 3.298007630351844e-07, + "loss": 0.5488, + "mean_token_accuracy": 0.8257114291191101, + "num_tokens": 29640323.0, + "step": 779 + }, + { + "epoch": 0.09922401730059789, + "ewc_loss": 
0.0017785800155252218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7785800082492642e-05, + "grad_norm": 2.314601182937622, + "learning_rate": 3.302246714709623e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.8432333469390869, + "num_tokens": 29683302.0, + "step": 780 + }, + { + "epoch": 0.09935122757918839, + "ewc_loss": 0.001759798382408917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7597983969608322e-05, + "grad_norm": 2.3707969188690186, + "learning_rate": 3.3064857990674013e-07, + "loss": 0.5033, + "mean_token_accuracy": 0.8397098779678345, + "num_tokens": 29721085.0, + "step": 781 + }, + { + "epoch": 0.0994784378577789, + "ewc_loss": 0.0017721648328006268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7721647964208387e-05, + "grad_norm": 2.4450647830963135, + "learning_rate": 3.31072488342518e-07, + "loss": 0.4554, + "mean_token_accuracy": 0.8485487699508667, + "num_tokens": 29753810.0, + "step": 782 + }, + { + "epoch": 0.09960564813636942, + "ewc_loss": 0.001787530374713242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7875303456094116e-05, + "grad_norm": 2.305150270462036, + "learning_rate": 3.314963967782959e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8446238040924072, + "num_tokens": 29796488.0, + "step": 783 + }, + { + "epoch": 0.09973285841495994, + "ewc_loss": 0.0017635924741625786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.763592445058748e-05, + "grad_norm": 2.333554744720459, + "learning_rate": 3.319203052140738e-07, + "loss": 0.4937, + "mean_token_accuracy": 0.839735746383667, + "num_tokens": 29838181.0, + "step": 784 + }, + { + "epoch": 0.09986006869355044, + "ewc_loss": 0.0017670923843979836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7670923625701107e-05, + "grad_norm": 2.651160717010498, + "learning_rate": 3.323442136498516e-07, + "loss": 0.4771, + "mean_token_accuracy": 0.8454188704490662, + "num_tokens": 29868893.0, + "step": 785 + }, + { + "epoch": 0.09998727897214095, + "ewc_loss": 0.00183411268517375, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.834112663345877e-05, + "grad_norm": 2.5892653465270996, + "learning_rate": 3.3276812208562947e-07, + "loss": 0.4914, + "mean_token_accuracy": 0.8381340503692627, + "num_tokens": 29903571.0, + "step": 786 + }, + { + "epoch": 0.10011448925073146, + "ewc_loss": 0.0018201421480625868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8201420971308835e-05, + "grad_norm": 2.468518018722534, + "learning_rate": 3.3319203052140737e-07, + "loss": 0.5479, + "mean_token_accuracy": 0.8304526805877686, + "num_tokens": 29940438.0, + "step": 787 + }, + { + "epoch": 0.10024169952932196, + "ewc_loss": 0.0017884772969409823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7884773114928976e-05, + "grad_norm": 2.3545711040496826, + "learning_rate": 3.336159389571852e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8502732515335083, + "num_tokens": 29976538.0, + "step": 788 + }, + { + "epoch": 0.10036890980791248, + "ewc_loss": 0.0017731217667460442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.773121766746044e-05, + "grad_norm": 2.3188202381134033, + "learning_rate": 3.340398473929631e-07, + "loss": 0.54, + "mean_token_accuracy": 0.8263730406761169, + "num_tokens": 30019814.0, + "step": 789 + }, + { + "epoch": 0.100496120086503, + "ewc_loss": 0.0017779775662347674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7779775589588098e-05, + "grad_norm": 2.4465701580047607, + "learning_rate": 3.3446375582874096e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.8419643640518188, + "num_tokens": 30057578.0, + "step": 790 + }, + { + "epoch": 0.1006233303650935, + "ewc_loss": 0.0018088981742039323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.808898196031805e-05, + "grad_norm": 2.4412171840667725, + "learning_rate": 3.3488766426451886e-07, + "loss": 0.5612, + "mean_token_accuracy": 0.8274294137954712, + "num_tokens": 30094869.0, + "step": 791 + }, + { + "epoch": 0.10075054064368401, + "ewc_loss": 0.001804898725822568, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 1.804898784030229e-05, + "grad_norm": 2.354790210723877, + "learning_rate": 3.353115727002967e-07, + "loss": 0.5235, + "mean_token_accuracy": 0.8321143388748169, + "num_tokens": 30136446.0, + "step": 792 + }, + { + "epoch": 0.10087775092227452, + "ewc_loss": 0.0017903794068843126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7903794287121855e-05, + "grad_norm": 2.302579164505005, + "learning_rate": 3.357354811360746e-07, + "loss": 0.5167, + "mean_token_accuracy": 0.8292152881622314, + "num_tokens": 30179226.0, + "step": 793 + }, + { + "epoch": 0.10100496120086502, + "ewc_loss": 0.0017861267551779747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7861268133856356e-05, + "grad_norm": 2.4385857582092285, + "learning_rate": 3.3615938957185245e-07, + "loss": 0.5076, + "mean_token_accuracy": 0.8351901769638062, + "num_tokens": 30213890.0, + "step": 794 + }, + { + "epoch": 0.10113217147945554, + "ewc_loss": 0.0018188810208812356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.818880991777405e-05, + "grad_norm": 2.5500380992889404, + "learning_rate": 3.3658329800763035e-07, + "loss": 0.5578, + "mean_token_accuracy": 0.8176630139350891, + "num_tokens": 30245691.0, + "step": 795 + }, + { + "epoch": 0.10125938175804605, + "ewc_loss": 0.0018430526833981276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8430526324664243e-05, + "grad_norm": 2.437546491622925, + "learning_rate": 3.370072064434082e-07, + "loss": 0.5135, + "mean_token_accuracy": 0.8266066908836365, + "num_tokens": 30281738.0, + "step": 796 + }, + { + "epoch": 0.10138659203663657, + "ewc_loss": 0.0018189175752922893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8189175534644164e-05, + "grad_norm": 2.314648151397705, + "learning_rate": 3.374311148791861e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8420053124427795, + "num_tokens": 30320707.0, + "step": 797 + }, + { + "epoch": 0.10151380231522707, + "ewc_loss": 0.0017993503715842962, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.799350320652593e-05, + "grad_norm": 2.5837888717651367, + "learning_rate": 3.3785502331496394e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.8400957584381104, + "num_tokens": 30351379.0, + "step": 798 + }, + { + "epoch": 0.10164101259381758, + "ewc_loss": 0.0018606719095259905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8606719095259905e-05, + "grad_norm": 2.559279680252075, + "learning_rate": 3.3827893175074184e-07, + "loss": 0.5097, + "mean_token_accuracy": 0.8338163495063782, + "num_tokens": 30382395.0, + "step": 799 + }, + { + "epoch": 0.1017682228724081, + "ewc_loss": 0.00186016911175102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8601691408548504e-05, + "grad_norm": 2.4383723735809326, + "learning_rate": 3.387028401865197e-07, + "loss": 0.5161, + "mean_token_accuracy": 0.8373584747314453, + "num_tokens": 30419758.0, + "step": 800 + }, + { + "epoch": 0.1018954331509986, + "ewc_loss": 0.0018271873705089092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8271874068886973e-05, + "grad_norm": 2.378626823425293, + "learning_rate": 3.391267486222976e-07, + "loss": 0.5008, + "mean_token_accuracy": 0.8353543281555176, + "num_tokens": 30458115.0, + "step": 801 + }, + { + "epoch": 0.10202264342958911, + "ewc_loss": 0.0018173230346292257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.817323027353268e-05, + "grad_norm": 2.5590949058532715, + "learning_rate": 3.3955065705807543e-07, + "loss": 0.5345, + "mean_token_accuracy": 0.824286937713623, + "num_tokens": 30489679.0, + "step": 802 + }, + { + "epoch": 0.10214985370817962, + "ewc_loss": 0.0018620694754645228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8620694390847348e-05, + "grad_norm": 2.4359378814697266, + "learning_rate": 3.3997456549385333e-07, + "loss": 0.4724, + "mean_token_accuracy": 0.8473048210144043, + "num_tokens": 30524874.0, + "step": 803 + }, + { + "epoch": 0.10227706398677013, + "ewc_loss": 0.0018414841033518314, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.8414841179037467e-05, + "grad_norm": 2.4893200397491455, + "learning_rate": 3.403984739296312e-07, + "loss": 0.5433, + "mean_token_accuracy": 0.8249140977859497, + "num_tokens": 30558354.0, + "step": 804 + }, + { + "epoch": 0.10240427426536064, + "ewc_loss": 0.001849401625804603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8494016330805607e-05, + "grad_norm": 2.3466110229492188, + "learning_rate": 3.408223823654091e-07, + "loss": 0.4749, + "mean_token_accuracy": 0.8452818393707275, + "num_tokens": 30594827.0, + "step": 805 + }, + { + "epoch": 0.10253148454395115, + "ewc_loss": 0.0018337718211114407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8337717847316526e-05, + "grad_norm": 2.4196150302886963, + "learning_rate": 3.412462908011869e-07, + "loss": 0.5179, + "mean_token_accuracy": 0.838117241859436, + "num_tokens": 30635259.0, + "step": 806 + }, + { + "epoch": 0.10265869482254165, + "ewc_loss": 0.0018467819318175316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.846781924541574e-05, + "grad_norm": 2.3321540355682373, + "learning_rate": 3.4167019923696477e-07, + "loss": 0.5096, + "mean_token_accuracy": 0.8404598236083984, + "num_tokens": 30681850.0, + "step": 807 + }, + { + "epoch": 0.10278590510113217, + "ewc_loss": 0.001834701863117516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8347018340136856e-05, + "grad_norm": 2.369905471801758, + "learning_rate": 3.4209410767274267e-07, + "loss": 0.4754, + "mean_token_accuracy": 0.8490042686462402, + "num_tokens": 30721302.0, + "step": 808 + }, + { + "epoch": 0.10291311537972268, + "ewc_loss": 0.001842400524765253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.842400524765253e-05, + "grad_norm": 2.4354453086853027, + "learning_rate": 3.4251801610852057e-07, + "loss": 0.5508, + "mean_token_accuracy": 0.8203064203262329, + "num_tokens": 30761975.0, + "step": 809 + }, + { + "epoch": 0.1030403256583132, + "ewc_loss": 0.0018591246334835887, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.8591246771393344e-05, + "grad_norm": 2.484596014022827, + "learning_rate": 3.429419245442984e-07, + "loss": 0.5017, + "mean_token_accuracy": 0.835523247718811, + "num_tokens": 30797754.0, + "step": 810 + }, + { + "epoch": 0.1031675359369037, + "ewc_loss": 0.0018683549715206027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8683549569686875e-05, + "grad_norm": 2.3799548149108887, + "learning_rate": 3.4336583298007626e-07, + "loss": 0.4781, + "mean_token_accuracy": 0.8459485769271851, + "num_tokens": 30834183.0, + "step": 811 + }, + { + "epoch": 0.10329474621549421, + "ewc_loss": 0.0018480177968740463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.848017745942343e-05, + "grad_norm": 2.310314893722534, + "learning_rate": 3.4378974141585416e-07, + "loss": 0.5055, + "mean_token_accuracy": 0.837088942527771, + "num_tokens": 30874160.0, + "step": 812 + }, + { + "epoch": 0.10342195649408473, + "ewc_loss": 0.0018382875714451075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8382876078248955e-05, + "grad_norm": 2.2643542289733887, + "learning_rate": 3.4421364985163206e-07, + "loss": 0.4448, + "mean_token_accuracy": 0.8537077903747559, + "num_tokens": 30916827.0, + "step": 813 + }, + { + "epoch": 0.10354916677267523, + "ewc_loss": 0.0018448284827172756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.844828511821106e-05, + "grad_norm": 2.327535390853882, + "learning_rate": 3.446375582874099e-07, + "loss": 0.5138, + "mean_token_accuracy": 0.8352557420730591, + "num_tokens": 30960063.0, + "step": 814 + }, + { + "epoch": 0.10367637705126574, + "ewc_loss": 0.0018636357272043824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.863635770860128e-05, + "grad_norm": 2.4300782680511475, + "learning_rate": 3.4506146672318775e-07, + "loss": 0.4626, + "mean_token_accuracy": 0.8465916514396667, + "num_tokens": 30995172.0, + "step": 815 + }, + { + "epoch": 0.10380358732985626, + "ewc_loss": 0.001882387325167656, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.8823873688234016e-05, + "grad_norm": 2.3936285972595215, + "learning_rate": 3.4548537515896565e-07, + "loss": 0.5189, + "mean_token_accuracy": 0.8327468037605286, + "num_tokens": 31035089.0, + "step": 816 + }, + { + "epoch": 0.10393079760844676, + "ewc_loss": 0.0018714881734922528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8714881662162952e-05, + "grad_norm": 2.3402137756347656, + "learning_rate": 3.4590928359474355e-07, + "loss": 0.5045, + "mean_token_accuracy": 0.8353238701820374, + "num_tokens": 31074836.0, + "step": 817 + }, + { + "epoch": 0.10405800788703727, + "ewc_loss": 0.001855847891420126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.855847949627787e-05, + "grad_norm": 2.332257032394409, + "learning_rate": 3.463331920305214e-07, + "loss": 0.5229, + "mean_token_accuracy": 0.8317009806632996, + "num_tokens": 31114700.0, + "step": 818 + }, + { + "epoch": 0.10418521816562779, + "ewc_loss": 0.0018597169546410441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.859716940089129e-05, + "grad_norm": 2.496347427368164, + "learning_rate": 3.4675710046629924e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.846050500869751, + "num_tokens": 31148258.0, + "step": 819 + }, + { + "epoch": 0.1043124284442183, + "ewc_loss": 0.0018948456272482872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8948456272482872e-05, + "grad_norm": 2.3895103931427, + "learning_rate": 3.4718100890207714e-07, + "loss": 0.5096, + "mean_token_accuracy": 0.8370671272277832, + "num_tokens": 31185720.0, + "step": 820 + }, + { + "epoch": 0.1044396387228088, + "ewc_loss": 0.0018700686050578952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.870068626885768e-05, + "grad_norm": 2.471949338912964, + "learning_rate": 3.4760491733785504e-07, + "loss": 0.5194, + "mean_token_accuracy": 0.8292123079299927, + "num_tokens": 31220353.0, + "step": 821 + }, + { + "epoch": 0.10456684900139931, + "ewc_loss": 0.0018862835131585598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8862834622268565e-05, + "grad_norm": 2.410125255584717, + "learning_rate": 3.480288257736329e-07, + "loss": 0.4975, + "mean_token_accuracy": 0.835299551486969, + "num_tokens": 31256477.0, + "step": 822 + }, + { + "epoch": 0.10469405927998983, + "ewc_loss": 0.001879445742815733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8794456991599873e-05, + "grad_norm": 2.4005825519561768, + "learning_rate": 3.4845273420941073e-07, + "loss": 0.4972, + "mean_token_accuracy": 0.840078592300415, + "num_tokens": 31294365.0, + "step": 823 + }, + { + "epoch": 0.10482126955858033, + "ewc_loss": 0.0018796323565766215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.879632327472791e-05, + "grad_norm": 2.609257221221924, + "learning_rate": 3.4887664264518863e-07, + "loss": 0.4942, + "mean_token_accuracy": 0.8364539742469788, + "num_tokens": 31329357.0, + "step": 824 + }, + { + "epoch": 0.10494847983717084, + "ewc_loss": 0.0019218443194404244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9218443412682973e-05, + "grad_norm": 2.3092758655548096, + "learning_rate": 3.4930055108096653e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.8386489152908325, + "num_tokens": 31372180.0, + "step": 825 + }, + { + "epoch": 0.10507569011576136, + "ewc_loss": 0.0018576018046587706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8576018192106858e-05, + "grad_norm": 2.3390772342681885, + "learning_rate": 3.497244595167443e-07, + "loss": 0.4803, + "mean_token_accuracy": 0.8429181575775146, + "num_tokens": 31414866.0, + "step": 826 + }, + { + "epoch": 0.10520290039435186, + "ewc_loss": 0.001867234124802053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8672340956982225e-05, + "grad_norm": 2.5455305576324463, + "learning_rate": 3.501483679525222e-07, + "loss": 0.5476, + "mean_token_accuracy": 0.8224650621414185, + "num_tokens": 31450583.0, + "step": 827 + }, + { + "epoch": 0.10533011067294237, + "ewc_loss": 0.0019231863552704453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9231863916502334e-05, + "grad_norm": 2.3786144256591797, + "learning_rate": 3.505722763883001e-07, + "loss": 0.4803, + "mean_token_accuracy": 0.842248797416687, + "num_tokens": 31490371.0, + "step": 828 + }, + { + "epoch": 0.10545732095153289, + "ewc_loss": 0.0018873733934015036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8873734006774612e-05, + "grad_norm": 2.4633703231811523, + "learning_rate": 3.50996184824078e-07, + "loss": 0.5208, + "mean_token_accuracy": 0.8306963443756104, + "num_tokens": 31525951.0, + "step": 829 + }, + { + "epoch": 0.10558453123012339, + "ewc_loss": 0.0019001747714355588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9001747205038555e-05, + "grad_norm": 2.3785688877105713, + "learning_rate": 3.514200932598558e-07, + "loss": 0.5615, + "mean_token_accuracy": 0.8204419016838074, + "num_tokens": 31570109.0, + "step": 830 + }, + { + "epoch": 0.1057117415087139, + "ewc_loss": 0.0018890048377215862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8890048522735015e-05, + "grad_norm": 2.4200587272644043, + "learning_rate": 3.518440016956337e-07, + "loss": 0.5046, + "mean_token_accuracy": 0.8362576961517334, + "num_tokens": 31610569.0, + "step": 831 + }, + { + "epoch": 0.10583895178730442, + "ewc_loss": 0.0018990307580679655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8990307580679655e-05, + "grad_norm": 2.4342942237854004, + "learning_rate": 3.522679101314116e-07, + "loss": 0.5161, + "mean_token_accuracy": 0.83609938621521, + "num_tokens": 31653585.0, + "step": 832 + }, + { + "epoch": 0.10596616206589493, + "ewc_loss": 0.0019017887534573674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9017887098016217e-05, + "grad_norm": 2.3642356395721436, + "learning_rate": 3.526918185671895e-07, + "loss": 0.5259, + "mean_token_accuracy": 0.8307992219924927, + "num_tokens": 31696353.0, + "step": 833 + }, + { + "epoch": 0.10609337234448543, + "ewc_loss": 0.0018930697115138173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8930697478936054e-05, + "grad_norm": 2.4457790851593018, + "learning_rate": 3.531157270029673e-07, + "loss": 0.4577, + "mean_token_accuracy": 0.8515555262565613, + "num_tokens": 31730620.0, + "step": 834 + }, + { + "epoch": 0.10622058262307595, + "ewc_loss": 0.0019114183960482478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9114184397039935e-05, + "grad_norm": 2.4153149127960205, + "learning_rate": 3.535396354387452e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.8450859785079956, + "num_tokens": 31770538.0, + "step": 835 + }, + { + "epoch": 0.10634779290166646, + "ewc_loss": 0.0019102111691609025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.91021117643686e-05, + "grad_norm": 2.461975336074829, + "learning_rate": 3.539635438745231e-07, + "loss": 0.5154, + "mean_token_accuracy": 0.8332167863845825, + "num_tokens": 31808687.0, + "step": 836 + }, + { + "epoch": 0.10647500318025696, + "ewc_loss": 0.0019162046955898404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9162047465215437e-05, + "grad_norm": 2.2957088947296143, + "learning_rate": 3.54387452310301e-07, + "loss": 0.4947, + "mean_token_accuracy": 0.8388590812683105, + "num_tokens": 31852310.0, + "step": 837 + }, + { + "epoch": 0.10660221345884748, + "ewc_loss": 0.0018823823193088174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8823822756530717e-05, + "grad_norm": 2.3625848293304443, + "learning_rate": 3.548113607460788e-07, + "loss": 0.4636, + "mean_token_accuracy": 0.8518427610397339, + "num_tokens": 31887897.0, + "step": 838 + }, + { + "epoch": 0.10672942373743799, + "ewc_loss": 0.0019058947218582034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9058947145822458e-05, + "grad_norm": 2.42388916015625, + "learning_rate": 3.552352691818567e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.8434605598449707, + "num_tokens": 31926159.0, + "step": 839 + }, + { + "epoch": 0.10685663401602849, + "ewc_loss": 0.001926072989590466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9260729459347203e-05, + "grad_norm": 2.3487651348114014, + "learning_rate": 3.556591776176346e-07, + "loss": 0.4638, + "mean_token_accuracy": 0.8492381572723389, + "num_tokens": 31968432.0, + "step": 840 + }, + { + "epoch": 0.106983844294619, + "ewc_loss": 0.001904845703393221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.904845703393221e-05, + "grad_norm": 2.572309970855713, + "learning_rate": 3.560830860534125e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8423662781715393, + "num_tokens": 32000949.0, + "step": 841 + }, + { + "epoch": 0.10711105457320952, + "ewc_loss": 0.0019526489777490497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9526489268173464e-05, + "grad_norm": 2.57405161857605, + "learning_rate": 3.565069944891903e-07, + "loss": 0.5365, + "mean_token_accuracy": 0.8267910480499268, + "num_tokens": 32033862.0, + "step": 842 + }, + { + "epoch": 0.10723826485180002, + "ewc_loss": 0.0019507101969793439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9507102479110472e-05, + "grad_norm": 2.451798677444458, + "learning_rate": 3.569309029249682e-07, + "loss": 0.5008, + "mean_token_accuracy": 0.8390178084373474, + "num_tokens": 32068623.0, + "step": 843 + }, + { + "epoch": 0.10736547513039053, + "ewc_loss": 0.0019214465282857418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9214465282857418e-05, + "grad_norm": 2.414950132369995, + "learning_rate": 3.573548113607461e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.8434222340583801, + "num_tokens": 32106658.0, + "step": 844 + }, + { + "epoch": 0.10749268540898105, + "ewc_loss": 0.0019190680468454957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9190680177416652e-05, + "grad_norm": 2.434199333190918, + "learning_rate": 3.577787197965239e-07, + "loss": 0.5272, + "mean_token_accuracy": 0.8285861015319824, + "num_tokens": 32142031.0, + "step": 845 + }, + { + "epoch": 0.10761989568757156, + "ewc_loss": 0.0019328749040141702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9328748749103397e-05, 
+ "grad_norm": 2.306628465652466, + "learning_rate": 3.5820262823230177e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8534004092216492, + "num_tokens": 32182686.0, + "step": 846 + }, + { + "epoch": 0.10774710596616206, + "ewc_loss": 0.0019097300246357918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9097300537396222e-05, + "grad_norm": 2.4999566078186035, + "learning_rate": 3.5862653666807967e-07, + "loss": 0.5556, + "mean_token_accuracy": 0.8199472427368164, + "num_tokens": 32218027.0, + "step": 847 + }, + { + "epoch": 0.10787431624475258, + "ewc_loss": 0.001956154825165868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9561548469937406e-05, + "grad_norm": 2.4393908977508545, + "learning_rate": 3.5905044510385757e-07, + "loss": 0.5131, + "mean_token_accuracy": 0.8377773761749268, + "num_tokens": 32257854.0, + "step": 848 + }, + { + "epoch": 0.10800152652334309, + "ewc_loss": 0.0019470348488539457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9470347979222424e-05, + "grad_norm": 2.4216012954711914, + "learning_rate": 3.594743535396354e-07, + "loss": 0.4961, + "mean_token_accuracy": 0.8402963876724243, + "num_tokens": 32295019.0, + "step": 849 + }, + { + "epoch": 0.10812873680193359, + "ewc_loss": 0.001940746558830142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9407465515541844e-05, + "grad_norm": 2.424860715866089, + "learning_rate": 3.5989826197541326e-07, + "loss": 0.5441, + "mean_token_accuracy": 0.8243470191955566, + "num_tokens": 32335652.0, + "step": 850 + }, + { + "epoch": 0.1082559470805241, + "ewc_loss": 0.0019438299350440502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.943829920492135e-05, + "grad_norm": 2.3673362731933594, + "learning_rate": 3.6032217041119116e-07, + "loss": 0.4932, + "mean_token_accuracy": 0.836479663848877, + "num_tokens": 32377249.0, + "step": 851 + }, + { + "epoch": 0.10838315735911462, + "ewc_loss": 0.0019376208074390888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9376208001631312e-05, + "grad_norm": 
2.2791268825531006, + "learning_rate": 3.6074607884696906e-07, + "loss": 0.4633, + "mean_token_accuracy": 0.851226270198822, + "num_tokens": 32422467.0, + "step": 852 + }, + { + "epoch": 0.10851036763770512, + "ewc_loss": 0.0019246318843215704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9246319425292313e-05, + "grad_norm": 2.3334057331085205, + "learning_rate": 3.611699872827469e-07, + "loss": 0.4519, + "mean_token_accuracy": 0.8505260348320007, + "num_tokens": 32462120.0, + "step": 853 + }, + { + "epoch": 0.10863757791629564, + "ewc_loss": 0.0019422202603891492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9422202967689373e-05, + "grad_norm": 2.544067144393921, + "learning_rate": 3.6159389571852475e-07, + "loss": 0.5714, + "mean_token_accuracy": 0.818391740322113, + "num_tokens": 32498484.0, + "step": 854 + }, + { + "epoch": 0.10876478819488615, + "ewc_loss": 0.0019878423772752285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.987842369999271e-05, + "grad_norm": 2.366330146789551, + "learning_rate": 3.6201780415430265e-07, + "loss": 0.5108, + "mean_token_accuracy": 0.8348757028579712, + "num_tokens": 32541536.0, + "step": 855 + }, + { + "epoch": 0.10889199847347665, + "ewc_loss": 0.0019434537971392274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9434537534834817e-05, + "grad_norm": 2.4691390991210938, + "learning_rate": 3.6244171259008055e-07, + "loss": 0.5303, + "mean_token_accuracy": 0.8325847387313843, + "num_tokens": 32579750.0, + "step": 856 + }, + { + "epoch": 0.10901920875206716, + "ewc_loss": 0.001966123003512621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.966122908925172e-05, + "grad_norm": 2.445034980773926, + "learning_rate": 3.628656210258584e-07, + "loss": 0.5405, + "mean_token_accuracy": 0.8236855268478394, + "num_tokens": 32617518.0, + "step": 857 + }, + { + "epoch": 0.10914641903065768, + "ewc_loss": 0.0019632475450634956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.963247632374987e-05, + "grad_norm": 2.385495662689209, 
+ "learning_rate": 3.6328952946163624e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8436098694801331, + "num_tokens": 32653931.0, + "step": 858 + }, + { + "epoch": 0.1092736293092482, + "ewc_loss": 0.0019428817322477698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9428816813160665e-05, + "grad_norm": 2.3862318992614746, + "learning_rate": 3.6371343789741414e-07, + "loss": 0.4703, + "mean_token_accuracy": 0.8475173711776733, + "num_tokens": 32693797.0, + "step": 859 + }, + { + "epoch": 0.1094008395878387, + "ewc_loss": 0.0019498117035254836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9498116671456955e-05, + "grad_norm": 2.387662410736084, + "learning_rate": 3.6413734633319204e-07, + "loss": 0.5166, + "mean_token_accuracy": 0.8343969583511353, + "num_tokens": 32732432.0, + "step": 860 + }, + { + "epoch": 0.10952804986642921, + "ewc_loss": 0.0019596123602241278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9596123820520006e-05, + "grad_norm": 2.466614007949829, + "learning_rate": 3.645612547689699e-07, + "loss": 0.4714, + "mean_token_accuracy": 0.8417748808860779, + "num_tokens": 32769225.0, + "step": 861 + }, + { + "epoch": 0.10965526014501972, + "ewc_loss": 0.0019793184474110603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9793184037553146e-05, + "grad_norm": 2.355257511138916, + "learning_rate": 3.6498516320474773e-07, + "loss": 0.4644, + "mean_token_accuracy": 0.8492423892021179, + "num_tokens": 32812027.0, + "step": 862 + }, + { + "epoch": 0.10978247042361022, + "ewc_loss": 0.0019498886540532112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9498886103974655e-05, + "grad_norm": 2.37015700340271, + "learning_rate": 3.6540907164052563e-07, + "loss": 0.4436, + "mean_token_accuracy": 0.8558796644210815, + "num_tokens": 32849334.0, + "step": 863 + }, + { + "epoch": 0.10990968070220074, + "ewc_loss": 0.0019590260926634073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9590261217672378e-05, + "grad_norm": 2.4500696659088135, + 
"learning_rate": 3.658329800763035e-07, + "loss": 0.5372, + "mean_token_accuracy": 0.8250060081481934, + "num_tokens": 32888422.0, + "step": 864 + }, + { + "epoch": 0.11003689098079125, + "ewc_loss": 0.001982286339625716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9822864487650804e-05, + "grad_norm": 2.493675470352173, + "learning_rate": 3.662568885120814e-07, + "loss": 0.4966, + "mean_token_accuracy": 0.8379424810409546, + "num_tokens": 32922672.0, + "step": 865 + }, + { + "epoch": 0.11016410125938175, + "ewc_loss": 0.001985620940104127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9856208382407203e-05, + "grad_norm": 2.428060531616211, + "learning_rate": 3.666807969478592e-07, + "loss": 0.5276, + "mean_token_accuracy": 0.8275173306465149, + "num_tokens": 32961694.0, + "step": 866 + }, + { + "epoch": 0.11029131153797227, + "ewc_loss": 0.001969051081687212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.969051118067e-05, + "grad_norm": 2.4360578060150146, + "learning_rate": 3.671047053836371e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.8307305574417114, + "num_tokens": 32998595.0, + "step": 867 + }, + { + "epoch": 0.11041852181656278, + "ewc_loss": 0.001975389663130045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9753897504415363e-05, + "grad_norm": 2.384868860244751, + "learning_rate": 3.6752861381941497e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.8427321910858154, + "num_tokens": 33035884.0, + "step": 868 + }, + { + "epoch": 0.11054573209515328, + "ewc_loss": 0.001968259923160076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9682598576764576e-05, + "grad_norm": 2.4667301177978516, + "learning_rate": 3.6795252225519287e-07, + "loss": 0.5368, + "mean_token_accuracy": 0.8298377990722656, + "num_tokens": 33073335.0, + "step": 869 + }, + { + "epoch": 0.1106729423737438, + "ewc_loss": 0.001990201184526086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.990201235457789e-05, + "grad_norm": 2.3302347660064697, + "learning_rate": 
3.6837643069097077e-07, + "loss": 0.4392, + "mean_token_accuracy": 0.8550549745559692, + "num_tokens": 33111443.0, + "step": 870 + }, + { + "epoch": 0.11080015265233431, + "ewc_loss": 0.001962281996384263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9622819309006445e-05, + "grad_norm": 2.347832202911377, + "learning_rate": 3.688003391267486e-07, + "loss": 0.4977, + "mean_token_accuracy": 0.8410900235176086, + "num_tokens": 33156362.0, + "step": 871 + }, + { + "epoch": 0.11092736293092482, + "ewc_loss": 0.0019755400717258453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9755401808652095e-05, + "grad_norm": 2.2856242656707764, + "learning_rate": 3.6922424756252646e-07, + "loss": 0.4676, + "mean_token_accuracy": 0.8472617268562317, + "num_tokens": 33204968.0, + "step": 872 + }, + { + "epoch": 0.11105457320951533, + "ewc_loss": 0.0019666277803480625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.966627678484656e-05, + "grad_norm": 2.486659526824951, + "learning_rate": 3.6964815599830436e-07, + "loss": 0.5124, + "mean_token_accuracy": 0.8320976495742798, + "num_tokens": 33239265.0, + "step": 873 + }, + { + "epoch": 0.11118178348810584, + "ewc_loss": 0.0020116660743951797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0116660380153917e-05, + "grad_norm": 2.501784563064575, + "learning_rate": 3.7007206443408226e-07, + "loss": 0.4486, + "mean_token_accuracy": 0.8534297943115234, + "num_tokens": 33275897.0, + "step": 874 + }, + { + "epoch": 0.11130899376669635, + "ewc_loss": 0.0020069542806595564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.006954309763387e-05, + "grad_norm": 2.4580390453338623, + "learning_rate": 3.704959728698601e-07, + "loss": 0.5678, + "mean_token_accuracy": 0.8171393871307373, + "num_tokens": 33313966.0, + "step": 875 + }, + { + "epoch": 0.11143620404528685, + "ewc_loss": 0.001990001881495118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9900018742191605e-05, + "grad_norm": 2.4351816177368164, + "learning_rate": 
3.7091988130563795e-07, + "loss": 0.516, + "mean_token_accuracy": 0.8344134092330933, + "num_tokens": 33352909.0, + "step": 876 + }, + { + "epoch": 0.11156341432387737, + "ewc_loss": 0.0019920975901186466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9920975319109857e-05, + "grad_norm": 2.304746627807617, + "learning_rate": 3.7134378974141585e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.840337872505188, + "num_tokens": 33400120.0, + "step": 877 + }, + { + "epoch": 0.11169062460246788, + "ewc_loss": 0.001968985889106989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.968985998246353e-05, + "grad_norm": 2.633556604385376, + "learning_rate": 3.7176769817719375e-07, + "loss": 0.5233, + "mean_token_accuracy": 0.8314266204833984, + "num_tokens": 33436703.0, + "step": 878 + }, + { + "epoch": 0.11181783488105838, + "ewc_loss": 0.002048350404947996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0483505068114027e-05, + "grad_norm": 2.4599709510803223, + "learning_rate": 3.7219160661297154e-07, + "loss": 0.4346, + "mean_token_accuracy": 0.858770489692688, + "num_tokens": 33473515.0, + "step": 879 + }, + { + "epoch": 0.1119450451596489, + "ewc_loss": 0.002002553315833211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0025532649015076e-05, + "grad_norm": 2.420212507247925, + "learning_rate": 3.7261551504874944e-07, + "loss": 0.4953, + "mean_token_accuracy": 0.8420894145965576, + "num_tokens": 33507659.0, + "step": 880 + }, + { + "epoch": 0.11207225543823941, + "ewc_loss": 0.0019928724505007267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9928724213968962e-05, + "grad_norm": 2.394345998764038, + "learning_rate": 3.7303942348452734e-07, + "loss": 0.4419, + "mean_token_accuracy": 0.853304922580719, + "num_tokens": 33542430.0, + "step": 881 + }, + { + "epoch": 0.11219946571682991, + "ewc_loss": 0.0020049046725034714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0049046725034714e-05, + "grad_norm": 2.3458478450775146, + "learning_rate": 
3.7346333192030524e-07, + "loss": 0.4511, + "mean_token_accuracy": 0.8554735779762268, + "num_tokens": 33582579.0, + "step": 882 + }, + { + "epoch": 0.11232667599542043, + "ewc_loss": 0.001993610290810466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9936103853979148e-05, + "grad_norm": 2.3061411380767822, + "learning_rate": 3.7388724035608303e-07, + "loss": 0.4746, + "mean_token_accuracy": 0.8485488891601562, + "num_tokens": 33625928.0, + "step": 883 + }, + { + "epoch": 0.11245388627401094, + "ewc_loss": 0.0019917527679353952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9917528334190138e-05, + "grad_norm": 2.4665465354919434, + "learning_rate": 3.7431114879186093e-07, + "loss": 0.5139, + "mean_token_accuracy": 0.834652841091156, + "num_tokens": 33661325.0, + "step": 884 + }, + { + "epoch": 0.11258109655260146, + "ewc_loss": 0.0020362739451229572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.036273872363381e-05, + "grad_norm": 2.4244604110717773, + "learning_rate": 3.7473505722763883e-07, + "loss": 0.5354, + "mean_token_accuracy": 0.8274401426315308, + "num_tokens": 33696472.0, + "step": 885 + }, + { + "epoch": 0.11270830683119196, + "ewc_loss": 0.0020234892144799232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0234892872394994e-05, + "grad_norm": 2.435986280441284, + "learning_rate": 3.7515896566341673e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8387109637260437, + "num_tokens": 33732637.0, + "step": 886 + }, + { + "epoch": 0.11283551710978247, + "ewc_loss": 0.0020223648753017187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0223647879902273e-05, + "grad_norm": 2.3710014820098877, + "learning_rate": 3.755828740991945e-07, + "loss": 0.514, + "mean_token_accuracy": 0.8327723741531372, + "num_tokens": 33775106.0, + "step": 887 + }, + { + "epoch": 0.11296272738837299, + "ewc_loss": 0.0020132153294980526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0132152712903917e-05, + "grad_norm": 2.4580929279327393, + "learning_rate": 
3.760067825349724e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.8465299606323242, + "num_tokens": 33811297.0, + "step": 888 + }, + { + "epoch": 0.11308993766696349, + "ewc_loss": 0.002040059072896838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0400590074132197e-05, + "grad_norm": 2.2882163524627686, + "learning_rate": 3.764306909707503e-07, + "loss": 0.44, + "mean_token_accuracy": 0.8558939695358276, + "num_tokens": 33854038.0, + "step": 889 + }, + { + "epoch": 0.113217147945554, + "ewc_loss": 0.002000287175178528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0002871679025702e-05, + "grad_norm": 2.5265111923217773, + "learning_rate": 3.768545994065282e-07, + "loss": 0.495, + "mean_token_accuracy": 0.8386679887771606, + "num_tokens": 33884929.0, + "step": 890 + }, + { + "epoch": 0.11334435822414451, + "ewc_loss": 0.0020631691440939903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0631690858863294e-05, + "grad_norm": 2.415353536605835, + "learning_rate": 3.77278507842306e-07, + "loss": 0.504, + "mean_token_accuracy": 0.8374913930892944, + "num_tokens": 33926065.0, + "step": 891 + }, + { + "epoch": 0.11347156850273502, + "ewc_loss": 0.0020386280957609415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.03862800844945e-05, + "grad_norm": 2.3541643619537354, + "learning_rate": 3.777024162780839e-07, + "loss": 0.4569, + "mean_token_accuracy": 0.8484271764755249, + "num_tokens": 33965557.0, + "step": 892 + }, + { + "epoch": 0.11359877878132553, + "ewc_loss": 0.0020201147999614477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.020114698098041e-05, + "grad_norm": 2.4484035968780518, + "learning_rate": 3.781263247138618e-07, + "loss": 0.4637, + "mean_token_accuracy": 0.8481060862541199, + "num_tokens": 34004132.0, + "step": 893 + }, + { + "epoch": 0.11372598905991604, + "ewc_loss": 0.00204584002494812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.045840119535569e-05, + "grad_norm": 2.450775384902954, + "learning_rate": 3.785502331496397e-07, + 
"loss": 0.4907, + "mean_token_accuracy": 0.8342111706733704, + "num_tokens": 34039781.0, + "step": 894 + }, + { + "epoch": 0.11385319933850654, + "ewc_loss": 0.0020511432085186243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0511432012426667e-05, + "grad_norm": 2.401134967803955, + "learning_rate": 3.789741415854175e-07, + "loss": 0.5144, + "mean_token_accuracy": 0.8323661684989929, + "num_tokens": 34080487.0, + "step": 895 + }, + { + "epoch": 0.11398040961709706, + "ewc_loss": 0.00203283061273396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.032830707321409e-05, + "grad_norm": 2.4252424240112305, + "learning_rate": 3.793980500211954e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8397732973098755, + "num_tokens": 34119913.0, + "step": 896 + }, + { + "epoch": 0.11410761989568757, + "ewc_loss": 0.0020444546826183796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0444545953068882e-05, + "grad_norm": 2.377324342727661, + "learning_rate": 3.798219584569733e-07, + "loss": 0.4282, + "mean_token_accuracy": 0.8597906827926636, + "num_tokens": 34155811.0, + "step": 897 + }, + { + "epoch": 0.11423483017427809, + "ewc_loss": 0.002033807570114732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0338075046311133e-05, + "grad_norm": 2.520718812942505, + "learning_rate": 3.8024586689275115e-07, + "loss": 0.5484, + "mean_token_accuracy": 0.8273761868476868, + "num_tokens": 34195022.0, + "step": 898 + }, + { + "epoch": 0.11436204045286859, + "ewc_loss": 0.002071651164442301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.071651215374004e-05, + "grad_norm": 2.4179346561431885, + "learning_rate": 3.80669775328529e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.8415204882621765, + "num_tokens": 34229182.0, + "step": 899 + }, + { + "epoch": 0.1144892507314591, + "ewc_loss": 0.002052212366834283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.052212403214071e-05, + "grad_norm": 2.3970439434051514, + "learning_rate": 3.810936837643069e-07, + "loss": 0.4462, + 
"mean_token_accuracy": 0.8527262806892395, + "num_tokens": 34266931.0, + "step": 900 + }, + { + "epoch": 0.11461646101004962, + "ewc_loss": 0.0020502626430243254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.05026262847241e-05, + "grad_norm": 2.441509962081909, + "learning_rate": 3.815175922000848e-07, + "loss": 0.4364, + "mean_token_accuracy": 0.8548998832702637, + "num_tokens": 34301705.0, + "step": 901 + }, + { + "epoch": 0.11474367128864012, + "ewc_loss": 0.002063768683001399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.063768624793738e-05, + "grad_norm": 2.3835902214050293, + "learning_rate": 3.8194150063586264e-07, + "loss": 0.5177, + "mean_token_accuracy": 0.8355937004089355, + "num_tokens": 34340975.0, + "step": 902 + }, + { + "epoch": 0.11487088156723063, + "ewc_loss": 0.0020558908581733704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0558909454848617e-05, + "grad_norm": 2.4360082149505615, + "learning_rate": 3.823654090716405e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.8387942314147949, + "num_tokens": 34380024.0, + "step": 903 + }, + { + "epoch": 0.11499809184582115, + "ewc_loss": 0.0020668364595621824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0668363504228182e-05, + "grad_norm": 2.469398021697998, + "learning_rate": 3.827893175074184e-07, + "loss": 0.5095, + "mean_token_accuracy": 0.8415710926055908, + "num_tokens": 34419654.0, + "step": 904 + }, + { + "epoch": 0.11512530212441165, + "ewc_loss": 0.002077023033052683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0770230548805557e-05, + "grad_norm": 2.457233190536499, + "learning_rate": 3.832132259431963e-07, + "loss": 0.4398, + "mean_token_accuracy": 0.8572743535041809, + "num_tokens": 34453034.0, + "step": 905 + }, + { + "epoch": 0.11525251240300216, + "ewc_loss": 0.002074634423479438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0746343579958193e-05, + "grad_norm": 2.4479899406433105, + "learning_rate": 3.8363713437897413e-07, + "loss": 0.5429, + 
"mean_token_accuracy": 0.8231961727142334, + "num_tokens": 34493694.0, + "step": 906 + }, + { + "epoch": 0.11537972268159268, + "ewc_loss": 0.002068249275907874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0682493413914926e-05, + "grad_norm": 2.4927546977996826, + "learning_rate": 3.8406104281475197e-07, + "loss": 0.5347, + "mean_token_accuracy": 0.8287754058837891, + "num_tokens": 34531636.0, + "step": 907 + }, + { + "epoch": 0.11550693296018319, + "ewc_loss": 0.0020781964994966984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.078196484944783e-05, + "grad_norm": 2.4448673725128174, + "learning_rate": 3.8448495125052987e-07, + "loss": 0.5202, + "mean_token_accuracy": 0.8290905952453613, + "num_tokens": 34570851.0, + "step": 908 + }, + { + "epoch": 0.11563414323877369, + "ewc_loss": 0.00207223417237401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0722342014778405e-05, + "grad_norm": 2.33798885345459, + "learning_rate": 3.8490885968630777e-07, + "loss": 0.4725, + "mean_token_accuracy": 0.8463674783706665, + "num_tokens": 34613314.0, + "step": 909 + }, + { + "epoch": 0.1157613535173642, + "ewc_loss": 0.002054572571069002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0545725419651717e-05, + "grad_norm": 2.3042349815368652, + "learning_rate": 3.853327681220856e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.8392295837402344, + "num_tokens": 34656701.0, + "step": 910 + }, + { + "epoch": 0.11588856379595472, + "ewc_loss": 0.0020562715362757444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0562714780680835e-05, + "grad_norm": 2.561964273452759, + "learning_rate": 3.8575667655786346e-07, + "loss": 0.4627, + "mean_token_accuracy": 0.8505301475524902, + "num_tokens": 34688595.0, + "step": 911 + }, + { + "epoch": 0.11601577407454522, + "ewc_loss": 0.002123404759913683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.12340473808581e-05, + "grad_norm": 2.3955185413360596, + "learning_rate": 3.8618058499364136e-07, + "loss": 0.4817, + 
"mean_token_accuracy": 0.8445094227790833, + "num_tokens": 34726609.0, + "step": 912 + }, + { + "epoch": 0.11614298435313573, + "ewc_loss": 0.0020804796367883682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.080479680444114e-05, + "grad_norm": 2.398106098175049, + "learning_rate": 3.8660449342941926e-07, + "loss": 0.5365, + "mean_token_accuracy": 0.8304030895233154, + "num_tokens": 34769213.0, + "step": 913 + }, + { + "epoch": 0.11627019463172625, + "ewc_loss": 0.002075338736176491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0753388525918126e-05, + "grad_norm": 2.4494845867156982, + "learning_rate": 3.870284018651971e-07, + "loss": 0.5111, + "mean_token_accuracy": 0.8353691101074219, + "num_tokens": 34806722.0, + "step": 914 + }, + { + "epoch": 0.11639740491031675, + "ewc_loss": 0.0020938897505402565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0938896341249347e-05, + "grad_norm": 2.3672142028808594, + "learning_rate": 3.8745231030097495e-07, + "loss": 0.4664, + "mean_token_accuracy": 0.8500280380249023, + "num_tokens": 34846408.0, + "step": 915 + }, + { + "epoch": 0.11652461518890726, + "ewc_loss": 0.0020800624042749405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0800624042749405e-05, + "grad_norm": 2.4255712032318115, + "learning_rate": 3.8787621873675285e-07, + "loss": 0.455, + "mean_token_accuracy": 0.8503482937812805, + "num_tokens": 34883791.0, + "step": 916 + }, + { + "epoch": 0.11665182546749778, + "ewc_loss": 0.0020957374945282936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.095737545459997e-05, + "grad_norm": 2.3785362243652344, + "learning_rate": 3.883001271725307e-07, + "loss": 0.4507, + "mean_token_accuracy": 0.8526290059089661, + "num_tokens": 34922768.0, + "step": 917 + }, + { + "epoch": 0.11677903574608828, + "ewc_loss": 0.0020823508966714144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0823508748435415e-05, + "grad_norm": 2.3429911136627197, + "learning_rate": 3.887240356083086e-07, + "loss": 0.4296, + 
"mean_token_accuracy": 0.8582260012626648, + "num_tokens": 34962184.0, + "step": 918 + }, + { + "epoch": 0.11690624602467879, + "ewc_loss": 0.002083785133436322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0837851479882374e-05, + "grad_norm": 2.6646578311920166, + "learning_rate": 3.8914794404408644e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.8447636365890503, + "num_tokens": 34991455.0, + "step": 919 + }, + { + "epoch": 0.1170334563032693, + "ewc_loss": 0.0021696637850254774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1696638214052655e-05, + "grad_norm": 2.5620930194854736, + "learning_rate": 3.8957185247986434e-07, + "loss": 0.5035, + "mean_token_accuracy": 0.8358318209648132, + "num_tokens": 35025044.0, + "step": 920 + }, + { + "epoch": 0.11716066658185982, + "ewc_loss": 0.0021308972500264645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.130897337337956e-05, + "grad_norm": 2.44111704826355, + "learning_rate": 3.899957609156422e-07, + "loss": 0.4761, + "mean_token_accuracy": 0.8433846235275269, + "num_tokens": 35064564.0, + "step": 921 + }, + { + "epoch": 0.11728787686045032, + "ewc_loss": 0.002094195457175374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0941954062436707e-05, + "grad_norm": 2.4421725273132324, + "learning_rate": 3.904196693514201e-07, + "loss": 0.4935, + "mean_token_accuracy": 0.8406290411949158, + "num_tokens": 35109786.0, + "step": 922 + }, + { + "epoch": 0.11741508713904084, + "ewc_loss": 0.0021054644603282213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.105464409396518e-05, + "grad_norm": 2.490520477294922, + "learning_rate": 3.9084357778719793e-07, + "loss": 0.5564, + "mean_token_accuracy": 0.8204320669174194, + "num_tokens": 35148351.0, + "step": 923 + }, + { + "epoch": 0.11754229741763135, + "ewc_loss": 0.0021202287171036005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.120228782587219e-05, + "grad_norm": 2.4857988357543945, + "learning_rate": 3.9126748622297583e-07, + "loss": 0.5191, + 
"mean_token_accuracy": 0.8328055143356323, + "num_tokens": 35183464.0, + "step": 924 + }, + { + "epoch": 0.11766950769622185, + "ewc_loss": 0.0021206634119153023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.120663339155726e-05, + "grad_norm": 2.486365795135498, + "learning_rate": 3.916913946587537e-07, + "loss": 0.5027, + "mean_token_accuracy": 0.8353384137153625, + "num_tokens": 35218561.0, + "step": 925 + }, + { + "epoch": 0.11779671797481236, + "ewc_loss": 0.0021188105456531048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1188105165492743e-05, + "grad_norm": 2.4141037464141846, + "learning_rate": 3.921153030945316e-07, + "loss": 0.5176, + "mean_token_accuracy": 0.8316712379455566, + "num_tokens": 35257740.0, + "step": 926 + }, + { + "epoch": 0.11792392825340288, + "ewc_loss": 0.002101178979501128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.101178870361764e-05, + "grad_norm": 2.52810001373291, + "learning_rate": 3.925392115303094e-07, + "loss": 0.4753, + "mean_token_accuracy": 0.8452373743057251, + "num_tokens": 35291632.0, + "step": 927 + }, + { + "epoch": 0.11805113853199338, + "ewc_loss": 0.002138796029612422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.138795935024973e-05, + "grad_norm": 2.486727476119995, + "learning_rate": 3.929631199660873e-07, + "loss": 0.4784, + "mean_token_accuracy": 0.8403168320655823, + "num_tokens": 35327686.0, + "step": 928 + }, + { + "epoch": 0.1181783488105839, + "ewc_loss": 0.002127979649230838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1279796783346683e-05, + "grad_norm": 2.438098192214966, + "learning_rate": 3.9338702840186517e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.8416279554367065, + "num_tokens": 35364480.0, + "step": 929 + }, + { + "epoch": 0.11830555908917441, + "ewc_loss": 0.002120315097272396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1203151845838875e-05, + "grad_norm": 2.352391481399536, + "learning_rate": 3.9381093683764307e-07, + "loss": 0.4849, + "mean_token_accuracy": 
0.8416500091552734, + "num_tokens": 35406017.0, + "step": 930 + }, + { + "epoch": 0.11843276936776491, + "ewc_loss": 0.0021075166296213865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1075165932415985e-05, + "grad_norm": 2.3792285919189453, + "learning_rate": 3.942348452734209e-07, + "loss": 0.5425, + "mean_token_accuracy": 0.8239102363586426, + "num_tokens": 35448447.0, + "step": 931 + }, + { + "epoch": 0.11855997964635542, + "ewc_loss": 0.0021242538932710886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1242538423393853e-05, + "grad_norm": 2.4035043716430664, + "learning_rate": 3.946587537091988e-07, + "loss": 0.4921, + "mean_token_accuracy": 0.8386269807815552, + "num_tokens": 35488120.0, + "step": 932 + }, + { + "epoch": 0.11868718992494594, + "ewc_loss": 0.0021369243040680885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1369241949287243e-05, + "grad_norm": 2.29653263092041, + "learning_rate": 3.9508266214497666e-07, + "loss": 0.4684, + "mean_token_accuracy": 0.8468071222305298, + "num_tokens": 35531759.0, + "step": 933 + }, + { + "epoch": 0.11881440020353645, + "ewc_loss": 0.002107109408825636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1071093215141445e-05, + "grad_norm": 2.4048402309417725, + "learning_rate": 3.9550657058075456e-07, + "loss": 0.4518, + "mean_token_accuracy": 0.8535662889480591, + "num_tokens": 35571476.0, + "step": 934 + }, + { + "epoch": 0.11894161048212695, + "ewc_loss": 0.0021379683166742325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1379682948463596e-05, + "grad_norm": 2.488614559173584, + "learning_rate": 3.959304790165324e-07, + "loss": 0.4917, + "mean_token_accuracy": 0.841206431388855, + "num_tokens": 35608440.0, + "step": 935 + }, + { + "epoch": 0.11906882076071747, + "ewc_loss": 0.0021544056944549084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.154405774490442e-05, + "grad_norm": 2.5347981452941895, + "learning_rate": 3.9635438745231025e-07, + "loss": 0.4871, + "mean_token_accuracy": 
0.8425891399383545, + "num_tokens": 35641861.0, + "step": 936 + }, + { + "epoch": 0.11919603103930798, + "ewc_loss": 0.002159159630537033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.159159521397669e-05, + "grad_norm": 2.473780870437622, + "learning_rate": 3.9677829588808815e-07, + "loss": 0.5076, + "mean_token_accuracy": 0.8338720798492432, + "num_tokens": 35684626.0, + "step": 937 + }, + { + "epoch": 0.11932324131789848, + "ewc_loss": 0.00214362726546824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1436271708807908e-05, + "grad_norm": 2.4905357360839844, + "learning_rate": 3.9720220432386605e-07, + "loss": 0.5159, + "mean_token_accuracy": 0.8317468762397766, + "num_tokens": 35718125.0, + "step": 938 + }, + { + "epoch": 0.119450451596489, + "ewc_loss": 0.002149359555914998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1493595340871252e-05, + "grad_norm": 2.4375524520874023, + "learning_rate": 3.976261127596439e-07, + "loss": 0.4584, + "mean_token_accuracy": 0.8526859283447266, + "num_tokens": 35752818.0, + "step": 939 + }, + { + "epoch": 0.11957766187507951, + "ewc_loss": 0.0021419832482934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1419833501568064e-05, + "grad_norm": 2.4378068447113037, + "learning_rate": 3.9805002119542174e-07, + "loss": 0.4397, + "mean_token_accuracy": 0.8565986156463623, + "num_tokens": 35787097.0, + "step": 940 + }, + { + "epoch": 0.11970487215367001, + "ewc_loss": 0.002143476391211152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.143476376659237e-05, + "grad_norm": 2.446382999420166, + "learning_rate": 3.9847392963119964e-07, + "loss": 0.5343, + "mean_token_accuracy": 0.8303049802780151, + "num_tokens": 35825887.0, + "step": 941 + }, + { + "epoch": 0.11983208243226053, + "ewc_loss": 0.0021534967236220837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1534968254854903e-05, + "grad_norm": 2.4126548767089844, + "learning_rate": 3.9889783806697754e-07, + "loss": 0.4659, + "mean_token_accuracy": 0.8496781587600708, + 
"num_tokens": 35863592.0, + "step": 942 + }, + { + "epoch": 0.11995929271085104, + "ewc_loss": 0.0021415534429252148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1415535229607485e-05, + "grad_norm": 2.4637889862060547, + "learning_rate": 3.993217465027554e-07, + "loss": 0.5389, + "mean_token_accuracy": 0.8289439678192139, + "num_tokens": 35904483.0, + "step": 943 + }, + { + "epoch": 0.12008650298944154, + "ewc_loss": 0.002161688869819045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1616888261632994e-05, + "grad_norm": 2.4784762859344482, + "learning_rate": 3.9974565493853323e-07, + "loss": 0.4275, + "mean_token_accuracy": 0.8593243956565857, + "num_tokens": 35938662.0, + "step": 944 + }, + { + "epoch": 0.12021371326803205, + "ewc_loss": 0.0021667873952537775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1667874534614384e-05, + "grad_norm": 2.4453043937683105, + "learning_rate": 4.0016956337431113e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.8400006294250488, + "num_tokens": 35975176.0, + "step": 945 + }, + { + "epoch": 0.12034092354662257, + "ewc_loss": 0.002155330264940858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.155330184905324e-05, + "grad_norm": 2.511260986328125, + "learning_rate": 4.0059347181008903e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.8433718085289001, + "num_tokens": 36008893.0, + "step": 946 + }, + { + "epoch": 0.12046813382521308, + "ewc_loss": 0.002174915047362447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.174915061914362e-05, + "grad_norm": 2.3214352130889893, + "learning_rate": 4.010173802458669e-07, + "loss": 0.4364, + "mean_token_accuracy": 0.8578020334243774, + "num_tokens": 36049882.0, + "step": 947 + }, + { + "epoch": 0.12059534410380358, + "ewc_loss": 0.002133589470759034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1335894416552037e-05, + "grad_norm": 2.436704158782959, + "learning_rate": 4.014412886816447e-07, + "loss": 0.5484, + "mean_token_accuracy": 0.8255745768547058, + "num_tokens": 
36090615.0, + "step": 948 + }, + { + "epoch": 0.1207225543823941, + "ewc_loss": 0.002164134755730629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.164134639315307e-05, + "grad_norm": 2.4296157360076904, + "learning_rate": 4.018651971174226e-07, + "loss": 0.4799, + "mean_token_accuracy": 0.8436608910560608, + "num_tokens": 36128159.0, + "step": 949 + }, + { + "epoch": 0.12084976466098461, + "ewc_loss": 0.002163039520382881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1630394257954322e-05, + "grad_norm": 2.3765602111816406, + "learning_rate": 4.022891055532005e-07, + "loss": 0.495, + "mean_token_accuracy": 0.8395256996154785, + "num_tokens": 36172851.0, + "step": 950 + }, + { + "epoch": 0.12097697493957511, + "ewc_loss": 0.0021482575684785843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.148257590306457e-05, + "grad_norm": 2.4530012607574463, + "learning_rate": 4.0271301398897837e-07, + "loss": 0.5442, + "mean_token_accuracy": 0.8271471261978149, + "num_tokens": 36214282.0, + "step": 951 + }, + { + "epoch": 0.12110418521816563, + "ewc_loss": 0.002177535556256771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1775354980491102e-05, + "grad_norm": 2.5594027042388916, + "learning_rate": 4.031369224247562e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8430753946304321, + "num_tokens": 36248619.0, + "step": 952 + }, + { + "epoch": 0.12123139549675614, + "ewc_loss": 0.0021985103376209736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1985104467603378e-05, + "grad_norm": 2.3910326957702637, + "learning_rate": 4.035608308605341e-07, + "loss": 0.5008, + "mean_token_accuracy": 0.8353315591812134, + "num_tokens": 36289070.0, + "step": 953 + }, + { + "epoch": 0.12135860577534664, + "ewc_loss": 0.0021573335397988558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1573336198343895e-05, + "grad_norm": 2.4297451972961426, + "learning_rate": 4.03984739296312e-07, + "loss": 0.5015, + "mean_token_accuracy": 0.8388605117797852, + "num_tokens": 36330883.0, + "step": 
954 + }, + { + "epoch": 0.12148581605393716, + "ewc_loss": 0.0021759974770247936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1759973606094718e-05, + "grad_norm": 2.3602895736694336, + "learning_rate": 4.044086477320898e-07, + "loss": 0.4824, + "mean_token_accuracy": 0.8442926406860352, + "num_tokens": 36371507.0, + "step": 955 + }, + { + "epoch": 0.12161302633252767, + "ewc_loss": 0.002167050028219819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.16704993363237e-05, + "grad_norm": 2.4902684688568115, + "learning_rate": 4.048325561678677e-07, + "loss": 0.5042, + "mean_token_accuracy": 0.8357130289077759, + "num_tokens": 36407262.0, + "step": 956 + }, + { + "epoch": 0.12174023661111817, + "ewc_loss": 0.002199878916144371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1998788724886253e-05, + "grad_norm": 2.482551336288452, + "learning_rate": 4.052564646036456e-07, + "loss": 0.5017, + "mean_token_accuracy": 0.8367438912391663, + "num_tokens": 36448089.0, + "step": 957 + }, + { + "epoch": 0.12186744688970869, + "ewc_loss": 0.0021978546865284443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1978547010803595e-05, + "grad_norm": 2.5054514408111572, + "learning_rate": 4.056803730394235e-07, + "loss": 0.5137, + "mean_token_accuracy": 0.8317480087280273, + "num_tokens": 36484022.0, + "step": 958 + }, + { + "epoch": 0.1219946571682992, + "ewc_loss": 0.0021954604890197515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1954605472274125e-05, + "grad_norm": 2.484771728515625, + "learning_rate": 4.061042814752013e-07, + "loss": 0.49, + "mean_token_accuracy": 0.842418909072876, + "num_tokens": 36519385.0, + "step": 959 + }, + { + "epoch": 0.12212186744688971, + "ewc_loss": 0.0021911198273301125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1911197109147906e-05, + "grad_norm": 2.669588565826416, + "learning_rate": 4.065281899109792e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.8376482725143433, + "num_tokens": 36551943.0, + "step": 960 + }, + { + "epoch": 
0.12224907772548022, + "ewc_loss": 0.002233864041045308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2338639610097744e-05, + "grad_norm": 2.3618390560150146, + "learning_rate": 4.069520983467571e-07, + "loss": 0.442, + "mean_token_accuracy": 0.8563835620880127, + "num_tokens": 36590927.0, + "step": 961 + }, + { + "epoch": 0.12237628800407073, + "ewc_loss": 0.0021588995587080717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.158899587811902e-05, + "grad_norm": 2.4588866233825684, + "learning_rate": 4.07376006782535e-07, + "loss": 0.471, + "mean_token_accuracy": 0.8442249894142151, + "num_tokens": 36625199.0, + "step": 962 + }, + { + "epoch": 0.12250349828266124, + "ewc_loss": 0.002194029279053211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1940291844657622e-05, + "grad_norm": 2.471787452697754, + "learning_rate": 4.077999152183128e-07, + "loss": 0.4344, + "mean_token_accuracy": 0.8566586971282959, + "num_tokens": 36658981.0, + "step": 963 + }, + { + "epoch": 0.12263070856125174, + "ewc_loss": 0.002203476382419467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2034764697309583e-05, + "grad_norm": 2.3251311779022217, + "learning_rate": 4.082238236540907e-07, + "loss": 0.4961, + "mean_token_accuracy": 0.8419636487960815, + "num_tokens": 36704214.0, + "step": 964 + }, + { + "epoch": 0.12275791883984226, + "ewc_loss": 0.0021633419673889875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.163341923733242e-05, + "grad_norm": 2.424720048904419, + "learning_rate": 4.086477320898686e-07, + "loss": 0.4648, + "mean_token_accuracy": 0.8462968468666077, + "num_tokens": 36744782.0, + "step": 965 + }, + { + "epoch": 0.12288512911843277, + "ewc_loss": 0.0021999834571033716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1999834643793292e-05, + "grad_norm": 2.4325504302978516, + "learning_rate": 4.090716405256465e-07, + "loss": 0.499, + "mean_token_accuracy": 0.8362767696380615, + "num_tokens": 36783645.0, + "step": 966 + }, + { + "epoch": 0.12301233939702327, + 
"ewc_loss": 0.002200559014454484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.200558992626611e-05, + "grad_norm": 2.5081124305725098, + "learning_rate": 4.094955489614243e-07, + "loss": 0.4673, + "mean_token_accuracy": 0.8466288447380066, + "num_tokens": 36817539.0, + "step": 967 + }, + { + "epoch": 0.12313954967561379, + "ewc_loss": 0.0022146429400891066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2146428818814456e-05, + "grad_norm": 2.402482748031616, + "learning_rate": 4.099194573972022e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.8451206684112549, + "num_tokens": 36859144.0, + "step": 968 + }, + { + "epoch": 0.1232667599542043, + "ewc_loss": 0.0021915880497545004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1915881006862037e-05, + "grad_norm": 2.4328432083129883, + "learning_rate": 4.1034336583298007e-07, + "loss": 0.513, + "mean_token_accuracy": 0.8342733383178711, + "num_tokens": 36900833.0, + "step": 969 + }, + { + "epoch": 0.1233939702327948, + "ewc_loss": 0.0022047795355319977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2047795937396586e-05, + "grad_norm": 2.408428430557251, + "learning_rate": 4.1076727426875797e-07, + "loss": 0.501, + "mean_token_accuracy": 0.8363603353500366, + "num_tokens": 36940935.0, + "step": 970 + }, + { + "epoch": 0.12352118051138532, + "ewc_loss": 0.0022002628538757563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.200262861151714e-05, + "grad_norm": 2.514312982559204, + "learning_rate": 4.1119118270453577e-07, + "loss": 0.5622, + "mean_token_accuracy": 0.8168954849243164, + "num_tokens": 36979874.0, + "step": 971 + }, + { + "epoch": 0.12364839078997583, + "ewc_loss": 0.0022292137145996094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2292137145996094e-05, + "grad_norm": 2.4343156814575195, + "learning_rate": 4.1161509114031366e-07, + "loss": 0.4458, + "mean_token_accuracy": 0.8555218577384949, + "num_tokens": 37017917.0, + "step": 972 + }, + { + "epoch": 0.12377560106856635, + "ewc_loss": 
0.002199991373345256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1999912860337645e-05, + "grad_norm": 2.4888839721679688, + "learning_rate": 4.1203899957609156e-07, + "loss": 0.5099, + "mean_token_accuracy": 0.8353219032287598, + "num_tokens": 37054914.0, + "step": 973 + }, + { + "epoch": 0.12390281134715685, + "ewc_loss": 0.0022184166591614485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.218416557298042e-05, + "grad_norm": 2.5759804248809814, + "learning_rate": 4.124629080118694e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.8466588258743286, + "num_tokens": 37088277.0, + "step": 974 + }, + { + "epoch": 0.12403002162574736, + "ewc_loss": 0.0022458077874034643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.245807809231337e-05, + "grad_norm": 2.444711685180664, + "learning_rate": 4.1288681644764726e-07, + "loss": 0.5044, + "mean_token_accuracy": 0.8345789313316345, + "num_tokens": 37126963.0, + "step": 975 + }, + { + "epoch": 0.12415723190433788, + "ewc_loss": 0.0022013739217072725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2013738998793997e-05, + "grad_norm": 2.499150037765503, + "learning_rate": 4.1331072488342515e-07, + "loss": 0.5161, + "mean_token_accuracy": 0.8336814641952515, + "num_tokens": 37161948.0, + "step": 976 + }, + { + "epoch": 0.12428444218292838, + "ewc_loss": 0.0022209680173546076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2209680537343957e-05, + "grad_norm": 2.397738456726074, + "learning_rate": 4.1373463331920305e-07, + "loss": 0.4337, + "mean_token_accuracy": 0.8580532670021057, + "num_tokens": 37198552.0, + "step": 977 + }, + { + "epoch": 0.12441165246151889, + "ewc_loss": 0.0022089320700615644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.208931982750073e-05, + "grad_norm": 2.442742109298706, + "learning_rate": 4.141585417549809e-07, + "loss": 0.5026, + "mean_token_accuracy": 0.8350836038589478, + "num_tokens": 37235990.0, + "step": 978 + }, + { + "epoch": 0.1245388627401094, + "ewc_loss": 
0.002222903771325946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.222903822257649e-05, + "grad_norm": 2.552330493927002, + "learning_rate": 4.1458245019075875e-07, + "loss": 0.4628, + "mean_token_accuracy": 0.8474826812744141, + "num_tokens": 37269664.0, + "step": 979 + }, + { + "epoch": 0.1246660730186999, + "ewc_loss": 0.0022555997129529715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2555997929885052e-05, + "grad_norm": 2.4167680740356445, + "learning_rate": 4.1500635862653664e-07, + "loss": 0.494, + "mean_token_accuracy": 0.8405382037162781, + "num_tokens": 37308647.0, + "step": 980 + }, + { + "epoch": 0.12479328329729042, + "ewc_loss": 0.0022153351455926895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2153351892484352e-05, + "grad_norm": 2.404965400695801, + "learning_rate": 4.1543026706231454e-07, + "loss": 0.4431, + "mean_token_accuracy": 0.854921817779541, + "num_tokens": 37345953.0, + "step": 981 + }, + { + "epoch": 0.12492049357588093, + "ewc_loss": 0.0022208141162991524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2208141672308557e-05, + "grad_norm": 2.449063777923584, + "learning_rate": 4.158541754980924e-07, + "loss": 0.4387, + "mean_token_accuracy": 0.8576686382293701, + "num_tokens": 37382735.0, + "step": 982 + }, + { + "epoch": 0.12504770385447145, + "ewc_loss": 0.002241943497210741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2419435481424443e-05, + "grad_norm": 2.4702095985412598, + "learning_rate": 4.1627808393387024e-07, + "loss": 0.5047, + "mean_token_accuracy": 0.8336492776870728, + "num_tokens": 37419605.0, + "step": 983 + }, + { + "epoch": 0.12517491413306195, + "ewc_loss": 0.0022506369277834892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.250637044198811e-05, + "grad_norm": 2.5570390224456787, + "learning_rate": 4.1670199236964813e-07, + "loss": 0.4983, + "mean_token_accuracy": 0.833957850933075, + "num_tokens": 37455395.0, + "step": 984 + }, + { + "epoch": 0.12530212441165245, + "ewc_loss": 
0.002271691570058465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2716914827469736e-05, + "grad_norm": 2.4357101917266846, + "learning_rate": 4.1712590080542603e-07, + "loss": 0.4523, + "mean_token_accuracy": 0.8521634936332703, + "num_tokens": 37492208.0, + "step": 985 + }, + { + "epoch": 0.12542933469024298, + "ewc_loss": 0.0022395451087504625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.239545210613869e-05, + "grad_norm": 2.4052114486694336, + "learning_rate": 4.175498092412039e-07, + "loss": 0.4968, + "mean_token_accuracy": 0.8400436639785767, + "num_tokens": 37533675.0, + "step": 986 + }, + { + "epoch": 0.12555654496883348, + "ewc_loss": 0.0022358852438628674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2358852220349945e-05, + "grad_norm": 2.4702649116516113, + "learning_rate": 4.179737176769817e-07, + "loss": 0.4611, + "mean_token_accuracy": 0.8497340679168701, + "num_tokens": 37572792.0, + "step": 987 + }, + { + "epoch": 0.12568375524742398, + "ewc_loss": 0.002263133879750967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2631338651990518e-05, + "grad_norm": 2.3918471336364746, + "learning_rate": 4.183976261127596e-07, + "loss": 0.458, + "mean_token_accuracy": 0.848687469959259, + "num_tokens": 37611481.0, + "step": 988 + }, + { + "epoch": 0.1258109655260145, + "ewc_loss": 0.0022454173304140568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.245417272206396e-05, + "grad_norm": 2.547865152359009, + "learning_rate": 4.1882153454853747e-07, + "loss": 0.5358, + "mean_token_accuracy": 0.8258200883865356, + "num_tokens": 37648253.0, + "step": 989 + }, + { + "epoch": 0.125938175804605, + "ewc_loss": 0.0022864483762532473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.28644839808112e-05, + "grad_norm": 2.4301469326019287, + "learning_rate": 4.1924544298431537e-07, + "loss": 0.4716, + "mean_token_accuracy": 0.846872866153717, + "num_tokens": 37687086.0, + "step": 990 + }, + { + "epoch": 0.12606538608319554, + "ewc_loss": 0.0022587135899811983, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.258713539049495e-05, + "grad_norm": 2.40633487701416, + "learning_rate": 4.196693514200932e-07, + "loss": 0.4651, + "mean_token_accuracy": 0.8482925891876221, + "num_tokens": 37730928.0, + "step": 991 + }, + { + "epoch": 0.12619259636178604, + "ewc_loss": 0.002253586193546653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.25358617171878e-05, + "grad_norm": 2.4540462493896484, + "learning_rate": 4.200932598558711e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.8385328054428101, + "num_tokens": 37771857.0, + "step": 992 + }, + { + "epoch": 0.12631980664037654, + "ewc_loss": 0.002274547005072236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2745471142116003e-05, + "grad_norm": 2.3937838077545166, + "learning_rate": 4.2051716829164896e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.8424059152603149, + "num_tokens": 37817517.0, + "step": 993 + }, + { + "epoch": 0.12644701691896706, + "ewc_loss": 0.0022523575462400913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.252357626275625e-05, + "grad_norm": 2.4853782653808594, + "learning_rate": 4.2094107672742686e-07, + "loss": 0.5335, + "mean_token_accuracy": 0.8325435519218445, + "num_tokens": 37855891.0, + "step": 994 + }, + { + "epoch": 0.12657422719755757, + "ewc_loss": 0.0022771810181438923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2771810108679347e-05, + "grad_norm": 2.5358831882476807, + "learning_rate": 4.2136498516320476e-07, + "loss": 0.4516, + "mean_token_accuracy": 0.855018138885498, + "num_tokens": 37888719.0, + "step": 995 + }, + { + "epoch": 0.12670143747614807, + "ewc_loss": 0.002295037033036351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2950369384489022e-05, + "grad_norm": 2.3685901165008545, + "learning_rate": 4.217888935989826e-07, + "loss": 0.5049, + "mean_token_accuracy": 0.8353524208068848, + "num_tokens": 37932968.0, + "step": 996 + }, + { + "epoch": 0.1268286477547386, + "ewc_loss": 0.0022506644017994404, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 2.2506645109388046e-05, + "grad_norm": 2.4652838706970215, + "learning_rate": 4.2221280203476045e-07, + "loss": 0.4986, + "mean_token_accuracy": 0.8368721008300781, + "num_tokens": 37974297.0, + "step": 997 + }, + { + "epoch": 0.1269558580333291, + "ewc_loss": 0.0022804923355579376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2804922991781496e-05, + "grad_norm": 2.4157631397247314, + "learning_rate": 4.2263671047053835e-07, + "loss": 0.513, + "mean_token_accuracy": 0.8338100910186768, + "num_tokens": 38020294.0, + "step": 998 + }, + { + "epoch": 0.1270830683119196, + "ewc_loss": 0.0022682982962578535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2682983399135992e-05, + "grad_norm": 2.3749072551727295, + "learning_rate": 4.2306061890631625e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.8400815725326538, + "num_tokens": 38063322.0, + "step": 999 + }, + { + "epoch": 0.12721027859051012, + "ewc_loss": 0.002264621900394559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2646219804300927e-05, + "grad_norm": 2.53513240814209, + "learning_rate": 4.234845273420941e-07, + "loss": 0.5796, + "mean_token_accuracy": 0.8203784823417664, + "num_tokens": 38101052.0, + "step": 1000 + }, + { + "epoch": 0.12733748886910062, + "ewc_loss": 0.002311339834704995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3113398128771223e-05, + "grad_norm": 2.390474319458008, + "learning_rate": 4.2390843577787194e-07, + "loss": 0.5011, + "mean_token_accuracy": 0.8355017900466919, + "num_tokens": 38145472.0, + "step": 1001 + }, + { + "epoch": 0.12746469914769112, + "ewc_loss": 0.0022672037594020367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2672038539894857e-05, + "grad_norm": 2.4545888900756836, + "learning_rate": 4.2433234421364984e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.843816876411438, + "num_tokens": 38182662.0, + "step": 1002 + }, + { + "epoch": 0.12759190942628165, + "ewc_loss": 0.0022862732876092196, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.2862732294015586e-05, + "grad_norm": 2.482463836669922, + "learning_rate": 4.2475625264942774e-07, + "loss": 0.4655, + "mean_token_accuracy": 0.8490209579467773, + "num_tokens": 38224735.0, + "step": 1003 + }, + { + "epoch": 0.12771911970487215, + "ewc_loss": 0.0022938596084713936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2938596885069273e-05, + "grad_norm": 2.6606955528259277, + "learning_rate": 4.251801610852056e-07, + "loss": 0.5079, + "mean_token_accuracy": 0.8353826999664307, + "num_tokens": 38255847.0, + "step": 1004 + }, + { + "epoch": 0.12784632998346265, + "ewc_loss": 0.0023363898508250713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3363898435491137e-05, + "grad_norm": 2.469611644744873, + "learning_rate": 4.2560406952098343e-07, + "loss": 0.5206, + "mean_token_accuracy": 0.832994818687439, + "num_tokens": 38296921.0, + "step": 1005 + }, + { + "epoch": 0.12797354026205318, + "ewc_loss": 0.0022836497519165277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2836497009848244e-05, + "grad_norm": 2.446641445159912, + "learning_rate": 4.2602797795676133e-07, + "loss": 0.5319, + "mean_token_accuracy": 0.829317569732666, + "num_tokens": 38336408.0, + "step": 1006 + }, + { + "epoch": 0.12810075054064368, + "ewc_loss": 0.0022874572314321995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2874572096043266e-05, + "grad_norm": 2.484989881515503, + "learning_rate": 4.2645188639253923e-07, + "loss": 0.4438, + "mean_token_accuracy": 0.8531791567802429, + "num_tokens": 38372809.0, + "step": 1007 + }, + { + "epoch": 0.12822796081923418, + "ewc_loss": 0.0023079407401382923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3079406673787162e-05, + "grad_norm": 2.5100879669189453, + "learning_rate": 4.26875794828317e-07, + "loss": 0.5002, + "mean_token_accuracy": 0.8374792337417603, + "num_tokens": 38413312.0, + "step": 1008 + }, + { + "epoch": 0.1283551710978247, + "ewc_loss": 0.002308116527274251, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.308116563654039e-05, + "grad_norm": 2.3975324630737305, + "learning_rate": 4.272997032640949e-07, + "loss": 0.5182, + "mean_token_accuracy": 0.8319482803344727, + "num_tokens": 38457710.0, + "step": 1009 + }, + { + "epoch": 0.1284823813764152, + "ewc_loss": 0.00228105834685266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.281058368680533e-05, + "grad_norm": 2.4673564434051514, + "learning_rate": 4.277236116998728e-07, + "loss": 0.4799, + "mean_token_accuracy": 0.8451325297355652, + "num_tokens": 38499630.0, + "step": 1010 + }, + { + "epoch": 0.1286095916550057, + "ewc_loss": 0.0023076182696968317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.307618342456408e-05, + "grad_norm": 2.469238519668579, + "learning_rate": 4.281475201356507e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.8426741361618042, + "num_tokens": 38538133.0, + "step": 1011 + }, + { + "epoch": 0.12873680193359624, + "ewc_loss": 0.0023095854558050632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3095853975974023e-05, + "grad_norm": 2.466233730316162, + "learning_rate": 4.285714285714285e-07, + "loss": 0.5173, + "mean_token_accuracy": 0.8333486318588257, + "num_tokens": 38576816.0, + "step": 1012 + }, + { + "epoch": 0.12886401221218674, + "ewc_loss": 0.0023057558573782444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3057558792061172e-05, + "grad_norm": 2.45587158203125, + "learning_rate": 4.289953370072064e-07, + "loss": 0.4611, + "mean_token_accuracy": 0.848736584186554, + "num_tokens": 38615270.0, + "step": 1013 + }, + { + "epoch": 0.12899122249077727, + "ewc_loss": 0.0023079649545252323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3079650418367237e-05, + "grad_norm": 2.514078378677368, + "learning_rate": 4.294192454429843e-07, + "loss": 0.5102, + "mean_token_accuracy": 0.83385169506073, + "num_tokens": 38654173.0, + "step": 1014 + }, + { + "epoch": 0.12911843276936777, + "ewc_loss": 0.0023245711345225573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.324571141798515e-05, + "grad_norm": 2.3572397232055664, + "learning_rate": 4.298431538787622e-07, + "loss": 0.3977, + "mean_token_accuracy": 0.8709053993225098, + "num_tokens": 38695758.0, + "step": 1015 + }, + { + "epoch": 0.12924564304795827, + "ewc_loss": 0.002285977592691779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2859776436234824e-05, + "grad_norm": 2.581864356994629, + "learning_rate": 4.3026706231454e-07, + "loss": 0.5412, + "mean_token_accuracy": 0.8253178596496582, + "num_tokens": 38729524.0, + "step": 1016 + }, + { + "epoch": 0.1293728533265488, + "ewc_loss": 0.002357268473133445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3572683858219534e-05, + "grad_norm": 2.5292625427246094, + "learning_rate": 4.306909707503179e-07, + "loss": 0.4897, + "mean_token_accuracy": 0.8434370756149292, + "num_tokens": 38767500.0, + "step": 1017 + }, + { + "epoch": 0.1295000636051393, + "ewc_loss": 0.0023347497917711735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3347498427028768e-05, + "grad_norm": 2.665116548538208, + "learning_rate": 4.311148791860958e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.842851996421814, + "num_tokens": 38798683.0, + "step": 1018 + }, + { + "epoch": 0.1296272738837298, + "ewc_loss": 0.0023672517854720354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3672517272643745e-05, + "grad_norm": 2.5119237899780273, + "learning_rate": 4.315387876218737e-07, + "loss": 0.5261, + "mean_token_accuracy": 0.83349609375, + "num_tokens": 38833653.0, + "step": 1019 + }, + { + "epoch": 0.12975448416232033, + "ewc_loss": 0.0023300598841160536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3300599423237145e-05, + "grad_norm": 2.5578248500823975, + "learning_rate": 4.319626960576515e-07, + "loss": 0.5241, + "mean_token_accuracy": 0.828012228012085, + "num_tokens": 38869428.0, + "step": 1020 + }, + { + "epoch": 0.12988169444091083, + "ewc_loss": 0.0023382704239338636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3382704966934398e-05, 
+ "grad_norm": 2.420724630355835, + "learning_rate": 4.323866044934294e-07, + "loss": 0.4617, + "mean_token_accuracy": 0.8458223342895508, + "num_tokens": 38909502.0, + "step": 1021 + }, + { + "epoch": 0.13000890471950133, + "ewc_loss": 0.0023156418465077877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3156419047154486e-05, + "grad_norm": 2.4296491146087646, + "learning_rate": 4.328105129292073e-07, + "loss": 0.4848, + "mean_token_accuracy": 0.8444194793701172, + "num_tokens": 38951730.0, + "step": 1022 + }, + { + "epoch": 0.13013611499809186, + "ewc_loss": 0.0023283343762159348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.328334448975511e-05, + "grad_norm": 2.4895498752593994, + "learning_rate": 4.332344213649852e-07, + "loss": 0.4613, + "mean_token_accuracy": 0.8465323448181152, + "num_tokens": 38988981.0, + "step": 1023 + }, + { + "epoch": 0.13026332527668236, + "ewc_loss": 0.002350092865526676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3500928364228457e-05, + "grad_norm": 2.4782447814941406, + "learning_rate": 4.33658329800763e-07, + "loss": 0.473, + "mean_token_accuracy": 0.846390962600708, + "num_tokens": 39026289.0, + "step": 1024 + }, + { + "epoch": 0.13039053555527286, + "ewc_loss": 0.0023395444732159376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3395445168716833e-05, + "grad_norm": 2.382327079772949, + "learning_rate": 4.340822382365409e-07, + "loss": 0.4618, + "mean_token_accuracy": 0.8480924367904663, + "num_tokens": 39069113.0, + "step": 1025 + }, + { + "epoch": 0.13051774583386339, + "ewc_loss": 0.002314950106665492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.31495014304528e-05, + "grad_norm": 2.427393913269043, + "learning_rate": 4.345061466723188e-07, + "loss": 0.4461, + "mean_token_accuracy": 0.853824257850647, + "num_tokens": 39112991.0, + "step": 1026 + }, + { + "epoch": 0.13064495611245389, + "ewc_loss": 0.002335255965590477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3352558855549432e-05, + "grad_norm": 
2.5105514526367188, + "learning_rate": 4.3493005510809663e-07, + "loss": 0.5089, + "mean_token_accuracy": 0.8375295996665955, + "num_tokens": 39149593.0, + "step": 1027 + }, + { + "epoch": 0.1307721663910444, + "ewc_loss": 0.0023611700162291527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.361169936193619e-05, + "grad_norm": 2.493274688720703, + "learning_rate": 4.353539635438745e-07, + "loss": 0.4737, + "mean_token_accuracy": 0.8438571095466614, + "num_tokens": 39186570.0, + "step": 1028 + }, + { + "epoch": 0.13089937666963491, + "ewc_loss": 0.0023480975069105625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.348097405047156e-05, + "grad_norm": 2.6089110374450684, + "learning_rate": 4.357778719796524e-07, + "loss": 0.5358, + "mean_token_accuracy": 0.8237276077270508, + "num_tokens": 39220666.0, + "step": 1029 + }, + { + "epoch": 0.13102658694822542, + "ewc_loss": 0.002374326577410102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.374326686549466e-05, + "grad_norm": 2.4308316707611084, + "learning_rate": 4.362017804154303e-07, + "loss": 0.4548, + "mean_token_accuracy": 0.8536332845687866, + "num_tokens": 39258372.0, + "step": 1030 + }, + { + "epoch": 0.13115379722681592, + "ewc_loss": 0.0023234160616993904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3234160835272633e-05, + "grad_norm": 2.4719927310943604, + "learning_rate": 4.366256888512081e-07, + "loss": 0.4512, + "mean_token_accuracy": 0.8528589010238647, + "num_tokens": 39293647.0, + "step": 1031 + }, + { + "epoch": 0.13128100750540644, + "ewc_loss": 0.002346021356061101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.346021392440889e-05, + "grad_norm": 2.460122585296631, + "learning_rate": 4.3704959728698597e-07, + "loss": 0.4773, + "mean_token_accuracy": 0.8424162864685059, + "num_tokens": 39332811.0, + "step": 1032 + }, + { + "epoch": 0.13140821778399694, + "ewc_loss": 0.0023471079766750336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3471080567105673e-05, + "grad_norm": 
2.6367063522338867, + "learning_rate": 4.3747350572276386e-07, + "loss": 0.4802, + "mean_token_accuracy": 0.8422918319702148, + "num_tokens": 39365971.0, + "step": 1033 + }, + { + "epoch": 0.13153542806258745, + "ewc_loss": 0.0023908212315291166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.39082128246082e-05, + "grad_norm": 2.450340509414673, + "learning_rate": 4.3789741415854176e-07, + "loss": 0.4884, + "mean_token_accuracy": 0.8411241173744202, + "num_tokens": 39403178.0, + "step": 1034 + }, + { + "epoch": 0.13166263834117797, + "ewc_loss": 0.0023349446710199118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3349446564679965e-05, + "grad_norm": 2.4600772857666016, + "learning_rate": 4.383213225943196e-07, + "loss": 0.4848, + "mean_token_accuracy": 0.8415287137031555, + "num_tokens": 39441303.0, + "step": 1035 + }, + { + "epoch": 0.13178984861976847, + "ewc_loss": 0.0023459848016500473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3459848307538778e-05, + "grad_norm": 2.4713971614837646, + "learning_rate": 4.3874523103009746e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.8598448038101196, + "num_tokens": 39477359.0, + "step": 1036 + }, + { + "epoch": 0.13191705889835897, + "ewc_loss": 0.00235649268142879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.356492586841341e-05, + "grad_norm": 2.5471858978271484, + "learning_rate": 4.3916913946587536e-07, + "loss": 0.4395, + "mean_token_accuracy": 0.8526028990745544, + "num_tokens": 39511687.0, + "step": 1037 + }, + { + "epoch": 0.1320442691769495, + "ewc_loss": 0.0023721789475530386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3721790057606995e-05, + "grad_norm": 2.469517707824707, + "learning_rate": 4.3959304790165325e-07, + "loss": 0.454, + "mean_token_accuracy": 0.848993718624115, + "num_tokens": 39547453.0, + "step": 1038 + }, + { + "epoch": 0.13217147945554, + "ewc_loss": 0.0023519897367805243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3519896785728633e-05, + "grad_norm": 
2.3760263919830322, + "learning_rate": 4.400169563374311e-07, + "loss": 0.4723, + "mean_token_accuracy": 0.8447566032409668, + "num_tokens": 39588934.0, + "step": 1039 + }, + { + "epoch": 0.13229868973413053, + "ewc_loss": 0.0023327600210905075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.332760050194338e-05, + "grad_norm": 2.587135076522827, + "learning_rate": 4.4044086477320895e-07, + "loss": 0.5587, + "mean_token_accuracy": 0.8227881789207458, + "num_tokens": 39627638.0, + "step": 1040 + }, + { + "epoch": 0.13242590001272103, + "ewc_loss": 0.0023989917244762182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.398991637164727e-05, + "grad_norm": 2.5303611755371094, + "learning_rate": 4.4086477320898685e-07, + "loss": 0.4694, + "mean_token_accuracy": 0.8441872596740723, + "num_tokens": 39661696.0, + "step": 1041 + }, + { + "epoch": 0.13255311029131153, + "ewc_loss": 0.002383015351369977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3830152713344432e-05, + "grad_norm": 2.4434478282928467, + "learning_rate": 4.4128868164476474e-07, + "loss": 0.4796, + "mean_token_accuracy": 0.847515344619751, + "num_tokens": 39702357.0, + "step": 1042 + }, + { + "epoch": 0.13268032056990206, + "ewc_loss": 0.0023541857954114676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3541857444797643e-05, + "grad_norm": 2.4503045082092285, + "learning_rate": 4.417125900805426e-07, + "loss": 0.4638, + "mean_token_accuracy": 0.849524736404419, + "num_tokens": 39744960.0, + "step": 1043 + }, + { + "epoch": 0.13280753084849256, + "ewc_loss": 0.0023704252671450377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.370425318076741e-05, + "grad_norm": 2.5404305458068848, + "learning_rate": 4.4213649851632044e-07, + "loss": 0.5202, + "mean_token_accuracy": 0.8353679776191711, + "num_tokens": 39785394.0, + "step": 1044 + }, + { + "epoch": 0.13293474112708306, + "ewc_loss": 0.0023962208069860935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3962207706063055e-05, + "grad_norm": 
2.516465902328491, + "learning_rate": 4.4256040695209834e-07, + "loss": 0.5035, + "mean_token_accuracy": 0.8400025367736816, + "num_tokens": 39822698.0, + "step": 1045 + }, + { + "epoch": 0.1330619514056736, + "ewc_loss": 0.0023851816076785326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3851816877140664e-05, + "grad_norm": 2.5512852668762207, + "learning_rate": 4.429843153878762e-07, + "loss": 0.4978, + "mean_token_accuracy": 0.836092472076416, + "num_tokens": 39856874.0, + "step": 1046 + }, + { + "epoch": 0.1331891616842641, + "ewc_loss": 0.0023886265698820353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3886264898465015e-05, + "grad_norm": 2.4714736938476562, + "learning_rate": 4.434082238236541e-07, + "loss": 0.5293, + "mean_token_accuracy": 0.8271346688270569, + "num_tokens": 39899683.0, + "step": 1047 + }, + { + "epoch": 0.1333163719628546, + "ewc_loss": 0.0023719542659819126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3719541786704212e-05, + "grad_norm": 2.4493696689605713, + "learning_rate": 4.4383213225943193e-07, + "loss": 0.5073, + "mean_token_accuracy": 0.8369883298873901, + "num_tokens": 39940565.0, + "step": 1048 + }, + { + "epoch": 0.13344358224144512, + "ewc_loss": 0.002376860473304987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3768605387886055e-05, + "grad_norm": 2.4982235431671143, + "learning_rate": 4.442560406952098e-07, + "loss": 0.5085, + "mean_token_accuracy": 0.8329598903656006, + "num_tokens": 39979010.0, + "step": 1049 + }, + { + "epoch": 0.13357079252003562, + "ewc_loss": 0.0023942673578858376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3942673578858376e-05, + "grad_norm": 2.4101250171661377, + "learning_rate": 4.4467994913098767e-07, + "loss": 0.4566, + "mean_token_accuracy": 0.8507480025291443, + "num_tokens": 40021151.0, + "step": 1050 + }, + { + "epoch": 0.13369800279862612, + "ewc_loss": 0.002371827606111765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3718275770079345e-05, + "grad_norm": 
2.5064611434936523, + "learning_rate": 4.4510385756676557e-07, + "loss": 0.5166, + "mean_token_accuracy": 0.8297799229621887, + "num_tokens": 40059324.0, + "step": 1051 + }, + { + "epoch": 0.13382521307721665, + "ewc_loss": 0.0024009982589632273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4009983462747186e-05, + "grad_norm": 2.3772997856140137, + "learning_rate": 4.455277660025434e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8443175554275513, + "num_tokens": 40104577.0, + "step": 1052 + }, + { + "epoch": 0.13395242335580715, + "ewc_loss": 0.0023769144900143147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3769145627738908e-05, + "grad_norm": 2.5777478218078613, + "learning_rate": 4.459516744383213e-07, + "loss": 0.4955, + "mean_token_accuracy": 0.8353455066680908, + "num_tokens": 40139734.0, + "step": 1053 + }, + { + "epoch": 0.13407963363439765, + "ewc_loss": 0.002429916989058256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4299170036101714e-05, + "grad_norm": 2.5090367794036865, + "learning_rate": 4.4637558287409916e-07, + "loss": 0.4952, + "mean_token_accuracy": 0.8380724191665649, + "num_tokens": 40180724.0, + "step": 1054 + }, + { + "epoch": 0.13420684391298818, + "ewc_loss": 0.0024079165887087584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.407916690572165e-05, + "grad_norm": 2.4675798416137695, + "learning_rate": 4.4679949130987706e-07, + "loss": 0.4951, + "mean_token_accuracy": 0.8435428142547607, + "num_tokens": 40220475.0, + "step": 1055 + }, + { + "epoch": 0.13433405419157868, + "ewc_loss": 0.0023944589775055647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3944590793689713e-05, + "grad_norm": 2.5512876510620117, + "learning_rate": 4.472233997456549e-07, + "loss": 0.5012, + "mean_token_accuracy": 0.8381896018981934, + "num_tokens": 40258208.0, + "step": 1056 + }, + { + "epoch": 0.13446126447016918, + "ewc_loss": 0.0024251018185168505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4251017748611048e-05, + "grad_norm": 
2.424659252166748, + "learning_rate": 4.476473081814328e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.8399350047111511, + "num_tokens": 40299127.0, + "step": 1057 + }, + { + "epoch": 0.1345884747487597, + "ewc_loss": 0.0023868654388934374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3868655262049288e-05, + "grad_norm": 2.4483847618103027, + "learning_rate": 4.4807121661721065e-07, + "loss": 0.4487, + "mean_token_accuracy": 0.8506169319152832, + "num_tokens": 40339331.0, + "step": 1058 + }, + { + "epoch": 0.1347156850273502, + "ewc_loss": 0.0023999728728085756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.39997298194794e-05, + "grad_norm": 2.435150384902954, + "learning_rate": 4.4849512505298855e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8396207094192505, + "num_tokens": 40379386.0, + "step": 1059 + }, + { + "epoch": 0.1348428953059407, + "ewc_loss": 0.002402118407189846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4021182980504818e-05, + "grad_norm": 2.4622087478637695, + "learning_rate": 4.489190334887664e-07, + "loss": 0.5254, + "mean_token_accuracy": 0.8357899188995361, + "num_tokens": 40419868.0, + "step": 1060 + }, + { + "epoch": 0.13497010558453124, + "ewc_loss": 0.002406970364972949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4069704522844404e-05, + "grad_norm": 2.564089775085449, + "learning_rate": 4.493429419245443e-07, + "loss": 0.4961, + "mean_token_accuracy": 0.8406095504760742, + "num_tokens": 40455346.0, + "step": 1061 + }, + { + "epoch": 0.13509731586312174, + "ewc_loss": 0.0024360474199056625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4360473616980016e-05, + "grad_norm": 2.553102970123291, + "learning_rate": 4.4976685036032214e-07, + "loss": 0.4544, + "mean_token_accuracy": 0.8495559692382812, + "num_tokens": 40490087.0, + "step": 1062 + }, + { + "epoch": 0.13522452614171224, + "ewc_loss": 0.0024271816946566105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.427181607345119e-05, + "grad_norm": 
2.538419485092163, + "learning_rate": 4.5019075879610004e-07, + "loss": 0.5117, + "mean_token_accuracy": 0.8300070762634277, + "num_tokens": 40524799.0, + "step": 1063 + }, + { + "epoch": 0.13535173642030277, + "ewc_loss": 0.0024194135330617428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4194134311983362e-05, + "grad_norm": 2.4899473190307617, + "learning_rate": 4.506146672318779e-07, + "loss": 0.4839, + "mean_token_accuracy": 0.8419100642204285, + "num_tokens": 40563740.0, + "step": 1064 + }, + { + "epoch": 0.13547894669889327, + "ewc_loss": 0.002416843082755804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.416843199171126e-05, + "grad_norm": 2.493755340576172, + "learning_rate": 4.5103857566765573e-07, + "loss": 0.5386, + "mean_token_accuracy": 0.8298995494842529, + "num_tokens": 40604743.0, + "step": 1065 + }, + { + "epoch": 0.1356061569774838, + "ewc_loss": 0.002424106001853943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4241060600616038e-05, + "grad_norm": 2.4931447505950928, + "learning_rate": 4.5146248410343363e-07, + "loss": 0.4432, + "mean_token_accuracy": 0.8546346426010132, + "num_tokens": 40640909.0, + "step": 1066 + }, + { + "epoch": 0.1357333672560743, + "ewc_loss": 0.0024244491942226887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.424449303362053e-05, + "grad_norm": 2.460336446762085, + "learning_rate": 4.5188639253921153e-07, + "loss": 0.4529, + "mean_token_accuracy": 0.8521924614906311, + "num_tokens": 40678930.0, + "step": 1067 + }, + { + "epoch": 0.1358605775346648, + "ewc_loss": 0.0024178235325962305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4178234525606968e-05, + "grad_norm": 2.5002055168151855, + "learning_rate": 4.523103009749894e-07, + "loss": 0.4466, + "mean_token_accuracy": 0.8523393869400024, + "num_tokens": 40715609.0, + "step": 1068 + }, + { + "epoch": 0.13598778781325532, + "ewc_loss": 0.0024379354435950518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4379354726988822e-05, + "grad_norm": 
2.4328601360321045, + "learning_rate": 4.527342094107672e-07, + "loss": 0.472, + "mean_token_accuracy": 0.845584511756897, + "num_tokens": 40760533.0, + "step": 1069 + }, + { + "epoch": 0.13611499809184582, + "ewc_loss": 0.0024120535235852003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.412053436273709e-05, + "grad_norm": 2.4679455757141113, + "learning_rate": 4.531581178465451e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.8382954597473145, + "num_tokens": 40800039.0, + "step": 1070 + }, + { + "epoch": 0.13624220837043632, + "ewc_loss": 0.002430722350254655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4307224521180615e-05, + "grad_norm": 2.4869675636291504, + "learning_rate": 4.53582026282323e-07, + "loss": 0.461, + "mean_token_accuracy": 0.8490214347839355, + "num_tokens": 40840207.0, + "step": 1071 + }, + { + "epoch": 0.13636941864902685, + "ewc_loss": 0.002431205241009593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4312052119057626e-05, + "grad_norm": 2.5621016025543213, + "learning_rate": 4.5400593471810087e-07, + "loss": 0.5433, + "mean_token_accuracy": 0.823914647102356, + "num_tokens": 40879559.0, + "step": 1072 + }, + { + "epoch": 0.13649662892761735, + "ewc_loss": 0.002456098794937134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.456098809489049e-05, + "grad_norm": 2.513720989227295, + "learning_rate": 4.544298431538787e-07, + "loss": 0.4713, + "mean_token_accuracy": 0.8461408019065857, + "num_tokens": 40919608.0, + "step": 1073 + }, + { + "epoch": 0.13662383920620785, + "ewc_loss": 0.0024371840991079807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4371840481762774e-05, + "grad_norm": 2.551453113555908, + "learning_rate": 4.548537515896566e-07, + "loss": 0.5303, + "mean_token_accuracy": 0.8278086185455322, + "num_tokens": 40953648.0, + "step": 1074 + }, + { + "epoch": 0.13675104948479838, + "ewc_loss": 0.002452300628647208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.452300577715505e-05, + "grad_norm": 2.6095895767211914, 
+ "learning_rate": 4.552776600254345e-07, + "loss": 0.5033, + "mean_token_accuracy": 0.8373905420303345, + "num_tokens": 40989675.0, + "step": 1075 + }, + { + "epoch": 0.13687825976338888, + "ewc_loss": 0.0024662278592586517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.466227851982694e-05, + "grad_norm": 2.5026025772094727, + "learning_rate": 4.5570156846121236e-07, + "loss": 0.455, + "mean_token_accuracy": 0.8512760400772095, + "num_tokens": 41027992.0, + "step": 1076 + }, + { + "epoch": 0.13700547004197938, + "ewc_loss": 0.0024322981480509043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4322982426383533e-05, + "grad_norm": 2.4927024841308594, + "learning_rate": 4.561254768969902e-07, + "loss": 0.4541, + "mean_token_accuracy": 0.8483125567436218, + "num_tokens": 41066925.0, + "step": 1077 + }, + { + "epoch": 0.1371326803205699, + "ewc_loss": 0.002438399475067854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4383994968957268e-05, + "grad_norm": 2.479191303253174, + "learning_rate": 4.565493853327681e-07, + "loss": 0.4624, + "mean_token_accuracy": 0.8492626547813416, + "num_tokens": 41105090.0, + "step": 1078 + }, + { + "epoch": 0.1372598905991604, + "ewc_loss": 0.0024369845632463694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4369845050387084e-05, + "grad_norm": 2.5839083194732666, + "learning_rate": 4.56973293768546e-07, + "loss": 0.5055, + "mean_token_accuracy": 0.8392777442932129, + "num_tokens": 41141924.0, + "step": 1079 + }, + { + "epoch": 0.1373871008777509, + "ewc_loss": 0.0024735559709370136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4735560145927593e-05, + "grad_norm": 2.590374231338501, + "learning_rate": 4.573972022043238e-07, + "loss": 0.565, + "mean_token_accuracy": 0.8184483051300049, + "num_tokens": 41181512.0, + "step": 1080 + }, + { + "epoch": 0.13751431115634144, + "ewc_loss": 0.0024701955262571573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4701954316697083e-05, + "grad_norm": 2.456658363342285, + 
"learning_rate": 4.578211106401017e-07, + "loss": 0.5139, + "mean_token_accuracy": 0.8336721062660217, + "num_tokens": 41224015.0, + "step": 1081 + }, + { + "epoch": 0.13764152143493194, + "ewc_loss": 0.0024295549374073744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4295548428199254e-05, + "grad_norm": 2.4913272857666016, + "learning_rate": 4.582450190758796e-07, + "loss": 0.5159, + "mean_token_accuracy": 0.8346838355064392, + "num_tokens": 41265568.0, + "step": 1082 + }, + { + "epoch": 0.13776873171352244, + "ewc_loss": 0.002456259448081255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.456259426253382e-05, + "grad_norm": 2.455181121826172, + "learning_rate": 4.586689275116575e-07, + "loss": 0.4318, + "mean_token_accuracy": 0.8578251600265503, + "num_tokens": 41303595.0, + "step": 1083 + }, + { + "epoch": 0.13789594199211297, + "ewc_loss": 0.0024540508165955544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4540508093195967e-05, + "grad_norm": 2.5146541595458984, + "learning_rate": 4.590928359474353e-07, + "loss": 0.532, + "mean_token_accuracy": 0.8282114267349243, + "num_tokens": 41340119.0, + "step": 1084 + }, + { + "epoch": 0.13802315227070347, + "ewc_loss": 0.0024737033527344465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4737033527344465e-05, + "grad_norm": 2.489920139312744, + "learning_rate": 4.595167443832132e-07, + "loss": 0.4844, + "mean_token_accuracy": 0.8451358079910278, + "num_tokens": 41380184.0, + "step": 1085 + }, + { + "epoch": 0.13815036254929397, + "ewc_loss": 0.0024661263450980186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4661263523739763e-05, + "grad_norm": 2.443840503692627, + "learning_rate": 4.599406528189911e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.8579477071762085, + "num_tokens": 41419038.0, + "step": 1086 + }, + { + "epoch": 0.1382775728278845, + "ewc_loss": 0.0024544610641896725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4544611733290367e-05, + "grad_norm": 2.4884448051452637, + 
"learning_rate": 4.60364561254769e-07, + "loss": 0.4704, + "mean_token_accuracy": 0.8422327041625977, + "num_tokens": 41455767.0, + "step": 1087 + }, + { + "epoch": 0.138404783106475, + "ewc_loss": 0.002473088912665844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4730888981139287e-05, + "grad_norm": 2.4449737071990967, + "learning_rate": 4.607884696905468e-07, + "loss": 0.4682, + "mean_token_accuracy": 0.8455392122268677, + "num_tokens": 41497493.0, + "step": 1088 + }, + { + "epoch": 0.1385319933850655, + "ewc_loss": 0.0024672227445989847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4672226572874933e-05, + "grad_norm": 2.441007375717163, + "learning_rate": 4.612123781263247e-07, + "loss": 0.5012, + "mean_token_accuracy": 0.8365732431411743, + "num_tokens": 41542940.0, + "step": 1089 + }, + { + "epoch": 0.13865920366365603, + "ewc_loss": 0.0024721715599298477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4721715817577206e-05, + "grad_norm": 2.4501070976257324, + "learning_rate": 4.616362865621026e-07, + "loss": 0.4009, + "mean_token_accuracy": 0.8655081987380981, + "num_tokens": 41579302.0, + "step": 1090 + }, + { + "epoch": 0.13878641394224653, + "ewc_loss": 0.0024786817375570536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4786817448330112e-05, + "grad_norm": 2.5392491817474365, + "learning_rate": 4.620601949978805e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.8401651978492737, + "num_tokens": 41617555.0, + "step": 1091 + }, + { + "epoch": 0.13891362422083706, + "ewc_loss": 0.002499194350093603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4991943064378574e-05, + "grad_norm": 2.599177122116089, + "learning_rate": 4.6248410343365827e-07, + "loss": 0.5516, + "mean_token_accuracy": 0.8209114670753479, + "num_tokens": 41654660.0, + "step": 1092 + }, + { + "epoch": 0.13904083449942756, + "ewc_loss": 0.0025103683583438396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5103683583438396e-05, + "grad_norm": 2.4785773754119873, + 
"learning_rate": 4.6290801186943617e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8487650156021118, + "num_tokens": 41693773.0, + "step": 1093 + }, + { + "epoch": 0.13916804477801806, + "ewc_loss": 0.0024739515502005816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4739514628890902e-05, + "grad_norm": 2.5633504390716553, + "learning_rate": 4.6333192030521407e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.8470662832260132, + "num_tokens": 41728864.0, + "step": 1094 + }, + { + "epoch": 0.13929525505660859, + "ewc_loss": 0.0025086847599595785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5086847017519176e-05, + "grad_norm": 2.43923282623291, + "learning_rate": 4.6375582874099196e-07, + "loss": 0.4465, + "mean_token_accuracy": 0.8526514172554016, + "num_tokens": 41770596.0, + "step": 1095 + }, + { + "epoch": 0.1394224653351991, + "ewc_loss": 0.002470678649842739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.47067855525529e-05, + "grad_norm": 2.5206565856933594, + "learning_rate": 4.6417973717676976e-07, + "loss": 0.4618, + "mean_token_accuracy": 0.8510671854019165, + "num_tokens": 41806585.0, + "step": 1096 + }, + { + "epoch": 0.1395496756137896, + "ewc_loss": 0.0024976429995149374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4976430722745135e-05, + "grad_norm": 2.500901937484741, + "learning_rate": 4.6460364561254766e-07, + "loss": 0.4924, + "mean_token_accuracy": 0.838111162185669, + "num_tokens": 41845488.0, + "step": 1097 + }, + { + "epoch": 0.13967688589238011, + "ewc_loss": 0.0024962974712252617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4962975658127107e-05, + "grad_norm": 2.568500280380249, + "learning_rate": 4.6502755404832556e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8481581211090088, + "num_tokens": 41880674.0, + "step": 1098 + }, + { + "epoch": 0.13980409617097062, + "ewc_loss": 0.0025111627764999866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5111628929153085e-05, + "grad_norm": 2.61452317237854, + 
"learning_rate": 4.654514624841034e-07, + "loss": 0.4742, + "mean_token_accuracy": 0.8439235687255859, + "num_tokens": 41913020.0, + "step": 1099 + }, + { + "epoch": 0.13993130644956112, + "ewc_loss": 0.0025174294132739305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.517429311410524e-05, + "grad_norm": 2.523369073867798, + "learning_rate": 4.6587537091988125e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.8439027070999146, + "num_tokens": 41948483.0, + "step": 1100 + }, + { + "epoch": 0.14005851672815164, + "ewc_loss": 0.002493677893653512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.493677857273724e-05, + "grad_norm": 2.530745029449463, + "learning_rate": 4.6629927935565915e-07, + "loss": 0.4521, + "mean_token_accuracy": 0.8522180914878845, + "num_tokens": 41985659.0, + "step": 1101 + }, + { + "epoch": 0.14018572700674214, + "ewc_loss": 0.002502431394532323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5024313799804077e-05, + "grad_norm": 2.502019166946411, + "learning_rate": 4.6672318779143705e-07, + "loss": 0.5447, + "mean_token_accuracy": 0.8233298063278198, + "num_tokens": 42027746.0, + "step": 1102 + }, + { + "epoch": 0.14031293728533265, + "ewc_loss": 0.00250424281693995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.504242911527399e-05, + "grad_norm": 2.573763847351074, + "learning_rate": 4.671470962272149e-07, + "loss": 0.5005, + "mean_token_accuracy": 0.836607038974762, + "num_tokens": 42060903.0, + "step": 1103 + }, + { + "epoch": 0.14044014756392317, + "ewc_loss": 0.0025312351062893867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5312350771855563e-05, + "grad_norm": 2.4654979705810547, + "learning_rate": 4.6757100466299274e-07, + "loss": 0.4983, + "mean_token_accuracy": 0.8421500325202942, + "num_tokens": 42102722.0, + "step": 1104 + }, + { + "epoch": 0.14056735784251367, + "ewc_loss": 0.0024944317992776632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4944318283814937e-05, + "grad_norm": 2.4628589153289795, + 
"learning_rate": 4.6799491309877064e-07, + "loss": 0.506, + "mean_token_accuracy": 0.8329211473464966, + "num_tokens": 42143418.0, + "step": 1105 + }, + { + "epoch": 0.14069456812110417, + "ewc_loss": 0.0025049508549273014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.504950862203259e-05, + "grad_norm": 2.5624144077301025, + "learning_rate": 4.6841882153454854e-07, + "loss": 0.4455, + "mean_token_accuracy": 0.8498564958572388, + "num_tokens": 42177313.0, + "step": 1106 + }, + { + "epoch": 0.1408217783996947, + "ewc_loss": 0.002544683637097478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5446835934417322e-05, + "grad_norm": 2.5020689964294434, + "learning_rate": 4.688427299703264e-07, + "loss": 0.4286, + "mean_token_accuracy": 0.8552693724632263, + "num_tokens": 42214158.0, + "step": 1107 + }, + { + "epoch": 0.1409489886782852, + "ewc_loss": 0.002520262263715267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5202622055076063e-05, + "grad_norm": 2.447252035140991, + "learning_rate": 4.6926663840610423e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.8429669141769409, + "num_tokens": 42254189.0, + "step": 1108 + }, + { + "epoch": 0.1410761989568757, + "ewc_loss": 0.002519751898944378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5197517970809713e-05, + "grad_norm": 2.5270416736602783, + "learning_rate": 4.6969054684188213e-07, + "loss": 0.5056, + "mean_token_accuracy": 0.8325870633125305, + "num_tokens": 42296853.0, + "step": 1109 + }, + { + "epoch": 0.14120340923546623, + "ewc_loss": 0.0025486466474831104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5486466256552376e-05, + "grad_norm": 2.567606210708618, + "learning_rate": 4.7011445527766003e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.840657114982605, + "num_tokens": 42331983.0, + "step": 1110 + }, + { + "epoch": 0.14133061951405673, + "ewc_loss": 0.002549887401983142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5498873583273962e-05, + "grad_norm": 2.542935609817505, + 
"learning_rate": 4.7053836371343787e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.8421857953071594, + "num_tokens": 42370496.0, + "step": 1111 + }, + { + "epoch": 0.14145782979264723, + "ewc_loss": 0.0025431662797927856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5431663743802346e-05, + "grad_norm": 2.5227112770080566, + "learning_rate": 4.709622721492157e-07, + "loss": 0.4642, + "mean_token_accuracy": 0.8486290574073792, + "num_tokens": 42406489.0, + "step": 1112 + }, + { + "epoch": 0.14158504007123776, + "ewc_loss": 0.0025355725083500147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5355724574183114e-05, + "grad_norm": 2.550926923751831, + "learning_rate": 4.713861805849936e-07, + "loss": 0.4654, + "mean_token_accuracy": 0.8448076248168945, + "num_tokens": 42440679.0, + "step": 1113 + }, + { + "epoch": 0.14171225034982826, + "ewc_loss": 0.002548592397943139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.548592419771012e-05, + "grad_norm": 2.6373355388641357, + "learning_rate": 4.718100890207715e-07, + "loss": 0.5366, + "mean_token_accuracy": 0.8256123661994934, + "num_tokens": 42476699.0, + "step": 1114 + }, + { + "epoch": 0.1418394606284188, + "ewc_loss": 0.0025754347443580627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.575434700702317e-05, + "grad_norm": 2.606372117996216, + "learning_rate": 4.7223399745654936e-07, + "loss": 0.4929, + "mean_token_accuracy": 0.8392828702926636, + "num_tokens": 42513348.0, + "step": 1115 + }, + { + "epoch": 0.1419666709070093, + "ewc_loss": 0.002562412992119789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5624131012591533e-05, + "grad_norm": 2.473229169845581, + "learning_rate": 4.726579058923272e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.8432967662811279, + "num_tokens": 42550633.0, + "step": 1116 + }, + { + "epoch": 0.1420938811855998, + "ewc_loss": 0.002526424825191498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5264249416068196e-05, + "grad_norm": 2.537720203399658, + "learning_rate": 
4.730818143281051e-07, + "loss": 0.5112, + "mean_token_accuracy": 0.836247444152832, + "num_tokens": 42593649.0, + "step": 1117 + }, + { + "epoch": 0.14222109146419032, + "ewc_loss": 0.002564667258411646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5646673748269677e-05, + "grad_norm": 2.5766313076019287, + "learning_rate": 4.7350572276388295e-07, + "loss": 0.4571, + "mean_token_accuracy": 0.8497312068939209, + "num_tokens": 42632379.0, + "step": 1118 + }, + { + "epoch": 0.14234830174278082, + "ewc_loss": 0.0025727967731654644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.572796802269295e-05, + "grad_norm": 2.531637191772461, + "learning_rate": 4.7392963119966085e-07, + "loss": 0.484, + "mean_token_accuracy": 0.8397172093391418, + "num_tokens": 42670319.0, + "step": 1119 + }, + { + "epoch": 0.14247551202137132, + "ewc_loss": 0.0025587643031030893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5587642085156403e-05, + "grad_norm": 2.46972918510437, + "learning_rate": 4.7435353963543875e-07, + "loss": 0.4771, + "mean_token_accuracy": 0.8471313714981079, + "num_tokens": 42712680.0, + "step": 1120 + }, + { + "epoch": 0.14260272229996185, + "ewc_loss": 0.0025481011252850294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5481011107331142e-05, + "grad_norm": 2.585799217224121, + "learning_rate": 4.747774480712166e-07, + "loss": 0.4942, + "mean_token_accuracy": 0.8416333198547363, + "num_tokens": 42748241.0, + "step": 1121 + }, + { + "epoch": 0.14272993257855235, + "ewc_loss": 0.0025897007435560226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5897006707964465e-05, + "grad_norm": 2.586625576019287, + "learning_rate": 4.7520135650699444e-07, + "loss": 0.5134, + "mean_token_accuracy": 0.8336547613143921, + "num_tokens": 42783208.0, + "step": 1122 + }, + { + "epoch": 0.14285714285714285, + "ewc_loss": 0.0025862352922558784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5862353140837513e-05, + "grad_norm": 2.578294038772583, + "learning_rate": 
4.7562526494277234e-07, + "loss": 0.4886, + "mean_token_accuracy": 0.8420357704162598, + "num_tokens": 42818570.0, + "step": 1123 + }, + { + "epoch": 0.14298435313573338, + "ewc_loss": 0.002577276900410652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5772769731702283e-05, + "grad_norm": 2.5882437229156494, + "learning_rate": 4.7604917337855024e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.8574024438858032, + "num_tokens": 42852891.0, + "step": 1124 + }, + { + "epoch": 0.14311156341432388, + "ewc_loss": 0.002584392437711358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.584392495919019e-05, + "grad_norm": 2.5320088863372803, + "learning_rate": 4.764730818143281e-07, + "loss": 0.4769, + "mean_token_accuracy": 0.8382540941238403, + "num_tokens": 42887359.0, + "step": 1125 + }, + { + "epoch": 0.14323877369291438, + "ewc_loss": 0.002571482677012682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5714827643241733e-05, + "grad_norm": 2.558047294616699, + "learning_rate": 4.768969902501059e-07, + "loss": 0.529, + "mean_token_accuracy": 0.8281447291374207, + "num_tokens": 42929308.0, + "step": 1126 + }, + { + "epoch": 0.1433659839715049, + "ewc_loss": 0.0025818811263889074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5818810172495432e-05, + "grad_norm": 2.4997103214263916, + "learning_rate": 4.773208986858838e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8400875329971313, + "num_tokens": 42969373.0, + "step": 1127 + }, + { + "epoch": 0.1434931942500954, + "ewc_loss": 0.002579537918791175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5795379769988358e-05, + "grad_norm": 2.5536000728607178, + "learning_rate": 4.777448071216617e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.8424968123435974, + "num_tokens": 43010133.0, + "step": 1128 + }, + { + "epoch": 0.1436204045286859, + "ewc_loss": 0.0025979347992688417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.597934872028418e-05, + "grad_norm": 2.469571352005005, + "learning_rate": 
4.781687155574396e-07, + "loss": 0.5078, + "mean_token_accuracy": 0.8336220383644104, + "num_tokens": 43053743.0, + "step": 1129 + }, + { + "epoch": 0.14374761480727644, + "ewc_loss": 0.0025652677286416292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5652678232290782e-05, + "grad_norm": 2.4239261150360107, + "learning_rate": 4.785926239932175e-07, + "loss": 0.4502, + "mean_token_accuracy": 0.8517299890518188, + "num_tokens": 43094125.0, + "step": 1130 + }, + { + "epoch": 0.14387482508586694, + "ewc_loss": 0.002564028836786747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5640287276473828e-05, + "grad_norm": 2.7012977600097656, + "learning_rate": 4.790165324289953e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.8429585695266724, + "num_tokens": 43132399.0, + "step": 1131 + }, + { + "epoch": 0.14400203536445744, + "ewc_loss": 0.0026453931350260973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6453930331626907e-05, + "grad_norm": 2.620844602584839, + "learning_rate": 4.794404408647732e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8412978053092957, + "num_tokens": 43169036.0, + "step": 1132 + }, + { + "epoch": 0.14412924564304797, + "ewc_loss": 0.0026067886501550674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.606788621051237e-05, + "grad_norm": 2.470670700073242, + "learning_rate": 4.798643493005511e-07, + "loss": 0.4238, + "mean_token_accuracy": 0.85849928855896, + "num_tokens": 43209397.0, + "step": 1133 + }, + { + "epoch": 0.14425645592163847, + "ewc_loss": 0.002549931174144149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5499311959720217e-05, + "grad_norm": 2.529244899749756, + "learning_rate": 4.80288257736329e-07, + "loss": 0.5193, + "mean_token_accuracy": 0.833381175994873, + "num_tokens": 43246788.0, + "step": 1134 + }, + { + "epoch": 0.14438366620022897, + "ewc_loss": 0.0025923526845872402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5923527573468164e-05, + "grad_norm": 2.513521194458008, + "learning_rate": 
4.807121661721068e-07, + "loss": 0.4326, + "mean_token_accuracy": 0.8575906753540039, + "num_tokens": 43282774.0, + "step": 1135 + }, + { + "epoch": 0.1445108764788195, + "ewc_loss": 0.0025879787281155586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.58797881542705e-05, + "grad_norm": 2.520770311355591, + "learning_rate": 4.811360746078847e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.8444680571556091, + "num_tokens": 43322303.0, + "step": 1136 + }, + { + "epoch": 0.14463808675741, + "ewc_loss": 0.0025833596009761095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5833594918367453e-05, + "grad_norm": 2.6818740367889404, + "learning_rate": 4.815599830436625e-07, + "loss": 0.4815, + "mean_token_accuracy": 0.8431477546691895, + "num_tokens": 43354611.0, + "step": 1137 + }, + { + "epoch": 0.1447652970360005, + "ewc_loss": 0.0026351360138505697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.635135933815036e-05, + "grad_norm": 2.5636894702911377, + "learning_rate": 4.819838914794405e-07, + "loss": 0.4641, + "mean_token_accuracy": 0.8479411602020264, + "num_tokens": 43390640.0, + "step": 1138 + }, + { + "epoch": 0.14489250731459102, + "ewc_loss": 0.0025911256670951843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5911256670951843e-05, + "grad_norm": 2.552591562271118, + "learning_rate": 4.824077999152183e-07, + "loss": 0.4824, + "mean_token_accuracy": 0.8383772969245911, + "num_tokens": 43424513.0, + "step": 1139 + }, + { + "epoch": 0.14501971759318152, + "ewc_loss": 0.002587998053058982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5879980967147276e-05, + "grad_norm": 2.5247561931610107, + "learning_rate": 4.828317083509962e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.84007728099823, + "num_tokens": 43461699.0, + "step": 1140 + }, + { + "epoch": 0.14514692787177205, + "ewc_loss": 0.0025919743347913027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5919742256519385e-05, + "grad_norm": 2.6118204593658447, + "learning_rate": 
4.83255616786774e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.853083610534668, + "num_tokens": 43494681.0, + "step": 1141 + }, + { + "epoch": 0.14527413815036255, + "ewc_loss": 0.0026274060364812613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.627406138344668e-05, + "grad_norm": 2.5176098346710205, + "learning_rate": 4.83679525222552e-07, + "loss": 0.44, + "mean_token_accuracy": 0.8547719120979309, + "num_tokens": 43531318.0, + "step": 1142 + }, + { + "epoch": 0.14540134842895305, + "ewc_loss": 0.002601310843601823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.601310916361399e-05, + "grad_norm": 2.4979920387268066, + "learning_rate": 4.841034336583298e-07, + "loss": 0.4569, + "mean_token_accuracy": 0.8547895550727844, + "num_tokens": 43569991.0, + "step": 1143 + }, + { + "epoch": 0.14552855870754358, + "ewc_loss": 0.00260225054807961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6022506062872708e-05, + "grad_norm": 2.5417706966400146, + "learning_rate": 4.845273420941076e-07, + "loss": 0.4443, + "mean_token_accuracy": 0.8564649820327759, + "num_tokens": 43609978.0, + "step": 1144 + }, + { + "epoch": 0.14565576898613408, + "ewc_loss": 0.0026166706811636686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6166706447838806e-05, + "grad_norm": 2.660782814025879, + "learning_rate": 4.849512505298855e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8340598940849304, + "num_tokens": 43640419.0, + "step": 1145 + }, + { + "epoch": 0.14578297926472458, + "ewc_loss": 0.002653787611052394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6537876692600548e-05, + "grad_norm": 2.4891836643218994, + "learning_rate": 4.853751589656634e-07, + "loss": 0.4543, + "mean_token_accuracy": 0.8505834341049194, + "num_tokens": 43682408.0, + "step": 1146 + }, + { + "epoch": 0.1459101895433151, + "ewc_loss": 0.002592124743387103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.592124837974552e-05, + "grad_norm": 2.4863672256469727, + "learning_rate": 4.857990674014413e-07, 
+ "loss": 0.4889, + "mean_token_accuracy": 0.8400452733039856, + "num_tokens": 43726158.0, + "step": 1147 + }, + { + "epoch": 0.1460373998219056, + "ewc_loss": 0.0026032698806375265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.603269967949018e-05, + "grad_norm": 2.5321788787841797, + "learning_rate": 4.862229758372191e-07, + "loss": 0.5186, + "mean_token_accuracy": 0.8292228579521179, + "num_tokens": 43766466.0, + "step": 1148 + }, + { + "epoch": 0.1461646101004961, + "ewc_loss": 0.0026276097632944584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6276096832589246e-05, + "grad_norm": 2.5123770236968994, + "learning_rate": 4.86646884272997e-07, + "loss": 0.508, + "mean_token_accuracy": 0.8322412967681885, + "num_tokens": 43807255.0, + "step": 1149 + }, + { + "epoch": 0.14629182037908664, + "ewc_loss": 0.00261692120693624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6169213015236892e-05, + "grad_norm": 2.5430312156677246, + "learning_rate": 4.870707927087749e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.8402764797210693, + "num_tokens": 43845419.0, + "step": 1150 + }, + { + "epoch": 0.14641903065767714, + "ewc_loss": 0.0026324549689888954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6324549253331497e-05, + "grad_norm": 2.528470039367676, + "learning_rate": 4.874947011445528e-07, + "loss": 0.4935, + "mean_token_accuracy": 0.8413658142089844, + "num_tokens": 43882942.0, + "step": 1151 + }, + { + "epoch": 0.14654624093626764, + "ewc_loss": 0.002625470282509923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.625470187922474e-05, + "grad_norm": 2.4353690147399902, + "learning_rate": 4.879186095803306e-07, + "loss": 0.4178, + "mean_token_accuracy": 0.8627387881278992, + "num_tokens": 43923109.0, + "step": 1152 + }, + { + "epoch": 0.14667345121485817, + "ewc_loss": 0.002602645196020603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6026451450889e-05, + "grad_norm": 2.5888404846191406, + "learning_rate": 4.883425180161085e-07, + "loss": 0.4531, + 
"mean_token_accuracy": 0.8504862785339355, + "num_tokens": 43959752.0, + "step": 1153 + }, + { + "epoch": 0.14680066149344867, + "ewc_loss": 0.0026588942855596542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6588943001115695e-05, + "grad_norm": 2.602107524871826, + "learning_rate": 4.887664264518864e-07, + "loss": 0.499, + "mean_token_accuracy": 0.841960608959198, + "num_tokens": 43996247.0, + "step": 1154 + }, + { + "epoch": 0.14692787177203917, + "ewc_loss": 0.0026556234806776047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6556233933661133e-05, + "grad_norm": 2.5254101753234863, + "learning_rate": 4.891903348876643e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.8450879454612732, + "num_tokens": 44035751.0, + "step": 1155 + }, + { + "epoch": 0.1470550820506297, + "ewc_loss": 0.002628057263791561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.628057154652197e-05, + "grad_norm": 2.5771052837371826, + "learning_rate": 4.896142433234421e-07, + "loss": 0.5178, + "mean_token_accuracy": 0.8379005193710327, + "num_tokens": 44073225.0, + "step": 1156 + }, + { + "epoch": 0.1471822923292202, + "ewc_loss": 0.002642351668328047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.642351682879962e-05, + "grad_norm": 2.57509446144104, + "learning_rate": 4.9003815175922e-07, + "loss": 0.521, + "mean_token_accuracy": 0.8369173407554626, + "num_tokens": 44111761.0, + "step": 1157 + }, + { + "epoch": 0.1473095026078107, + "ewc_loss": 0.002650547306984663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6505473215365782e-05, + "grad_norm": 2.5172104835510254, + "learning_rate": 4.904620601949979e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.8450880646705627, + "num_tokens": 44151311.0, + "step": 1158 + }, + { + "epoch": 0.14743671288640123, + "ewc_loss": 0.0026308917440474033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6308916858397424e-05, + "grad_norm": 2.6545445919036865, + "learning_rate": 4.908859686307758e-07, + "loss": 0.5135, + "mean_token_accuracy": 
0.8353486061096191, + "num_tokens": 44184623.0, + "step": 1159 + }, + { + "epoch": 0.14756392316499173, + "ewc_loss": 0.002675172872841358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.675172800081782e-05, + "grad_norm": 2.5702550411224365, + "learning_rate": 4.913098770665536e-07, + "loss": 0.5025, + "mean_token_accuracy": 0.840517520904541, + "num_tokens": 44221753.0, + "step": 1160 + }, + { + "epoch": 0.14769113344358223, + "ewc_loss": 0.002646728651598096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6467287170817144e-05, + "grad_norm": 2.5223798751831055, + "learning_rate": 4.917337855023314e-07, + "loss": 0.4552, + "mean_token_accuracy": 0.846028208732605, + "num_tokens": 44262542.0, + "step": 1161 + }, + { + "epoch": 0.14781834372217276, + "ewc_loss": 0.002638518810272217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6385187084088102e-05, + "grad_norm": 2.580219030380249, + "learning_rate": 4.921576939381094e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.8487656712532043, + "num_tokens": 44300294.0, + "step": 1162 + }, + { + "epoch": 0.14794555400076326, + "ewc_loss": 0.0026631460059434175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6631460059434175e-05, + "grad_norm": 2.570003032684326, + "learning_rate": 4.925816023738872e-07, + "loss": 0.5219, + "mean_token_accuracy": 0.8302323818206787, + "num_tokens": 44338476.0, + "step": 1163 + }, + { + "epoch": 0.14807276427935376, + "ewc_loss": 0.0026653253007680178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.665325337147806e-05, + "grad_norm": 2.6893703937530518, + "learning_rate": 4.930055108096651e-07, + "loss": 0.4219, + "mean_token_accuracy": 0.8611538410186768, + "num_tokens": 44367437.0, + "step": 1164 + }, + { + "epoch": 0.1481999745579443, + "ewc_loss": 0.0026988685131073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6988684112438932e-05, + "grad_norm": 2.6090238094329834, + "learning_rate": 4.934294192454429e-07, + "loss": 0.5361, + "mean_token_accuracy": 0.8265691995620728, 
+ "num_tokens": 44402636.0, + "step": 1165 + }, + { + "epoch": 0.1483271848365348, + "ewc_loss": 0.0026683432515710592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6683432224672288e-05, + "grad_norm": 2.5452728271484375, + "learning_rate": 4.938533276812209e-07, + "loss": 0.4788, + "mean_token_accuracy": 0.8447316884994507, + "num_tokens": 44438187.0, + "step": 1166 + }, + { + "epoch": 0.14845439511512531, + "ewc_loss": 0.0026537899393588305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6537900339462794e-05, + "grad_norm": 2.4632339477539062, + "learning_rate": 4.942772361169987e-07, + "loss": 0.4004, + "mean_token_accuracy": 0.8630948066711426, + "num_tokens": 44478345.0, + "step": 1167 + }, + { + "epoch": 0.14858160539371582, + "ewc_loss": 0.0026515042409300804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6515042918617837e-05, + "grad_norm": 2.466770887374878, + "learning_rate": 4.947011445527766e-07, + "loss": 0.478, + "mean_token_accuracy": 0.8448219299316406, + "num_tokens": 44520272.0, + "step": 1168 + }, + { + "epoch": 0.14870881567230632, + "ewc_loss": 0.0026679006405174732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.667900662345346e-05, + "grad_norm": 2.503427505493164, + "learning_rate": 4.951250529885544e-07, + "loss": 0.4336, + "mean_token_accuracy": 0.859359085559845, + "num_tokens": 44561058.0, + "step": 1169 + }, + { + "epoch": 0.14883602595089684, + "ewc_loss": 0.0026783510111272335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6783509383676574e-05, + "grad_norm": 2.6392550468444824, + "learning_rate": 4.955489614243324e-07, + "loss": 0.5638, + "mean_token_accuracy": 0.8148790597915649, + "num_tokens": 44599371.0, + "step": 1170 + }, + { + "epoch": 0.14896323622948734, + "ewc_loss": 0.0027176032308489084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.717603274504654e-05, + "grad_norm": 2.499070405960083, + "learning_rate": 4.959728698601102e-07, + "loss": 0.4759, + "mean_token_accuracy": 0.8375464677810669, + "num_tokens": 
44640499.0, + "step": 1171 + }, + { + "epoch": 0.14909044650807785, + "ewc_loss": 0.0026652768719941378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6652769520296715e-05, + "grad_norm": 2.4714105129241943, + "learning_rate": 4.963967782958881e-07, + "loss": 0.4639, + "mean_token_accuracy": 0.8489002585411072, + "num_tokens": 44683164.0, + "step": 1172 + }, + { + "epoch": 0.14921765678666837, + "ewc_loss": 0.0026632824447005987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6632824301486835e-05, + "grad_norm": 2.535808801651001, + "learning_rate": 4.968206867316659e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.8448361754417419, + "num_tokens": 44721938.0, + "step": 1173 + }, + { + "epoch": 0.14934486706525887, + "ewc_loss": 0.0026923888362944126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.692388807190582e-05, + "grad_norm": 2.730799436569214, + "learning_rate": 4.972445951674439e-07, + "loss": 0.4464, + "mean_token_accuracy": 0.8518639802932739, + "num_tokens": 44753506.0, + "step": 1174 + }, + { + "epoch": 0.14947207734384937, + "ewc_loss": 0.002746324287727475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7463243895908818e-05, + "grad_norm": 2.6467857360839844, + "learning_rate": 4.976685036032216e-07, + "loss": 0.5365, + "mean_token_accuracy": 0.824925422668457, + "num_tokens": 44792170.0, + "step": 1175 + }, + { + "epoch": 0.1495992876224399, + "ewc_loss": 0.002708230633288622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7082305678050034e-05, + "grad_norm": 2.466728925704956, + "learning_rate": 4.980924120389996e-07, + "loss": 0.4154, + "mean_token_accuracy": 0.8644267320632935, + "num_tokens": 44830733.0, + "step": 1176 + }, + { + "epoch": 0.1497264979010304, + "ewc_loss": 0.0026562500279396772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.656250035215635e-05, + "grad_norm": 2.507190465927124, + "learning_rate": 4.985163204747774e-07, + "loss": 0.427, + "mean_token_accuracy": 0.8594459295272827, + "num_tokens": 44869743.0, + 
"step": 1177 + }, + { + "epoch": 0.1498537081796209, + "ewc_loss": 0.0026945758145302534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6945757781504653e-05, + "grad_norm": 2.5325589179992676, + "learning_rate": 4.989402289105554e-07, + "loss": 0.4737, + "mean_token_accuracy": 0.8508780598640442, + "num_tokens": 44913358.0, + "step": 1178 + }, + { + "epoch": 0.14998091845821143, + "ewc_loss": 0.002701397053897381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7013969884137623e-05, + "grad_norm": 2.6939330101013184, + "learning_rate": 4.993641373463331e-07, + "loss": 0.533, + "mean_token_accuracy": 0.8263342380523682, + "num_tokens": 44949963.0, + "step": 1179 + }, + { + "epoch": 0.15010812873680193, + "ewc_loss": 0.002740978030487895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7409780159359798e-05, + "grad_norm": 2.5361123085021973, + "learning_rate": 4.997880457821111e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.8388289213180542, + "num_tokens": 44988569.0, + "step": 1180 + }, + { + "epoch": 0.15023533901539243, + "ewc_loss": 0.0026869908906519413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6869909561355598e-05, + "grad_norm": 2.4705731868743896, + "learning_rate": 5.002119542178889e-07, + "loss": 0.4649, + "mean_token_accuracy": 0.8472936153411865, + "num_tokens": 45032634.0, + "step": 1181 + }, + { + "epoch": 0.15036254929398296, + "ewc_loss": 0.002676208270713687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6762081688502803e-05, + "grad_norm": 2.541600465774536, + "learning_rate": 5.006358626536667e-07, + "loss": 0.4347, + "mean_token_accuracy": 0.8577492237091064, + "num_tokens": 45069685.0, + "step": 1182 + }, + { + "epoch": 0.15048975957257346, + "ewc_loss": 0.0027152583934366703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.715258415264543e-05, + "grad_norm": 2.5588538646698, + "learning_rate": 5.010597710894446e-07, + "loss": 0.457, + "mean_token_accuracy": 0.8501347303390503, + "num_tokens": 45107481.0, + "step": 1183 + }, 
+ { + "epoch": 0.15061696985116396, + "ewc_loss": 0.0027144914492964745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7144915293320082e-05, + "grad_norm": 2.5517313480377197, + "learning_rate": 5.014836795252225e-07, + "loss": 0.4423, + "mean_token_accuracy": 0.8537200689315796, + "num_tokens": 45145747.0, + "step": 1184 + }, + { + "epoch": 0.1507441801297545, + "ewc_loss": 0.0027073342353105545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7073343517258763e-05, + "grad_norm": 2.5714688301086426, + "learning_rate": 5.019075879610004e-07, + "loss": 0.4484, + "mean_token_accuracy": 0.8559057116508484, + "num_tokens": 45183140.0, + "step": 1185 + }, + { + "epoch": 0.150871390408345, + "ewc_loss": 0.0027091731317341328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.709173168113921e-05, + "grad_norm": 2.5023133754730225, + "learning_rate": 5.023314963967783e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8408783674240112, + "num_tokens": 45224911.0, + "step": 1186 + }, + { + "epoch": 0.1509986006869355, + "ewc_loss": 0.0026905627455562353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6905627237283625e-05, + "grad_norm": 2.664280414581299, + "learning_rate": 5.027554048325562e-07, + "loss": 0.5347, + "mean_token_accuracy": 0.8267614245414734, + "num_tokens": 45257388.0, + "step": 1187 + }, + { + "epoch": 0.15112581096552602, + "ewc_loss": 0.0027500742580741644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7500742362462915e-05, + "grad_norm": 2.566565752029419, + "learning_rate": 5.03179313268334e-07, + "loss": 0.5217, + "mean_token_accuracy": 0.8317645788192749, + "num_tokens": 45297876.0, + "step": 1188 + }, + { + "epoch": 0.15125302124411652, + "ewc_loss": 0.0027030983474105597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7030982892028987e-05, + "grad_norm": 2.5890183448791504, + "learning_rate": 5.036032217041119e-07, + "loss": 0.4977, + "mean_token_accuracy": 0.8350836634635925, + "num_tokens": 45334040.0, + "step": 1189 + }, + { + "epoch": 
0.15138023152270705, + "ewc_loss": 0.0027173138223588467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.71731387329055e-05, + "grad_norm": 2.4772467613220215, + "learning_rate": 5.040271301398897e-07, + "loss": 0.4808, + "mean_token_accuracy": 0.8401468992233276, + "num_tokens": 45375985.0, + "step": 1190 + }, + { + "epoch": 0.15150744180129755, + "ewc_loss": 0.0026878931093961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6878931748797186e-05, + "grad_norm": 2.484642505645752, + "learning_rate": 5.044510385756676e-07, + "loss": 0.449, + "mean_token_accuracy": 0.848978579044342, + "num_tokens": 45419951.0, + "step": 1191 + }, + { + "epoch": 0.15163465207988805, + "ewc_loss": 0.0027013930957764387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7013929866370745e-05, + "grad_norm": 2.523327350616455, + "learning_rate": 5.048749470114455e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.8495953679084778, + "num_tokens": 45456988.0, + "step": 1192 + }, + { + "epoch": 0.15176186235847858, + "ewc_loss": 0.0027178965974599123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7178964955965057e-05, + "grad_norm": 2.692898750305176, + "learning_rate": 5.052988554472234e-07, + "loss": 0.4807, + "mean_token_accuracy": 0.8410559296607971, + "num_tokens": 45488226.0, + "step": 1193 + }, + { + "epoch": 0.15188907263706908, + "ewc_loss": 0.0027693912852555513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.769391176116187e-05, + "grad_norm": 2.5654516220092773, + "learning_rate": 5.057227638830013e-07, + "loss": 0.4803, + "mean_token_accuracy": 0.8439781665802002, + "num_tokens": 45524661.0, + "step": 1194 + }, + { + "epoch": 0.15201628291565958, + "ewc_loss": 0.0027175333816558123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.717533425311558e-05, + "grad_norm": 2.604012966156006, + "learning_rate": 5.061466723187792e-07, + "loss": 0.4831, + "mean_token_accuracy": 0.8436267971992493, + "num_tokens": 45563491.0, + "step": 1195 + }, + { + "epoch": 0.1521434931942501, 
+ "ewc_loss": 0.002741579432040453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.741579373832792e-05, + "grad_norm": 2.673365831375122, + "learning_rate": 5.065705807545569e-07, + "loss": 0.4516, + "mean_token_accuracy": 0.8519602417945862, + "num_tokens": 45596771.0, + "step": 1196 + }, + { + "epoch": 0.1522707034728406, + "ewc_loss": 0.0027668941766023636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7668940674629994e-05, + "grad_norm": 2.493177652359009, + "learning_rate": 5.069944891903349e-07, + "loss": 0.4671, + "mean_token_accuracy": 0.8454790115356445, + "num_tokens": 45640049.0, + "step": 1197 + }, + { + "epoch": 0.1523979137514311, + "ewc_loss": 0.002705286256968975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7052861696574837e-05, + "grad_norm": 2.619102954864502, + "learning_rate": 5.074183976261127e-07, + "loss": 0.47, + "mean_token_accuracy": 0.8440405130386353, + "num_tokens": 45677375.0, + "step": 1198 + }, + { + "epoch": 0.15252512403002164, + "ewc_loss": 0.002766819903627038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7668198526953347e-05, + "grad_norm": 2.5785834789276123, + "learning_rate": 5.078423060618906e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8463661074638367, + "num_tokens": 45719253.0, + "step": 1199 + }, + { + "epoch": 0.15265233430861214, + "ewc_loss": 0.002745056990534067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7450570996734314e-05, + "grad_norm": 2.5800065994262695, + "learning_rate": 5.082662144976685e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8428896069526672, + "num_tokens": 45756065.0, + "step": 1200 + }, + { + "epoch": 0.15277954458720264, + "ewc_loss": 0.0027446397580206394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.744639823504258e-05, + "grad_norm": 2.5644404888153076, + "learning_rate": 5.086901229334464e-07, + "loss": 0.4609, + "mean_token_accuracy": 0.8515443801879883, + "num_tokens": 45791707.0, + "step": 1201 + }, + { + "epoch": 0.15290675486579317, + "ewc_loss": 
0.002751828171312809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7518282877281308e-05, + "grad_norm": 2.580566883087158, + "learning_rate": 5.091140313692243e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.8426980972290039, + "num_tokens": 45828464.0, + "step": 1202 + }, + { + "epoch": 0.15303396514438367, + "ewc_loss": 0.0027546309866011143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7546309866011143e-05, + "grad_norm": 2.5301907062530518, + "learning_rate": 5.095379398050022e-07, + "loss": 0.4566, + "mean_token_accuracy": 0.848473846912384, + "num_tokens": 45867046.0, + "step": 1203 + }, + { + "epoch": 0.15316117542297417, + "ewc_loss": 0.0027466739993542433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7466739993542433e-05, + "grad_norm": 2.6869959831237793, + "learning_rate": 5.099618482407799e-07, + "loss": 0.4854, + "mean_token_accuracy": 0.8399938941001892, + "num_tokens": 45900530.0, + "step": 1204 + }, + { + "epoch": 0.1532883857015647, + "ewc_loss": 0.00279766833409667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7976682758890092e-05, + "grad_norm": 2.581249475479126, + "learning_rate": 5.103857566765578e-07, + "loss": 0.4621, + "mean_token_accuracy": 0.848006010055542, + "num_tokens": 45940002.0, + "step": 1205 + }, + { + "epoch": 0.1534155959801552, + "ewc_loss": 0.0027601562906056643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.760156348813325e-05, + "grad_norm": 2.596294641494751, + "learning_rate": 5.108096651123357e-07, + "loss": 0.4279, + "mean_token_accuracy": 0.8592420816421509, + "num_tokens": 45975299.0, + "step": 1206 + }, + { + "epoch": 0.1535428062587457, + "ewc_loss": 0.002765495330095291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7654952646116726e-05, + "grad_norm": 2.5876920223236084, + "learning_rate": 5.112335735481135e-07, + "loss": 0.467, + "mean_token_accuracy": 0.8460529446601868, + "num_tokens": 46012693.0, + "step": 1207 + }, + { + "epoch": 0.15367001653733622, + "ewc_loss": 
0.0027717279735952616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7717280318029225e-05, + "grad_norm": 2.4849774837493896, + "learning_rate": 5.116574819838915e-07, + "loss": 0.3759, + "mean_token_accuracy": 0.8764937520027161, + "num_tokens": 46051558.0, + "step": 1208 + }, + { + "epoch": 0.15379722681592672, + "ewc_loss": 0.0027403703425079584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.740370291576255e-05, + "grad_norm": 2.632523536682129, + "learning_rate": 5.120813904196693e-07, + "loss": 0.4568, + "mean_token_accuracy": 0.851845383644104, + "num_tokens": 46087173.0, + "step": 1209 + }, + { + "epoch": 0.15392443709451722, + "ewc_loss": 0.002792621497064829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7926214897888713e-05, + "grad_norm": 2.6543238162994385, + "learning_rate": 5.125052988554473e-07, + "loss": 0.5124, + "mean_token_accuracy": 0.8358270525932312, + "num_tokens": 46120482.0, + "step": 1210 + }, + { + "epoch": 0.15405164737310775, + "ewc_loss": 0.0027918776031583548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7918777050217614e-05, + "grad_norm": 2.601687431335449, + "learning_rate": 5.12929207291225e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.8488711714744568, + "num_tokens": 46154478.0, + "step": 1211 + }, + { + "epoch": 0.15417885765169825, + "ewc_loss": 0.0027717328630387783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7717327611753717e-05, + "grad_norm": 2.5347280502319336, + "learning_rate": 5.133531157270029e-07, + "loss": 0.4363, + "mean_token_accuracy": 0.8572784662246704, + "num_tokens": 46195012.0, + "step": 1212 + }, + { + "epoch": 0.15430606793028875, + "ewc_loss": 0.0027631919365376234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7631918783299625e-05, + "grad_norm": 2.5556976795196533, + "learning_rate": 5.137770241627808e-07, + "loss": 0.4023, + "mean_token_accuracy": 0.8665189743041992, + "num_tokens": 46230641.0, + "step": 1213 + }, + { + "epoch": 0.15443327820887928, + "ewc_loss": 
0.0027919525746256113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.791952465486247e-05, + "grad_norm": 2.6168196201324463, + "learning_rate": 5.142009325985587e-07, + "loss": 0.4404, + "mean_token_accuracy": 0.8573288917541504, + "num_tokens": 46268993.0, + "step": 1214 + }, + { + "epoch": 0.15456048848746978, + "ewc_loss": 0.0028026639483869076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8026639483869076e-05, + "grad_norm": 2.5134806632995605, + "learning_rate": 5.146248410343365e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8451399207115173, + "num_tokens": 46309310.0, + "step": 1215 + }, + { + "epoch": 0.1546876987660603, + "ewc_loss": 0.0027674168813973665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7674168450175785e-05, + "grad_norm": 2.481476306915283, + "learning_rate": 5.150487494701145e-07, + "loss": 0.4151, + "mean_token_accuracy": 0.8623610734939575, + "num_tokens": 46352415.0, + "step": 1216 + }, + { + "epoch": 0.1548149090446508, + "ewc_loss": 0.002763657597824931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7636575396172702e-05, + "grad_norm": 2.6133193969726562, + "learning_rate": 5.154726579058923e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.8455712795257568, + "num_tokens": 46387838.0, + "step": 1217 + }, + { + "epoch": 0.1549421193232413, + "ewc_loss": 0.002812914317473769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8129143174737692e-05, + "grad_norm": 2.4890105724334717, + "learning_rate": 5.158965663416703e-07, + "loss": 0.4656, + "mean_token_accuracy": 0.8472476005554199, + "num_tokens": 46432669.0, + "step": 1218 + }, + { + "epoch": 0.15506932960183184, + "ewc_loss": 0.002769761485978961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.769761522358749e-05, + "grad_norm": 2.48388409614563, + "learning_rate": 5.16320474777448e-07, + "loss": 0.4231, + "mean_token_accuracy": 0.8597022294998169, + "num_tokens": 46474383.0, + "step": 1219 + }, + { + "epoch": 0.15519653988042234, + "ewc_loss": 
0.002771869534626603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7718695491785184e-05, + "grad_norm": 2.5827884674072266, + "learning_rate": 5.167443832132259e-07, + "loss": 0.4708, + "mean_token_accuracy": 0.844717264175415, + "num_tokens": 46514263.0, + "step": 1220 + }, + { + "epoch": 0.15532375015901284, + "ewc_loss": 0.0028023223858326674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8023223421769217e-05, + "grad_norm": 2.5808138847351074, + "learning_rate": 5.171682916490038e-07, + "loss": 0.4407, + "mean_token_accuracy": 0.8545519113540649, + "num_tokens": 46549835.0, + "step": 1221 + }, + { + "epoch": 0.15545096043760337, + "ewc_loss": 0.0027966066263616085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7966067136731e-05, + "grad_norm": 2.628019332885742, + "learning_rate": 5.175922000847816e-07, + "loss": 0.4267, + "mean_token_accuracy": 0.8578535318374634, + "num_tokens": 46584811.0, + "step": 1222 + }, + { + "epoch": 0.15557817071619387, + "ewc_loss": 0.002803188282996416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.80318836303195e-05, + "grad_norm": 2.5562820434570312, + "learning_rate": 5.180161085205595e-07, + "loss": 0.4491, + "mean_token_accuracy": 0.8519055843353271, + "num_tokens": 46621441.0, + "step": 1223 + }, + { + "epoch": 0.15570538099478437, + "ewc_loss": 0.0027862077113240957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7862077331519686e-05, + "grad_norm": 2.564471483230591, + "learning_rate": 5.184400169563374e-07, + "loss": 0.5002, + "mean_token_accuracy": 0.8374488949775696, + "num_tokens": 46661572.0, + "step": 1224 + }, + { + "epoch": 0.1558325912733749, + "ewc_loss": 0.002789967693388462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7899677661480382e-05, + "grad_norm": 2.5438942909240723, + "learning_rate": 5.188639253921153e-07, + "loss": 0.4756, + "mean_token_accuracy": 0.8431648015975952, + "num_tokens": 46702157.0, + "step": 1225 + }, + { + "epoch": 0.1559598015519654, + "ewc_loss": 
0.002789908554404974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.789908648992423e-05, + "grad_norm": 2.5622260570526123, + "learning_rate": 5.192878338278932e-07, + "loss": 0.4497, + "mean_token_accuracy": 0.8514975309371948, + "num_tokens": 46739119.0, + "step": 1226 + }, + { + "epoch": 0.1560870118305559, + "ewc_loss": 0.0027993549592792988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.799354842863977e-05, + "grad_norm": 2.5955638885498047, + "learning_rate": 5.19711742263671e-07, + "loss": 0.5098, + "mean_token_accuracy": 0.8340878486633301, + "num_tokens": 46783990.0, + "step": 1227 + }, + { + "epoch": 0.15621422210914643, + "ewc_loss": 0.0028079773765057325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8079773983336054e-05, + "grad_norm": 2.6657016277313232, + "learning_rate": 5.201356506994488e-07, + "loss": 0.4508, + "mean_token_accuracy": 0.8467807173728943, + "num_tokens": 46816131.0, + "step": 1228 + }, + { + "epoch": 0.15634143238773693, + "ewc_loss": 0.0028282892890274525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8282893254072405e-05, + "grad_norm": 2.617070436477661, + "learning_rate": 5.205595591352268e-07, + "loss": 0.5302, + "mean_token_accuracy": 0.8279269933700562, + "num_tokens": 46856453.0, + "step": 1229 + }, + { + "epoch": 0.15646864266632743, + "ewc_loss": 0.002804899588227272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8048994863638654e-05, + "grad_norm": 2.53946590423584, + "learning_rate": 5.209834675710046e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.846341609954834, + "num_tokens": 46897961.0, + "step": 1230 + }, + { + "epoch": 0.15659585294491796, + "ewc_loss": 0.00279063917696476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.790639155136887e-05, + "grad_norm": 2.6945879459381104, + "learning_rate": 5.214073760067825e-07, + "loss": 0.5435, + "mean_token_accuracy": 0.8256620168685913, + "num_tokens": 46931961.0, + "step": 1231 + }, + { + "epoch": 0.15672306322350846, + "ewc_loss": 
0.002849217038601637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.849216980393976e-05, + "grad_norm": 2.593782663345337, + "learning_rate": 5.218312844425604e-07, + "loss": 0.4645, + "mean_token_accuracy": 0.8495328426361084, + "num_tokens": 46967822.0, + "step": 1232 + }, + { + "epoch": 0.15685027350209896, + "ewc_loss": 0.0028102751821279526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8102751457481645e-05, + "grad_norm": 2.5965371131896973, + "learning_rate": 5.222551928783383e-07, + "loss": 0.4479, + "mean_token_accuracy": 0.8519566655158997, + "num_tokens": 47004511.0, + "step": 1233 + }, + { + "epoch": 0.1569774837806895, + "ewc_loss": 0.0028224922716617584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8224923880770802e-05, + "grad_norm": 2.602616310119629, + "learning_rate": 5.226791013141161e-07, + "loss": 0.5003, + "mean_token_accuracy": 0.8397517204284668, + "num_tokens": 47047244.0, + "step": 1234 + }, + { + "epoch": 0.15710469405928, + "ewc_loss": 0.002827460179105401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8274602300371043e-05, + "grad_norm": 2.5628890991210938, + "learning_rate": 5.23103009749894e-07, + "loss": 0.4562, + "mean_token_accuracy": 0.8514228463172913, + "num_tokens": 47088206.0, + "step": 1235 + }, + { + "epoch": 0.1572319043378705, + "ewc_loss": 0.0028177269268780947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8177269996376708e-05, + "grad_norm": 2.529697895050049, + "learning_rate": 5.235269181856718e-07, + "loss": 0.4648, + "mean_token_accuracy": 0.8472429513931274, + "num_tokens": 47132733.0, + "step": 1236 + }, + { + "epoch": 0.15735911461646102, + "ewc_loss": 0.002808230696246028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.808230601658579e-05, + "grad_norm": 2.651442050933838, + "learning_rate": 5.239508266214498e-07, + "loss": 0.522, + "mean_token_accuracy": 0.8288585543632507, + "num_tokens": 47166840.0, + "step": 1237 + }, + { + "epoch": 0.15748632489505152, + "ewc_loss": 0.002852774690836668, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8527747417683713e-05, + "grad_norm": 2.612835645675659, + "learning_rate": 5.243747350572276e-07, + "loss": 0.4815, + "mean_token_accuracy": 0.8437860012054443, + "num_tokens": 47203444.0, + "step": 1238 + }, + { + "epoch": 0.15761353517364202, + "ewc_loss": 0.0028360215947031975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8360214855638333e-05, + "grad_norm": 2.5881247520446777, + "learning_rate": 5.247986434930056e-07, + "loss": 0.4766, + "mean_token_accuracy": 0.8478418588638306, + "num_tokens": 47240380.0, + "step": 1239 + }, + { + "epoch": 0.15774074545223254, + "ewc_loss": 0.0028302865102887154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8302863938733935e-05, + "grad_norm": 2.6324901580810547, + "learning_rate": 5.252225519287834e-07, + "loss": 0.5346, + "mean_token_accuracy": 0.8268444538116455, + "num_tokens": 47277886.0, + "step": 1240 + }, + { + "epoch": 0.15786795573082305, + "ewc_loss": 0.0028501811902970076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8501812266767956e-05, + "grad_norm": 2.6557304859161377, + "learning_rate": 5.256464603645613e-07, + "loss": 0.4651, + "mean_token_accuracy": 0.8505563139915466, + "num_tokens": 47315261.0, + "step": 1241 + }, + { + "epoch": 0.15799516600941357, + "ewc_loss": 0.0028614390175789595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.861438952095341e-05, + "grad_norm": 2.5300800800323486, + "learning_rate": 5.260703688003391e-07, + "loss": 0.4681, + "mean_token_accuracy": 0.8474764227867126, + "num_tokens": 47357489.0, + "step": 1242 + }, + { + "epoch": 0.15812237628800407, + "ewc_loss": 0.002811543410643935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8115433451603167e-05, + "grad_norm": 2.6699674129486084, + "learning_rate": 5.26494277236117e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8427259922027588, + "num_tokens": 47389851.0, + "step": 1243 + }, + { + "epoch": 0.15824958656659457, + "ewc_loss": 0.002874158788472414, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8741587811964564e-05, + "grad_norm": 2.6346912384033203, + "learning_rate": 5.269181856718948e-07, + "loss": 0.5082, + "mean_token_accuracy": 0.8340482711791992, + "num_tokens": 47427491.0, + "step": 1244 + }, + { + "epoch": 0.1583767968451851, + "ewc_loss": 0.0028580189682543278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.858018888218794e-05, + "grad_norm": 2.568047046661377, + "learning_rate": 5.273420941076727e-07, + "loss": 0.4493, + "mean_token_accuracy": 0.8529731035232544, + "num_tokens": 47462790.0, + "step": 1245 + }, + { + "epoch": 0.1585040071237756, + "ewc_loss": 0.002831468591466546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.831468555086758e-05, + "grad_norm": 2.5090341567993164, + "learning_rate": 5.277660025434506e-07, + "loss": 0.4139, + "mean_token_accuracy": 0.862859845161438, + "num_tokens": 47502240.0, + "step": 1246 + }, + { + "epoch": 0.1586312174023661, + "ewc_loss": 0.002832825994119048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8328260668786243e-05, + "grad_norm": 2.517648935317993, + "learning_rate": 5.281899109792285e-07, + "loss": 0.4372, + "mean_token_accuracy": 0.8557885885238647, + "num_tokens": 47542493.0, + "step": 1247 + }, + { + "epoch": 0.15875842768095663, + "ewc_loss": 0.002843666123226285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8436661523301154e-05, + "grad_norm": 2.5649681091308594, + "learning_rate": 5.286138194150064e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8459203243255615, + "num_tokens": 47583378.0, + "step": 1248 + }, + { + "epoch": 0.15888563795954713, + "ewc_loss": 0.0028529248666018248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.852924808394164e-05, + "grad_norm": 2.595999002456665, + "learning_rate": 5.290377278507841e-07, + "loss": 0.499, + "mean_token_accuracy": 0.842491626739502, + "num_tokens": 47622671.0, + "step": 1249 + }, + { + "epoch": 0.15901284823813763, + "ewc_loss": 0.002864065347239375, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.8640653908951208e-05, + "grad_norm": 2.572849750518799, + "learning_rate": 5.294616362865621e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.8537516593933105, + "num_tokens": 47663774.0, + "step": 1250 + }, + { + "epoch": 0.15914005851672816, + "ewc_loss": 0.0028556801844388247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.855680213542655e-05, + "grad_norm": 2.585120916366577, + "learning_rate": 5.298855447223399e-07, + "loss": 0.5269, + "mean_token_accuracy": 0.8376402854919434, + "num_tokens": 47706143.0, + "step": 1251 + }, + { + "epoch": 0.15926726879531866, + "ewc_loss": 0.0028634779155254364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8634778573177755e-05, + "grad_norm": 2.6778388023376465, + "learning_rate": 5.303094531581178e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.8413690328598022, + "num_tokens": 47741197.0, + "step": 1252 + }, + { + "epoch": 0.15939447907390916, + "ewc_loss": 0.002890330273658037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8903303245897405e-05, + "grad_norm": 2.731762647628784, + "learning_rate": 5.307333615938957e-07, + "loss": 0.4124, + "mean_token_accuracy": 0.8626996874809265, + "num_tokens": 47779306.0, + "step": 1253 + }, + { + "epoch": 0.1595216893524997, + "ewc_loss": 0.0028975331224501133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8975331588299014e-05, + "grad_norm": 2.6929385662078857, + "learning_rate": 5.311572700296736e-07, + "loss": 0.4969, + "mean_token_accuracy": 0.8364636301994324, + "num_tokens": 47811056.0, + "step": 1254 + }, + { + "epoch": 0.1596488996310902, + "ewc_loss": 0.0028772205114364624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8772205041605048e-05, + "grad_norm": 2.594420909881592, + "learning_rate": 5.315811784654515e-07, + "loss": 0.4063, + "mean_token_accuracy": 0.8617194890975952, + "num_tokens": 47847075.0, + "step": 1255 + }, + { + "epoch": 0.1597761099096807, + "ewc_loss": 0.002847951138392091, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.8479511456680484e-05, + "grad_norm": 2.489779233932495, + "learning_rate": 5.320050869012294e-07, + "loss": 0.4641, + "mean_token_accuracy": 0.848368227481842, + "num_tokens": 47892639.0, + "step": 1256 + }, + { + "epoch": 0.15990332018827122, + "ewc_loss": 0.002825636649504304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8256366931600496e-05, + "grad_norm": 2.5863773822784424, + "learning_rate": 5.324289953370071e-07, + "loss": 0.5199, + "mean_token_accuracy": 0.8333272933959961, + "num_tokens": 47933522.0, + "step": 1257 + }, + { + "epoch": 0.16003053046686172, + "ewc_loss": 0.002882549539208412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.882549597416073e-05, + "grad_norm": 2.556002378463745, + "learning_rate": 5.328529037727851e-07, + "loss": 0.4571, + "mean_token_accuracy": 0.849307119846344, + "num_tokens": 47977307.0, + "step": 1258 + }, + { + "epoch": 0.16015774074545222, + "ewc_loss": 0.002861830173060298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8618302167160437e-05, + "grad_norm": 2.6650822162628174, + "learning_rate": 5.332768122085629e-07, + "loss": 0.4539, + "mean_token_accuracy": 0.8497723340988159, + "num_tokens": 48015129.0, + "step": 1259 + }, + { + "epoch": 0.16028495102404275, + "ewc_loss": 0.002895386889576912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8953869332326576e-05, + "grad_norm": 2.6337380409240723, + "learning_rate": 5.337007206443408e-07, + "loss": 0.4882, + "mean_token_accuracy": 0.8389871120452881, + "num_tokens": 48051904.0, + "step": 1260 + }, + { + "epoch": 0.16041216130263325, + "ewc_loss": 0.002887058537453413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8870585083495826e-05, + "grad_norm": 2.532486915588379, + "learning_rate": 5.341246290801187e-07, + "loss": 0.4498, + "mean_token_accuracy": 0.8554943799972534, + "num_tokens": 48094001.0, + "step": 1261 + }, + { + "epoch": 0.16053937158122375, + "ewc_loss": 0.002850770251825452, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.8507702154456638e-05, + "grad_norm": 2.5461971759796143, + "learning_rate": 5.345485375158966e-07, + "loss": 0.4424, + "mean_token_accuracy": 0.8515421748161316, + "num_tokens": 48134124.0, + "step": 1262 + }, + { + "epoch": 0.16066658185981428, + "ewc_loss": 0.00287368381395936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8736838430631906e-05, + "grad_norm": 2.6037540435791016, + "learning_rate": 5.349724459516745e-07, + "loss": 0.4327, + "mean_token_accuracy": 0.8558779954910278, + "num_tokens": 48170730.0, + "step": 1263 + }, + { + "epoch": 0.16079379213840478, + "ewc_loss": 0.00289686257019639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8968626793357544e-05, + "grad_norm": 2.66414213180542, + "learning_rate": 5.353963543874522e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8367253541946411, + "num_tokens": 48212640.0, + "step": 1264 + }, + { + "epoch": 0.1609210024169953, + "ewc_loss": 0.0029015166219323874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9015165637247264e-05, + "grad_norm": 2.780041456222534, + "learning_rate": 5.358202628232301e-07, + "loss": 0.4797, + "mean_token_accuracy": 0.8450533151626587, + "num_tokens": 48243136.0, + "step": 1265 + }, + { + "epoch": 0.1610482126955858, + "ewc_loss": 0.0029342654161155224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9342654670472257e-05, + "grad_norm": 2.594897985458374, + "learning_rate": 5.36244171259008e-07, + "loss": 0.4619, + "mean_token_accuracy": 0.8489387035369873, + "num_tokens": 48282905.0, + "step": 1266 + }, + { + "epoch": 0.1611754229741763, + "ewc_loss": 0.0028664993587881327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8664993806160055e-05, + "grad_norm": 2.5800094604492188, + "learning_rate": 5.366680796947859e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.839755654335022, + "num_tokens": 48323325.0, + "step": 1267 + }, + { + "epoch": 0.16130263325276684, + "ewc_loss": 0.0028860089369118214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.8860089514637366e-05, + "grad_norm": 2.7144317626953125, + "learning_rate": 5.370919881305637e-07, + "loss": 0.4769, + "mean_token_accuracy": 0.8470291495323181, + "num_tokens": 48358400.0, + "step": 1268 + }, + { + "epoch": 0.16142984353135734, + "ewc_loss": 0.002936330856755376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9363307476160116e-05, + "grad_norm": 2.650442361831665, + "learning_rate": 5.375158965663417e-07, + "loss": 0.4784, + "mean_token_accuracy": 0.8464833498001099, + "num_tokens": 48393018.0, + "step": 1269 + }, + { + "epoch": 0.16155705380994784, + "ewc_loss": 0.0029082507826387882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.908250826294534e-05, + "grad_norm": 2.5763843059539795, + "learning_rate": 5.379398050021195e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.8475515842437744, + "num_tokens": 48432952.0, + "step": 1270 + }, + { + "epoch": 0.16168426408853837, + "ewc_loss": 0.002884527202695608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.884527202695608e-05, + "grad_norm": 2.6766979694366455, + "learning_rate": 5.383637134378975e-07, + "loss": 0.4526, + "mean_token_accuracy": 0.8497401475906372, + "num_tokens": 48466913.0, + "step": 1271 + }, + { + "epoch": 0.16181147436712887, + "ewc_loss": 0.0029317073058336973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9317072403500788e-05, + "grad_norm": 2.5932388305664062, + "learning_rate": 5.387876218736752e-07, + "loss": 0.5337, + "mean_token_accuracy": 0.8232161998748779, + "num_tokens": 48512658.0, + "step": 1272 + }, + { + "epoch": 0.16193868464571937, + "ewc_loss": 0.0029073443729430437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.907344423874747e-05, + "grad_norm": 2.5837481021881104, + "learning_rate": 5.392115303094531e-07, + "loss": 0.4307, + "mean_token_accuracy": 0.8566509485244751, + "num_tokens": 48552283.0, + "step": 1273 + }, + { + "epoch": 0.1620658949243099, + "ewc_loss": 0.0028973305597901344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.8973305234103464e-05, + "grad_norm": 2.592698812484741, + "learning_rate": 5.39635438745231e-07, + "loss": 0.4099, + "mean_token_accuracy": 0.8638057708740234, + "num_tokens": 48590504.0, + "step": 1274 + }, + { + "epoch": 0.1621931052029004, + "ewc_loss": 0.002916539553552866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9165395972086117e-05, + "grad_norm": 2.612689256668091, + "learning_rate": 5.400593471810089e-07, + "loss": 0.4986, + "mean_token_accuracy": 0.8369372487068176, + "num_tokens": 48629993.0, + "step": 1275 + }, + { + "epoch": 0.1623203154814909, + "ewc_loss": 0.0029226087499409914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9226088372524828e-05, + "grad_norm": 2.7429821491241455, + "learning_rate": 5.404832556167867e-07, + "loss": 0.5003, + "mean_token_accuracy": 0.837867021560669, + "num_tokens": 48665074.0, + "step": 1276 + }, + { + "epoch": 0.16244752576008142, + "ewc_loss": 0.002963357837870717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.963357837870717e-05, + "grad_norm": 2.6105401515960693, + "learning_rate": 5.409071640525647e-07, + "loss": 0.4605, + "mean_token_accuracy": 0.8514394760131836, + "num_tokens": 48705524.0, + "step": 1277 + }, + { + "epoch": 0.16257473603867192, + "ewc_loss": 0.0029091821052134037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9091821488691494e-05, + "grad_norm": 2.6478171348571777, + "learning_rate": 5.413310724883425e-07, + "loss": 0.5076, + "mean_token_accuracy": 0.8284264802932739, + "num_tokens": 48744030.0, + "step": 1278 + }, + { + "epoch": 0.16270194631726242, + "ewc_loss": 0.0029344174545258284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.934417534561362e-05, + "grad_norm": 2.590911388397217, + "learning_rate": 5.417549809241205e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.8415242433547974, + "num_tokens": 48788401.0, + "step": 1279 + }, + { + "epoch": 0.16282915659585295, + "ewc_loss": 0.0029148005414754152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.9148006433388218e-05, + "grad_norm": 2.726790428161621, + "learning_rate": 5.421788893598982e-07, + "loss": 0.5007, + "mean_token_accuracy": 0.8381842374801636, + "num_tokens": 48823046.0, + "step": 1280 + }, + { + "epoch": 0.16295636687444345, + "ewc_loss": 0.002955388044938445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.955387935799081e-05, + "grad_norm": 2.5654752254486084, + "learning_rate": 5.42602797795676e-07, + "loss": 0.5078, + "mean_token_accuracy": 0.8331775665283203, + "num_tokens": 48868293.0, + "step": 1281 + }, + { + "epoch": 0.16308357715303395, + "ewc_loss": 0.002897650934755802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8976508474443108e-05, + "grad_norm": 2.6404645442962646, + "learning_rate": 5.43026706231454e-07, + "loss": 0.5351, + "mean_token_accuracy": 0.825870931148529, + "num_tokens": 48908775.0, + "step": 1282 + }, + { + "epoch": 0.16321078743162448, + "ewc_loss": 0.0029307459481060505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9307459044503048e-05, + "grad_norm": 2.545030117034912, + "learning_rate": 5.434506146672319e-07, + "loss": 0.5125, + "mean_token_accuracy": 0.8326951265335083, + "num_tokens": 48957640.0, + "step": 1283 + }, + { + "epoch": 0.16333799771021498, + "ewc_loss": 0.0029098973609507084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9098973755026236e-05, + "grad_norm": 2.686319589614868, + "learning_rate": 5.438745231030097e-07, + "loss": 0.504, + "mean_token_accuracy": 0.8331153988838196, + "num_tokens": 48993601.0, + "step": 1284 + }, + { + "epoch": 0.16346520798880548, + "ewc_loss": 0.0029581545386463404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9581544367829338e-05, + "grad_norm": 2.6447901725769043, + "learning_rate": 5.442984315387876e-07, + "loss": 0.4094, + "mean_token_accuracy": 0.8626986742019653, + "num_tokens": 49028411.0, + "step": 1285 + }, + { + "epoch": 0.163592418267396, + "ewc_loss": 0.0029389499686658382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.9389499104581773e-05, + "grad_norm": 2.5872771739959717, + "learning_rate": 5.447223399745655e-07, + "loss": 0.4735, + "mean_token_accuracy": 0.8485673069953918, + "num_tokens": 49067449.0, + "step": 1286 + }, + { + "epoch": 0.1637196285459865, + "ewc_loss": 0.0029250499792397022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9250499210320413e-05, + "grad_norm": 2.6663732528686523, + "learning_rate": 5.451462484103433e-07, + "loss": 0.4494, + "mean_token_accuracy": 0.853352427482605, + "num_tokens": 49102650.0, + "step": 1287 + }, + { + "epoch": 0.163846838824577, + "ewc_loss": 0.00295288790948689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9528879167628475e-05, + "grad_norm": 2.666433572769165, + "learning_rate": 5.455701568461212e-07, + "loss": 0.471, + "mean_token_accuracy": 0.8452314138412476, + "num_tokens": 49137110.0, + "step": 1288 + }, + { + "epoch": 0.16397404910316754, + "ewc_loss": 0.002948333974927664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9483340767910704e-05, + "grad_norm": 2.699418306350708, + "learning_rate": 5.45994065281899e-07, + "loss": 0.5436, + "mean_token_accuracy": 0.8217021822929382, + "num_tokens": 49177765.0, + "step": 1289 + }, + { + "epoch": 0.16410125938175804, + "ewc_loss": 0.0029591345228254795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9591345082735643e-05, + "grad_norm": 2.7574574947357178, + "learning_rate": 5.46417973717677e-07, + "loss": 0.4848, + "mean_token_accuracy": 0.8390403985977173, + "num_tokens": 49207860.0, + "step": 1290 + }, + { + "epoch": 0.16422846966034857, + "ewc_loss": 0.002981598023325205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9815979360137135e-05, + "grad_norm": 2.688753128051758, + "learning_rate": 5.468418821534548e-07, + "loss": 0.4187, + "mean_token_accuracy": 0.8603756427764893, + "num_tokens": 49239994.0, + "step": 1291 + }, + { + "epoch": 0.16435567993893907, + "ewc_loss": 0.0029612400103360415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.9612399885081686e-05, + "grad_norm": 2.6421775817871094, + "learning_rate": 5.472657905892327e-07, + "loss": 0.4325, + "mean_token_accuracy": 0.858453631401062, + "num_tokens": 49273464.0, + "step": 1292 + }, + { + "epoch": 0.16448289021752957, + "ewc_loss": 0.0029503614641726017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9503615223802626e-05, + "grad_norm": 2.6371188163757324, + "learning_rate": 5.476896990250106e-07, + "loss": 0.5447, + "mean_token_accuracy": 0.8242499232292175, + "num_tokens": 49313144.0, + "step": 1293 + }, + { + "epoch": 0.1646101004961201, + "ewc_loss": 0.002965944819152355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.96594480460044e-05, + "grad_norm": 2.5610334873199463, + "learning_rate": 5.481136074607885e-07, + "loss": 0.4322, + "mean_token_accuracy": 0.8532590866088867, + "num_tokens": 49351392.0, + "step": 1294 + }, + { + "epoch": 0.1647373107747106, + "ewc_loss": 0.002951642731204629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9516428185161203e-05, + "grad_norm": 2.5447072982788086, + "learning_rate": 5.485375158965663e-07, + "loss": 0.4362, + "mean_token_accuracy": 0.8561780452728271, + "num_tokens": 49396726.0, + "step": 1295 + }, + { + "epoch": 0.1648645210533011, + "ewc_loss": 0.002955302596092224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9553026251960546e-05, + "grad_norm": 2.599642276763916, + "learning_rate": 5.489614243323442e-07, + "loss": 0.4696, + "mean_token_accuracy": 0.8451226949691772, + "num_tokens": 49435856.0, + "step": 1296 + }, + { + "epoch": 0.16499173133189163, + "ewc_loss": 0.0029802380595356226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9802380595356226e-05, + "grad_norm": 2.585254192352295, + "learning_rate": 5.49385332768122e-07, + "loss": 0.4107, + "mean_token_accuracy": 0.8611079454421997, + "num_tokens": 49473695.0, + "step": 1297 + }, + { + "epoch": 0.16511894161048213, + "ewc_loss": 0.0029752750415354967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.9752749469480477e-05, + "grad_norm": 2.690190553665161, + "learning_rate": 5.498092412039e-07, + "loss": 0.5348, + "mean_token_accuracy": 0.8305873870849609, + "num_tokens": 49509165.0, + "step": 1298 + }, + { + "epoch": 0.16524615188907263, + "ewc_loss": 0.0030126257333904505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0126257115625776e-05, + "grad_norm": 2.4961493015289307, + "learning_rate": 5.502331496396778e-07, + "loss": 0.4388, + "mean_token_accuracy": 0.8540187478065491, + "num_tokens": 49553790.0, + "step": 1299 + }, + { + "epoch": 0.16537336216766316, + "ewc_loss": 0.0029464089311659336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9464088584063575e-05, + "grad_norm": 2.602776050567627, + "learning_rate": 5.506570580754557e-07, + "loss": 0.463, + "mean_token_accuracy": 0.8478667736053467, + "num_tokens": 49593124.0, + "step": 1300 + }, + { + "epoch": 0.16550057244625366, + "ewc_loss": 0.0030007902532815933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.000790275109466e-05, + "grad_norm": 2.7308216094970703, + "learning_rate": 5.510809665112336e-07, + "loss": 0.4492, + "mean_token_accuracy": 0.8543627262115479, + "num_tokens": 49632016.0, + "step": 1301 + }, + { + "epoch": 0.16562778272484416, + "ewc_loss": 0.0030419693794101477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0419692848226987e-05, + "grad_norm": 2.5910913944244385, + "learning_rate": 5.515048749470113e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.8487087488174438, + "num_tokens": 49670689.0, + "step": 1302 + }, + { + "epoch": 0.1657549930034347, + "ewc_loss": 0.0029786003287881613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.97860024147667e-05, + "grad_norm": 2.6388144493103027, + "learning_rate": 5.519287833827893e-07, + "loss": 0.4516, + "mean_token_accuracy": 0.8508884906768799, + "num_tokens": 49709230.0, + "step": 1303 + }, + { + "epoch": 0.1658822032820252, + "ewc_loss": 0.003001822391524911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
3.001822369697038e-05, + "grad_norm": 2.7568447589874268, + "learning_rate": 5.523526918185671e-07, + "loss": 0.505, + "mean_token_accuracy": 0.8340597152709961, + "num_tokens": 49741648.0, + "step": 1304 + }, + { + "epoch": 0.1660094135606157, + "ewc_loss": 0.0030409398023039103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0409397368202917e-05, + "grad_norm": 2.684312343597412, + "learning_rate": 5.52776600254345e-07, + "loss": 0.4261, + "mean_token_accuracy": 0.8546112775802612, + "num_tokens": 49773906.0, + "step": 1305 + }, + { + "epoch": 0.16613662383920622, + "ewc_loss": 0.003006749553605914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.006749648193363e-05, + "grad_norm": 2.6043903827667236, + "learning_rate": 5.532005086901229e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.8465408682823181, + "num_tokens": 49812645.0, + "step": 1306 + }, + { + "epoch": 0.16626383411779672, + "ewc_loss": 0.0029821100179105997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9821099815308116e-05, + "grad_norm": 2.5407819747924805, + "learning_rate": 5.536244171259008e-07, + "loss": 0.4545, + "mean_token_accuracy": 0.8518443703651428, + "num_tokens": 49854500.0, + "step": 1307 + }, + { + "epoch": 0.16639104439638722, + "ewc_loss": 0.002985062776133418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9850627470295876e-05, + "grad_norm": 2.854706287384033, + "learning_rate": 5.540483255616786e-07, + "loss": 0.5476, + "mean_token_accuracy": 0.8247727155685425, + "num_tokens": 49889408.0, + "step": 1308 + }, + { + "epoch": 0.16651825467497774, + "ewc_loss": 0.003098823828622699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.098823799518868e-05, + "grad_norm": 2.6705398559570312, + "learning_rate": 5.544722339974566e-07, + "loss": 0.4691, + "mean_token_accuracy": 0.8468347787857056, + "num_tokens": 49923581.0, + "step": 1309 + }, + { + "epoch": 0.16664546495356825, + "ewc_loss": 0.002998573938384652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.9985738365212455e-05, + "grad_norm": 2.619896650314331, + "learning_rate": 5.548961424332343e-07, + "loss": 0.4576, + "mean_token_accuracy": 0.8470675945281982, + "num_tokens": 49959433.0, + "step": 1310 + }, + { + "epoch": 0.16677267523215875, + "ewc_loss": 0.0029837118927389383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9837119654985145e-05, + "grad_norm": 2.588101625442505, + "learning_rate": 5.553200508690123e-07, + "loss": 0.4269, + "mean_token_accuracy": 0.8599295020103455, + "num_tokens": 50000307.0, + "step": 1311 + }, + { + "epoch": 0.16689988551074927, + "ewc_loss": 0.0029919117223471403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.991911787830759e-05, + "grad_norm": 2.7191436290740967, + "learning_rate": 5.557439593047901e-07, + "loss": 0.5191, + "mean_token_accuracy": 0.8289232850074768, + "num_tokens": 50035587.0, + "step": 1312 + }, + { + "epoch": 0.16702709578933977, + "ewc_loss": 0.003039390780031681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.039390685444232e-05, + "grad_norm": 2.561163902282715, + "learning_rate": 5.56167867740568e-07, + "loss": 0.4591, + "mean_token_accuracy": 0.8489717245101929, + "num_tokens": 50079425.0, + "step": 1313 + }, + { + "epoch": 0.16715430606793028, + "ewc_loss": 0.0029774364084005356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9774364520562813e-05, + "grad_norm": 2.702528238296509, + "learning_rate": 5.565917761763459e-07, + "loss": 0.4434, + "mean_token_accuracy": 0.8542870283126831, + "num_tokens": 50110674.0, + "step": 1314 + }, + { + "epoch": 0.1672815163465208, + "ewc_loss": 0.003041923977434635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0419239919865504e-05, + "grad_norm": 2.6277952194213867, + "learning_rate": 5.570156846121238e-07, + "loss": 0.4799, + "mean_token_accuracy": 0.8426008224487305, + "num_tokens": 50149033.0, + "step": 1315 + }, + { + "epoch": 0.1674087266251113, + "ewc_loss": 0.0030080697033554316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
3.0080696888035163e-05, + "grad_norm": 2.6647908687591553, + "learning_rate": 5.574395930479016e-07, + "loss": 0.5013, + "mean_token_accuracy": 0.8366806507110596, + "num_tokens": 50185728.0, + "step": 1316 + }, + { + "epoch": 0.16753593690370183, + "ewc_loss": 0.0030174776911735535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.017477683897596e-05, + "grad_norm": 2.6683077812194824, + "learning_rate": 5.578635014836796e-07, + "loss": 0.4517, + "mean_token_accuracy": 0.8515905141830444, + "num_tokens": 50219671.0, + "step": 1317 + }, + { + "epoch": 0.16766314718229233, + "ewc_loss": 0.0030204644426703453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.020464464498218e-05, + "grad_norm": 2.7290360927581787, + "learning_rate": 5.582874099194573e-07, + "loss": 0.5077, + "mean_token_accuracy": 0.8373340368270874, + "num_tokens": 50251989.0, + "step": 1318 + }, + { + "epoch": 0.16779035746088283, + "ewc_loss": 0.003047724487259984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0477243853965774e-05, + "grad_norm": 2.718809127807617, + "learning_rate": 5.587113183552353e-07, + "loss": 0.4949, + "mean_token_accuracy": 0.8387295603752136, + "num_tokens": 50291079.0, + "step": 1319 + }, + { + "epoch": 0.16791756773947336, + "ewc_loss": 0.0030409866012632847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.040986484847963e-05, + "grad_norm": 2.591759204864502, + "learning_rate": 5.591352267910131e-07, + "loss": 0.4526, + "mean_token_accuracy": 0.8511922359466553, + "num_tokens": 50332262.0, + "step": 1320 + }, + { + "epoch": 0.16804477801806386, + "ewc_loss": 0.0030025565065443516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0025565138203092e-05, + "grad_norm": 2.661837100982666, + "learning_rate": 5.59559135226791e-07, + "loss": 0.482, + "mean_token_accuracy": 0.8454796075820923, + "num_tokens": 50371278.0, + "step": 1321 + }, + { + "epoch": 0.16817198829665436, + "ewc_loss": 0.003040822921320796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
3.040822957700584e-05, + "grad_norm": 2.6815099716186523, + "learning_rate": 5.599830436625689e-07, + "loss": 0.4577, + "mean_token_accuracy": 0.8505111932754517, + "num_tokens": 50409463.0, + "step": 1322 + }, + { + "epoch": 0.1682991985752449, + "ewc_loss": 0.003043210133910179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0432100174948573e-05, + "grad_norm": 2.6545069217681885, + "learning_rate": 5.604069520983468e-07, + "loss": 0.4535, + "mean_token_accuracy": 0.8522866368293762, + "num_tokens": 50446745.0, + "step": 1323 + }, + { + "epoch": 0.1684264088538354, + "ewc_loss": 0.0030416352674365044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0416353183682077e-05, + "grad_norm": 2.747063159942627, + "learning_rate": 5.608308605341246e-07, + "loss": 0.4453, + "mean_token_accuracy": 0.8524326682090759, + "num_tokens": 50479725.0, + "step": 1324 + }, + { + "epoch": 0.1685536191324259, + "ewc_loss": 0.00307757337577641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0775732739130035e-05, + "grad_norm": 2.7092349529266357, + "learning_rate": 5.612547689699024e-07, + "loss": 0.5063, + "mean_token_accuracy": 0.8351367712020874, + "num_tokens": 50514734.0, + "step": 1325 + }, + { + "epoch": 0.16868082941101642, + "ewc_loss": 0.003060685470700264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.060685412492603e-05, + "grad_norm": 2.64023494720459, + "learning_rate": 5.616786774056803e-07, + "loss": 0.4854, + "mean_token_accuracy": 0.8438607454299927, + "num_tokens": 50555165.0, + "step": 1326 + }, + { + "epoch": 0.16880803968960692, + "ewc_loss": 0.0030408292077481747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0408293241634965e-05, + "grad_norm": 2.5942888259887695, + "learning_rate": 5.621025858414582e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.8407943248748779, + "num_tokens": 50598251.0, + "step": 1327 + }, + { + "epoch": 0.16893524996819742, + "ewc_loss": 0.003041852032765746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
3.04185196000617e-05, + "grad_norm": 2.5896832942962646, + "learning_rate": 5.625264942772361e-07, + "loss": 0.4785, + "mean_token_accuracy": 0.8450503349304199, + "num_tokens": 50638423.0, + "step": 1328 + }, + { + "epoch": 0.16906246024678795, + "ewc_loss": 0.003045277902856469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0452778446488082e-05, + "grad_norm": 2.640625476837158, + "learning_rate": 5.629504027130139e-07, + "loss": 0.5344, + "mean_token_accuracy": 0.8352969288825989, + "num_tokens": 50678226.0, + "step": 1329 + }, + { + "epoch": 0.16918967052537845, + "ewc_loss": 0.0030689069535583258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0689068807987496e-05, + "grad_norm": 2.6613340377807617, + "learning_rate": 5.633743111487919e-07, + "loss": 0.5355, + "mean_token_accuracy": 0.8249291181564331, + "num_tokens": 50717542.0, + "step": 1330 + }, + { + "epoch": 0.16931688080396895, + "ewc_loss": 0.003071077633649111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.071077662752941e-05, + "grad_norm": 2.702286958694458, + "learning_rate": 5.637982195845697e-07, + "loss": 0.4182, + "mean_token_accuracy": 0.8616209030151367, + "num_tokens": 50752914.0, + "step": 1331 + }, + { + "epoch": 0.16944409108255948, + "ewc_loss": 0.0030761631205677986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.076163193327375e-05, + "grad_norm": 2.6109871864318848, + "learning_rate": 5.642221280203476e-07, + "loss": 0.4172, + "mean_token_accuracy": 0.8638993501663208, + "num_tokens": 50789276.0, + "step": 1332 + }, + { + "epoch": 0.16957130136114998, + "ewc_loss": 0.0030433654319494963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.04336535918992e-05, + "grad_norm": 2.769631862640381, + "learning_rate": 5.646460364561254e-07, + "loss": 0.436, + "mean_token_accuracy": 0.8554234504699707, + "num_tokens": 50820759.0, + "step": 1333 + }, + { + "epoch": 0.16969851163974048, + "ewc_loss": 0.0031050327233970165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
3.105032737948932e-05, + "grad_norm": 2.6286234855651855, + "learning_rate": 5.650699448919033e-07, + "loss": 0.5265, + "mean_token_accuracy": 0.8290612697601318, + "num_tokens": 50863241.0, + "step": 1334 + }, + { + "epoch": 0.169825721918331, + "ewc_loss": 0.003044696059077978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.044695949938614e-05, + "grad_norm": 2.631495714187622, + "learning_rate": 5.654938533276812e-07, + "loss": 0.4648, + "mean_token_accuracy": 0.8462406992912292, + "num_tokens": 50904296.0, + "step": 1335 + }, + { + "epoch": 0.1699529321969215, + "ewc_loss": 0.003058048663660884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.058048605453223e-05, + "grad_norm": 2.636509418487549, + "learning_rate": 5.659177617634591e-07, + "loss": 0.4541, + "mean_token_accuracy": 0.851466953754425, + "num_tokens": 50942180.0, + "step": 1336 + }, + { + "epoch": 0.170080142475512, + "ewc_loss": 0.003070157254114747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.070157254114747e-05, + "grad_norm": 2.6194307804107666, + "learning_rate": 5.663416701992369e-07, + "loss": 0.4776, + "mean_token_accuracy": 0.8445525169372559, + "num_tokens": 50980287.0, + "step": 1337 + }, + { + "epoch": 0.17020735275410254, + "ewc_loss": 0.00305998045951128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.059980372199789e-05, + "grad_norm": 2.7557315826416016, + "learning_rate": 5.667655786350149e-07, + "loss": 0.5388, + "mean_token_accuracy": 0.8200231790542603, + "num_tokens": 51014218.0, + "step": 1338 + }, + { + "epoch": 0.17033456303269304, + "ewc_loss": 0.003111760364845395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1117604521568865e-05, + "grad_norm": 2.613490343093872, + "learning_rate": 5.671894870707927e-07, + "loss": 0.4592, + "mean_token_accuracy": 0.852324903011322, + "num_tokens": 51055915.0, + "step": 1339 + }, + { + "epoch": 0.17046177331128357, + "ewc_loss": 0.0030566821806132793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0566821806132793e-05, + 
"grad_norm": 2.643580436706543, + "learning_rate": 5.676133955065705e-07, + "loss": 0.4048, + "mean_token_accuracy": 0.8642955422401428, + "num_tokens": 51090684.0, + "step": 1340 + }, + { + "epoch": 0.17058898358987407, + "ewc_loss": 0.0030736858025193214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.073685729759745e-05, + "grad_norm": 2.6426784992218018, + "learning_rate": 5.680373039423484e-07, + "loss": 0.4477, + "mean_token_accuracy": 0.8522931337356567, + "num_tokens": 51130156.0, + "step": 1341 + }, + { + "epoch": 0.17071619386846457, + "ewc_loss": 0.003081689355894923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.081689283135347e-05, + "grad_norm": 2.7515182495117188, + "learning_rate": 5.684612123781263e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8388500213623047, + "num_tokens": 51171282.0, + "step": 1342 + }, + { + "epoch": 0.1708434041470551, + "ewc_loss": 0.0031161627266556025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.116162770311348e-05, + "grad_norm": 2.6606719493865967, + "learning_rate": 5.688851208139042e-07, + "loss": 0.5037, + "mean_token_accuracy": 0.8331712484359741, + "num_tokens": 51210616.0, + "step": 1343 + }, + { + "epoch": 0.1709706144256456, + "ewc_loss": 0.0030769656877964735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.076965731452219e-05, + "grad_norm": 2.619887590408325, + "learning_rate": 5.69309029249682e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.8447777628898621, + "num_tokens": 51253357.0, + "step": 1344 + }, + { + "epoch": 0.1710978247042361, + "ewc_loss": 0.003068274352699518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0682742362841964e-05, + "grad_norm": 2.6239051818847656, + "learning_rate": 5.697329376854599e-07, + "loss": 0.4723, + "mean_token_accuracy": 0.8446172475814819, + "num_tokens": 51292671.0, + "step": 1345 + }, + { + "epoch": 0.17122503498282662, + "ewc_loss": 0.003080580150708556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.080580063397065e-05, + "grad_norm": 
2.6856963634490967, + "learning_rate": 5.701568461212378e-07, + "loss": 0.412, + "mean_token_accuracy": 0.8640612363815308, + "num_tokens": 51329145.0, + "step": 1346 + }, + { + "epoch": 0.17135224526141712, + "ewc_loss": 0.0031051263213157654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.105126234004274e-05, + "grad_norm": 2.5954301357269287, + "learning_rate": 5.705807545570157e-07, + "loss": 0.4215, + "mean_token_accuracy": 0.8598805665969849, + "num_tokens": 51371265.0, + "step": 1347 + }, + { + "epoch": 0.17147945554000762, + "ewc_loss": 0.0030615562573075294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.061556344619021e-05, + "grad_norm": 2.647026538848877, + "learning_rate": 5.710046629927934e-07, + "loss": 0.4565, + "mean_token_accuracy": 0.8493699431419373, + "num_tokens": 51410262.0, + "step": 1348 + }, + { + "epoch": 0.17160666581859815, + "ewc_loss": 0.003096314612776041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.096314685535617e-05, + "grad_norm": 2.6345889568328857, + "learning_rate": 5.714285714285714e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.8588796854019165, + "num_tokens": 51447796.0, + "step": 1349 + }, + { + "epoch": 0.17173387609718865, + "ewc_loss": 0.00309027754701674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0902774597052485e-05, + "grad_norm": 2.804868221282959, + "learning_rate": 5.718524798643492e-07, + "loss": 0.4914, + "mean_token_accuracy": 0.8401220440864563, + "num_tokens": 51482677.0, + "step": 1350 + }, + { + "epoch": 0.17186108637577915, + "ewc_loss": 0.0031484095379710197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1484094506595284e-05, + "grad_norm": 2.6567294597625732, + "learning_rate": 5.722763883001272e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8403693437576294, + "num_tokens": 51524111.0, + "step": 1351 + }, + { + "epoch": 0.17198829665436968, + "ewc_loss": 0.003080483991652727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0804840207565576e-05, + "grad_norm": 
2.684556484222412, + "learning_rate": 5.72700296735905e-07, + "loss": 0.4651, + "mean_token_accuracy": 0.8463122844696045, + "num_tokens": 51561008.0, + "step": 1352 + }, + { + "epoch": 0.17211550693296018, + "ewc_loss": 0.0031029346864670515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.102934715570882e-05, + "grad_norm": 2.6593732833862305, + "learning_rate": 5.731242051716829e-07, + "loss": 0.5198, + "mean_token_accuracy": 0.8285301327705383, + "num_tokens": 51603091.0, + "step": 1353 + }, + { + "epoch": 0.17224271721155068, + "ewc_loss": 0.0030965718906372786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0965718906372786e-05, + "grad_norm": 2.6579599380493164, + "learning_rate": 5.735481136074608e-07, + "loss": 0.4356, + "mean_token_accuracy": 0.8548725247383118, + "num_tokens": 51637789.0, + "step": 1354 + }, + { + "epoch": 0.1723699274901412, + "ewc_loss": 0.0031025304924696684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.102530536125414e-05, + "grad_norm": 2.6992599964141846, + "learning_rate": 5.739720220432386e-07, + "loss": 0.4397, + "mean_token_accuracy": 0.8521904349327087, + "num_tokens": 51674207.0, + "step": 1355 + }, + { + "epoch": 0.1724971377687317, + "ewc_loss": 0.003108191303908825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.108191231149249e-05, + "grad_norm": 2.684311628341675, + "learning_rate": 5.743959304790164e-07, + "loss": 0.5336, + "mean_token_accuracy": 0.8290277719497681, + "num_tokens": 51715109.0, + "step": 1356 + }, + { + "epoch": 0.1726243480473222, + "ewc_loss": 0.0031033188570290804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.103318886132911e-05, + "grad_norm": 2.652803897857666, + "learning_rate": 5.748198389147944e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.8449516296386719, + "num_tokens": 51753373.0, + "step": 1357 + }, + { + "epoch": 0.17275155832591274, + "ewc_loss": 0.003097822656854987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.097822627751157e-05, + "grad_norm": 2.6549081802368164, + 
"learning_rate": 5.752437473505722e-07, + "loss": 0.5179, + "mean_token_accuracy": 0.8357457518577576, + "num_tokens": 51795100.0, + "step": 1358 + }, + { + "epoch": 0.17287876860450324, + "ewc_loss": 0.0031109871342778206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.110987017862499e-05, + "grad_norm": 2.681662082672119, + "learning_rate": 5.756676557863502e-07, + "loss": 0.4927, + "mean_token_accuracy": 0.8385480046272278, + "num_tokens": 51831840.0, + "step": 1359 + }, + { + "epoch": 0.17300597888309374, + "ewc_loss": 0.0031183434184640646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.118343374808319e-05, + "grad_norm": 2.6130285263061523, + "learning_rate": 5.76091564222128e-07, + "loss": 0.4449, + "mean_token_accuracy": 0.8547748327255249, + "num_tokens": 51870227.0, + "step": 1360 + }, + { + "epoch": 0.17313318916168427, + "ewc_loss": 0.0030969856306910515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.096985528827645e-05, + "grad_norm": 2.673264741897583, + "learning_rate": 5.765154726579059e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.8474669456481934, + "num_tokens": 51908251.0, + "step": 1361 + }, + { + "epoch": 0.17326039944027477, + "ewc_loss": 0.0031338210683315992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1338211556430906e-05, + "grad_norm": 2.666421890258789, + "learning_rate": 5.769393810936838e-07, + "loss": 0.4609, + "mean_token_accuracy": 0.8487318754196167, + "num_tokens": 51944531.0, + "step": 1362 + }, + { + "epoch": 0.17338760971886527, + "ewc_loss": 0.003128823358565569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.128823300357908e-05, + "grad_norm": 2.6435484886169434, + "learning_rate": 5.773632895294616e-07, + "loss": 0.5122, + "mean_token_accuracy": 0.8326114416122437, + "num_tokens": 51985742.0, + "step": 1363 + }, + { + "epoch": 0.1735148199974558, + "ewc_loss": 0.003117686603218317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.117686719633639e-05, + "grad_norm": 2.672231435775757, + "learning_rate": 
5.777871979652394e-07, + "loss": 0.4518, + "mean_token_accuracy": 0.8510316610336304, + "num_tokens": 52022036.0, + "step": 1364 + }, + { + "epoch": 0.1736420302760463, + "ewc_loss": 0.003139123786240816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1391238735523075e-05, + "grad_norm": 2.589599847793579, + "learning_rate": 5.782111064010173e-07, + "loss": 0.476, + "mean_token_accuracy": 0.8433171510696411, + "num_tokens": 52066077.0, + "step": 1365 + }, + { + "epoch": 0.17376924055463683, + "ewc_loss": 0.0031130309216678143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.113030834356323e-05, + "grad_norm": 2.6155643463134766, + "learning_rate": 5.786350148367952e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8482423424720764, + "num_tokens": 52111280.0, + "step": 1366 + }, + { + "epoch": 0.17389645083322733, + "ewc_loss": 0.0031300464179366827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.130046388832852e-05, + "grad_norm": 2.6105363368988037, + "learning_rate": 5.790589232725731e-07, + "loss": 0.4655, + "mean_token_accuracy": 0.8461594581604004, + "num_tokens": 52153148.0, + "step": 1367 + }, + { + "epoch": 0.17402366111181783, + "ewc_loss": 0.0031314652878791094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.131465200567618e-05, + "grad_norm": 2.7575623989105225, + "learning_rate": 5.79482831708351e-07, + "loss": 0.4506, + "mean_token_accuracy": 0.8516751527786255, + "num_tokens": 52186722.0, + "step": 1368 + }, + { + "epoch": 0.17415087139040836, + "ewc_loss": 0.003183805150911212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.183805165463127e-05, + "grad_norm": 2.777106285095215, + "learning_rate": 5.799067401441288e-07, + "loss": 0.4586, + "mean_token_accuracy": 0.8470723628997803, + "num_tokens": 52220837.0, + "step": 1369 + }, + { + "epoch": 0.17427808166899886, + "ewc_loss": 0.0031733724754303694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.17337253363803e-05, + "grad_norm": 2.6524507999420166, + "learning_rate": 
5.803306485799068e-07, + "loss": 0.4444, + "mean_token_accuracy": 0.8555083870887756, + "num_tokens": 52257049.0, + "step": 1370 + }, + { + "epoch": 0.17440529194758936, + "ewc_loss": 0.0031279437243938446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.127943637082353e-05, + "grad_norm": 2.6052632331848145, + "learning_rate": 5.807545570156845e-07, + "loss": 0.4228, + "mean_token_accuracy": 0.8594738245010376, + "num_tokens": 52297632.0, + "step": 1371 + }, + { + "epoch": 0.1745325022261799, + "ewc_loss": 0.0031250508036464453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.125050716334954e-05, + "grad_norm": 2.8516321182250977, + "learning_rate": 5.811784654514624e-07, + "loss": 0.5172, + "mean_token_accuracy": 0.8317656517028809, + "num_tokens": 52331008.0, + "step": 1372 + }, + { + "epoch": 0.1746597125047704, + "ewc_loss": 0.0032316260039806366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.231626033084467e-05, + "grad_norm": 2.6280953884124756, + "learning_rate": 5.816023738872403e-07, + "loss": 0.45, + "mean_token_accuracy": 0.852204442024231, + "num_tokens": 52369420.0, + "step": 1373 + }, + { + "epoch": 0.1747869227833609, + "ewc_loss": 0.003127266652882099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1272666092263535e-05, + "grad_norm": 2.6315488815307617, + "learning_rate": 5.820262823230182e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.8434914350509644, + "num_tokens": 52411759.0, + "step": 1374 + }, + { + "epoch": 0.17491413306195142, + "ewc_loss": 0.0031516258604824543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.151625787722878e-05, + "grad_norm": 2.7638676166534424, + "learning_rate": 5.824501907587961e-07, + "loss": 0.4416, + "mean_token_accuracy": 0.8537982702255249, + "num_tokens": 52446459.0, + "step": 1375 + }, + { + "epoch": 0.17504134334054192, + "ewc_loss": 0.0032038602512329817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.203860251232982e-05, + "grad_norm": 2.6977827548980713, + "learning_rate": 
5.82874099194574e-07, + "loss": 0.5245, + "mean_token_accuracy": 0.8286361694335938, + "num_tokens": 52484615.0, + "step": 1376 + }, + { + "epoch": 0.17516855361913242, + "ewc_loss": 0.0031662227120250463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.166222813888453e-05, + "grad_norm": 2.6601462364196777, + "learning_rate": 5.832980076303518e-07, + "loss": 0.5342, + "mean_token_accuracy": 0.8259708881378174, + "num_tokens": 52525195.0, + "step": 1377 + }, + { + "epoch": 0.17529576389772294, + "ewc_loss": 0.0031592503655701876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.159250263706781e-05, + "grad_norm": 2.7028658390045166, + "learning_rate": 5.837219160661297e-07, + "loss": 0.4223, + "mean_token_accuracy": 0.8592175841331482, + "num_tokens": 52558783.0, + "step": 1378 + }, + { + "epoch": 0.17542297417631345, + "ewc_loss": 0.0031836440321058035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.183644003001973e-05, + "grad_norm": 2.614192008972168, + "learning_rate": 5.841458245019075e-07, + "loss": 0.4161, + "mean_token_accuracy": 0.8601099252700806, + "num_tokens": 52599176.0, + "step": 1379 + }, + { + "epoch": 0.17555018445490395, + "ewc_loss": 0.0031538133043795824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1538133043795824e-05, + "grad_norm": 2.765183210372925, + "learning_rate": 5.845697329376855e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8482143878936768, + "num_tokens": 52630865.0, + "step": 1380 + }, + { + "epoch": 0.17567739473349447, + "ewc_loss": 0.003215417033061385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.21541701850947e-05, + "grad_norm": 2.7343685626983643, + "learning_rate": 5.849936413734633e-07, + "loss": 0.523, + "mean_token_accuracy": 0.8301659822463989, + "num_tokens": 52671702.0, + "step": 1381 + }, + { + "epoch": 0.17580460501208497, + "ewc_loss": 0.0031898035667836666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.18980346492026e-05, + "grad_norm": 2.5921947956085205, + "learning_rate": 
5.854175498092412e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.8589464426040649, + "num_tokens": 52712288.0, + "step": 1382 + }, + { + "epoch": 0.17593181529067548, + "ewc_loss": 0.0031416930723935366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1416930141858757e-05, + "grad_norm": 2.590082883834839, + "learning_rate": 5.858414582450191e-07, + "loss": 0.4705, + "mean_token_accuracy": 0.846556544303894, + "num_tokens": 52756569.0, + "step": 1383 + }, + { + "epoch": 0.176059025569266, + "ewc_loss": 0.003163373563438654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.163373548886739e-05, + "grad_norm": 2.661992311477661, + "learning_rate": 5.86265366680797e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.8396360874176025, + "num_tokens": 52796195.0, + "step": 1384 + }, + { + "epoch": 0.1761862358478565, + "ewc_loss": 0.003190775169059634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.190775169059634e-05, + "grad_norm": 2.6187920570373535, + "learning_rate": 5.866892751165748e-07, + "loss": 0.4633, + "mean_token_accuracy": 0.8491977453231812, + "num_tokens": 52839687.0, + "step": 1385 + }, + { + "epoch": 0.176313446126447, + "ewc_loss": 0.003173654433339834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.17365447699558e-05, + "grad_norm": 2.7259368896484375, + "learning_rate": 5.871131835523526e-07, + "loss": 0.5186, + "mean_token_accuracy": 0.8306975364685059, + "num_tokens": 52875660.0, + "step": 1386 + }, + { + "epoch": 0.17644065640503753, + "ewc_loss": 0.0032172417268157005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.217241828679107e-05, + "grad_norm": 2.622884511947632, + "learning_rate": 5.875370919881305e-07, + "loss": 0.4738, + "mean_token_accuracy": 0.8436520099639893, + "num_tokens": 52921220.0, + "step": 1387 + }, + { + "epoch": 0.17656786668362803, + "ewc_loss": 0.0031683826819062233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.168382681906223e-05, + "grad_norm": 2.7202186584472656, + "learning_rate": 5.879610004239084e-07, + 
"loss": 0.5271, + "mean_token_accuracy": 0.8320891857147217, + "num_tokens": 52960042.0, + "step": 1388 + }, + { + "epoch": 0.17669507696221853, + "ewc_loss": 0.0032172261271625757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2172261853702366e-05, + "grad_norm": 2.716522693634033, + "learning_rate": 5.883849088596863e-07, + "loss": 0.5122, + "mean_token_accuracy": 0.8360223770141602, + "num_tokens": 53002817.0, + "step": 1389 + }, + { + "epoch": 0.17682228724080906, + "ewc_loss": 0.003211081027984619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.211080911569297e-05, + "grad_norm": 2.7365410327911377, + "learning_rate": 5.888088172954641e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.8431082963943481, + "num_tokens": 53037751.0, + "step": 1390 + }, + { + "epoch": 0.17694949751939956, + "ewc_loss": 0.0032063464168459177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.206346445949748e-05, + "grad_norm": 2.6610898971557617, + "learning_rate": 5.892327257312421e-07, + "loss": 0.4018, + "mean_token_accuracy": 0.8682665824890137, + "num_tokens": 53074286.0, + "step": 1391 + }, + { + "epoch": 0.1770767077979901, + "ewc_loss": 0.003191390074789524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.191389987478033e-05, + "grad_norm": 2.663898229598999, + "learning_rate": 5.896566341670199e-07, + "loss": 0.4205, + "mean_token_accuracy": 0.8565307855606079, + "num_tokens": 53113525.0, + "step": 1392 + }, + { + "epoch": 0.1772039180765806, + "ewc_loss": 0.003195255296304822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1952553399605677e-05, + "grad_norm": 2.704160690307617, + "learning_rate": 5.900805426027977e-07, + "loss": 0.4643, + "mean_token_accuracy": 0.847690999507904, + "num_tokens": 53150137.0, + "step": 1393 + }, + { + "epoch": 0.1773311283551711, + "ewc_loss": 0.0032139297109097242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.213929812773131e-05, + "grad_norm": 2.621965169906616, + "learning_rate": 5.905044510385756e-07, + "loss": 0.4189, + 
"mean_token_accuracy": 0.8610814809799194, + "num_tokens": 53189519.0, + "step": 1394 + }, + { + "epoch": 0.17745833863376162, + "ewc_loss": 0.0031818184070289135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1818184652365744e-05, + "grad_norm": 2.7125027179718018, + "learning_rate": 5.909283594743535e-07, + "loss": 0.5264, + "mean_token_accuracy": 0.829247772693634, + "num_tokens": 53227452.0, + "step": 1395 + }, + { + "epoch": 0.17758554891235212, + "ewc_loss": 0.003217376535758376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2173764338949695e-05, + "grad_norm": 2.701789379119873, + "learning_rate": 5.913522679101314e-07, + "loss": 0.4582, + "mean_token_accuracy": 0.8465089797973633, + "num_tokens": 53265875.0, + "step": 1396 + }, + { + "epoch": 0.17771275919094262, + "ewc_loss": 0.0032208289485424757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2208288757828996e-05, + "grad_norm": 2.7038605213165283, + "learning_rate": 5.917761763459093e-07, + "loss": 0.5183, + "mean_token_accuracy": 0.8309587240219116, + "num_tokens": 53305477.0, + "step": 1397 + }, + { + "epoch": 0.17783996946953315, + "ewc_loss": 0.003212262410670519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.212262527085841e-05, + "grad_norm": 2.68487548828125, + "learning_rate": 5.922000847816871e-07, + "loss": 0.4402, + "mean_token_accuracy": 0.8560366630554199, + "num_tokens": 53342800.0, + "step": 1398 + }, + { + "epoch": 0.17796717974812365, + "ewc_loss": 0.003210981609299779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.210981594747864e-05, + "grad_norm": 2.653446674346924, + "learning_rate": 5.926239932174651e-07, + "loss": 0.5044, + "mean_token_accuracy": 0.8364071249961853, + "num_tokens": 53387476.0, + "step": 1399 + }, + { + "epoch": 0.17809439002671415, + "ewc_loss": 0.0032069317530840635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.206931796739809e-05, + "grad_norm": 2.663637161254883, + "learning_rate": 5.930479016532429e-07, + "loss": 0.4609, + 
"mean_token_accuracy": 0.8481410145759583, + "num_tokens": 53424628.0, + "step": 1400 + }, + { + "epoch": 0.17822160030530468, + "ewc_loss": 0.0032153790816664696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.215379183529876e-05, + "grad_norm": 2.7211639881134033, + "learning_rate": 5.934718100890207e-07, + "loss": 0.4422, + "mean_token_accuracy": 0.852202832698822, + "num_tokens": 53464459.0, + "step": 1401 + }, + { + "epoch": 0.17834881058389518, + "ewc_loss": 0.0032292308751493692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.229230787837878e-05, + "grad_norm": 2.7585723400115967, + "learning_rate": 5.938957185247986e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8362575769424438, + "num_tokens": 53501721.0, + "step": 1402 + }, + { + "epoch": 0.17847602086248568, + "ewc_loss": 0.00324024586006999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.24024586006999e-05, + "grad_norm": 2.788113832473755, + "learning_rate": 5.943196269605765e-07, + "loss": 0.4527, + "mean_token_accuracy": 0.8475279808044434, + "num_tokens": 53536607.0, + "step": 1403 + }, + { + "epoch": 0.1786032311410762, + "ewc_loss": 0.0032409264240413904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.240926525904797e-05, + "grad_norm": 2.7340452671051025, + "learning_rate": 5.947435353963544e-07, + "loss": 0.5015, + "mean_token_accuracy": 0.8404941558837891, + "num_tokens": 53569809.0, + "step": 1404 + }, + { + "epoch": 0.1787304414196667, + "ewc_loss": 0.0032297743018716574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2297743018716574e-05, + "grad_norm": 2.7253901958465576, + "learning_rate": 5.951674438321323e-07, + "loss": 0.4791, + "mean_token_accuracy": 0.8436732888221741, + "num_tokens": 53604423.0, + "step": 1405 + }, + { + "epoch": 0.1788576516982572, + "ewc_loss": 0.0032333279959857464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2333278795704246e-05, + "grad_norm": 2.690078020095825, + "learning_rate": 5.955913522679101e-07, + "loss": 0.475, + "mean_token_accuracy": 
0.8414911031723022, + "num_tokens": 53642145.0, + "step": 1406 + }, + { + "epoch": 0.17898486197684774, + "ewc_loss": 0.0032322106417268515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.232210656278767e-05, + "grad_norm": 2.7100682258605957, + "learning_rate": 5.96015260703688e-07, + "loss": 0.5489, + "mean_token_accuracy": 0.8202707171440125, + "num_tokens": 53682998.0, + "step": 1407 + }, + { + "epoch": 0.17911207225543824, + "ewc_loss": 0.0032462284434586763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2462285162182525e-05, + "grad_norm": 2.595365524291992, + "learning_rate": 5.964391691394659e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8447875380516052, + "num_tokens": 53729764.0, + "step": 1408 + }, + { + "epoch": 0.17923928253402874, + "ewc_loss": 0.003208997892215848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2089978049043566e-05, + "grad_norm": 2.7887680530548096, + "learning_rate": 5.968630775752436e-07, + "loss": 0.5143, + "mean_token_accuracy": 0.8330817818641663, + "num_tokens": 53764522.0, + "step": 1409 + }, + { + "epoch": 0.17936649281261927, + "ewc_loss": 0.0032927037682384253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.292703695478849e-05, + "grad_norm": 2.8091628551483154, + "learning_rate": 5.972869860110216e-07, + "loss": 0.4981, + "mean_token_accuracy": 0.8330287337303162, + "num_tokens": 53799791.0, + "step": 1410 + }, + { + "epoch": 0.17949370309120977, + "ewc_loss": 0.003282433608546853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.282433681306429e-05, + "grad_norm": 2.721022367477417, + "learning_rate": 5.977108944467994e-07, + "loss": 0.5362, + "mean_token_accuracy": 0.8303643465042114, + "num_tokens": 53840181.0, + "step": 1411 + }, + { + "epoch": 0.17962091336980027, + "ewc_loss": 0.0032447257544845343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.244725667173043e-05, + "grad_norm": 2.629784345626831, + "learning_rate": 5.981348028825774e-07, + "loss": 0.444, + "mean_token_accuracy": 
0.8568727970123291, + "num_tokens": 53882813.0, + "step": 1412 + }, + { + "epoch": 0.1797481236483908, + "ewc_loss": 0.003229648107662797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.229648064007051e-05, + "grad_norm": 2.6996607780456543, + "learning_rate": 5.985587113183552e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.8409394025802612, + "num_tokens": 53922198.0, + "step": 1413 + }, + { + "epoch": 0.1798753339269813, + "ewc_loss": 0.003267907304689288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.267907231929712e-05, + "grad_norm": 2.725994110107422, + "learning_rate": 5.989826197541331e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.8408036231994629, + "num_tokens": 53959390.0, + "step": 1414 + }, + { + "epoch": 0.18000254420557182, + "ewc_loss": 0.0032722793985158205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.272279354860075e-05, + "grad_norm": 2.654069423675537, + "learning_rate": 5.99406528189911e-07, + "loss": 0.512, + "mean_token_accuracy": 0.8384263515472412, + "num_tokens": 54005707.0, + "step": 1415 + }, + { + "epoch": 0.18012975448416232, + "ewc_loss": 0.0032431308645755053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.243130777264014e-05, + "grad_norm": 2.638294219970703, + "learning_rate": 5.998304366256888e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8432705402374268, + "num_tokens": 54047808.0, + "step": 1416 + }, + { + "epoch": 0.18025696476275282, + "ewc_loss": 0.0032427909318357706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2427909900434315e-05, + "grad_norm": 2.712113380432129, + "learning_rate": 6.002543450614666e-07, + "loss": 0.4033, + "mean_token_accuracy": 0.8697761297225952, + "num_tokens": 54082929.0, + "step": 1417 + }, + { + "epoch": 0.18038417504134335, + "ewc_loss": 0.0032790424302220345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.279042357462458e-05, + "grad_norm": 2.7071621417999268, + "learning_rate": 6.006782534972446e-07, + "loss": 0.4643, + "mean_token_accuracy": 0.8483534455299377, + 
"num_tokens": 54120199.0, + "step": 1418 + }, + { + "epoch": 0.18051138531993385, + "ewc_loss": 0.0032670549117028713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2670548534952104e-05, + "grad_norm": 2.770570755004883, + "learning_rate": 6.011021619330224e-07, + "loss": 0.499, + "mean_token_accuracy": 0.8364872932434082, + "num_tokens": 54158629.0, + "step": 1419 + }, + { + "epoch": 0.18063859559852435, + "ewc_loss": 0.0032870061695575714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.287006256869063e-05, + "grad_norm": 2.759032964706421, + "learning_rate": 6.015260703688004e-07, + "loss": 0.4785, + "mean_token_accuracy": 0.8416351079940796, + "num_tokens": 54194772.0, + "step": 1420 + }, + { + "epoch": 0.18076580587711488, + "ewc_loss": 0.0032780314795672894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2780313631519675e-05, + "grad_norm": 2.749713659286499, + "learning_rate": 6.019499788045782e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.8431512713432312, + "num_tokens": 54229825.0, + "step": 1421 + }, + { + "epoch": 0.18089301615570538, + "ewc_loss": 0.0032819309271872044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.281930912635289e-05, + "grad_norm": 2.8670551776885986, + "learning_rate": 6.023738872403561e-07, + "loss": 0.4424, + "mean_token_accuracy": 0.8526169061660767, + "num_tokens": 54260051.0, + "step": 1422 + }, + { + "epoch": 0.18102022643429588, + "ewc_loss": 0.0033275082241743803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3275082387262955e-05, + "grad_norm": 2.684753656387329, + "learning_rate": 6.02797795676134e-07, + "loss": 0.4691, + "mean_token_accuracy": 0.8503056764602661, + "num_tokens": 54297865.0, + "step": 1423 + }, + { + "epoch": 0.1811474367128864, + "ewc_loss": 0.0032490380108356476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2490381272509694e-05, + "grad_norm": 2.7363641262054443, + "learning_rate": 6.032217041119118e-07, + "loss": 0.4831, + "mean_token_accuracy": 0.8365467190742493, + "num_tokens": 
54334618.0, + "step": 1424 + }, + { + "epoch": 0.1812746469914769, + "ewc_loss": 0.003295334056019783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2953339541563764e-05, + "grad_norm": 2.7422423362731934, + "learning_rate": 6.036456125476896e-07, + "loss": 0.5185, + "mean_token_accuracy": 0.829741358757019, + "num_tokens": 54369761.0, + "step": 1425 + }, + { + "epoch": 0.1814018572700674, + "ewc_loss": 0.003305833088234067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.305833160993643e-05, + "grad_norm": 2.670433521270752, + "learning_rate": 6.040695209834675e-07, + "loss": 0.42, + "mean_token_accuracy": 0.8578324317932129, + "num_tokens": 54411549.0, + "step": 1426 + }, + { + "epoch": 0.18152906754865794, + "ewc_loss": 0.0032771662808954716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.277166251791641e-05, + "grad_norm": 2.696718692779541, + "learning_rate": 6.044934294192454e-07, + "loss": 0.5059, + "mean_token_accuracy": 0.8360487222671509, + "num_tokens": 54449137.0, + "step": 1427 + }, + { + "epoch": 0.18165627782724844, + "ewc_loss": 0.0033028319012373686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3028318284777924e-05, + "grad_norm": 2.595457077026367, + "learning_rate": 6.049173378550233e-07, + "loss": 0.4605, + "mean_token_accuracy": 0.8503996133804321, + "num_tokens": 54494604.0, + "step": 1428 + }, + { + "epoch": 0.18178348810583894, + "ewc_loss": 0.003264398779720068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.264398765168153e-05, + "grad_norm": 2.6362075805664062, + "learning_rate": 6.053412462908012e-07, + "loss": 0.4415, + "mean_token_accuracy": 0.8522046208381653, + "num_tokens": 54533533.0, + "step": 1429 + }, + { + "epoch": 0.18191069838442947, + "ewc_loss": 0.0032974109053611755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.297410876257345e-05, + "grad_norm": 2.6982767581939697, + "learning_rate": 6.05765154726579e-07, + "loss": 0.5121, + "mean_token_accuracy": 0.8345421552658081, + "num_tokens": 54575211.0, + "step": 
1430 + }, + { + "epoch": 0.18203790866301997, + "ewc_loss": 0.0033225107472389936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3225107472389936e-05, + "grad_norm": 2.6267824172973633, + "learning_rate": 6.061890631623569e-07, + "loss": 0.4204, + "mean_token_accuracy": 0.86070716381073, + "num_tokens": 54618308.0, + "step": 1431 + }, + { + "epoch": 0.18216511894161047, + "ewc_loss": 0.003287282306700945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.287282379460521e-05, + "grad_norm": 2.6459221839904785, + "learning_rate": 6.066129715981347e-07, + "loss": 0.4588, + "mean_token_accuracy": 0.8481422066688538, + "num_tokens": 54660260.0, + "step": 1432 + }, + { + "epoch": 0.182292329220201, + "ewc_loss": 0.003303465899080038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3034659281838685e-05, + "grad_norm": 2.770761013031006, + "learning_rate": 6.070368800339126e-07, + "loss": 0.4264, + "mean_token_accuracy": 0.8581681251525879, + "num_tokens": 54692056.0, + "step": 1433 + }, + { + "epoch": 0.1824195394987915, + "ewc_loss": 0.0033546732738614082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.35467339027673e-05, + "grad_norm": 2.7742538452148438, + "learning_rate": 6.074607884696905e-07, + "loss": 0.4502, + "mean_token_accuracy": 0.849923312664032, + "num_tokens": 54729252.0, + "step": 1434 + }, + { + "epoch": 0.182546749777382, + "ewc_loss": 0.0033308330457657576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3308329875580966e-05, + "grad_norm": 2.7732181549072266, + "learning_rate": 6.078846969054684e-07, + "loss": 0.4725, + "mean_token_accuracy": 0.8434274196624756, + "num_tokens": 54763422.0, + "step": 1435 + }, + { + "epoch": 0.18267396005597253, + "ewc_loss": 0.0033262930810451508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.326293153804727e-05, + "grad_norm": 2.668851852416992, + "learning_rate": 6.083086053412463e-07, + "loss": 0.4516, + "mean_token_accuracy": 0.8504718542098999, + "num_tokens": 54803494.0, + "step": 1436 + }, + { + 
"epoch": 0.18280117033456303, + "ewc_loss": 0.003290321445092559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2903215469559655e-05, + "grad_norm": 2.7810628414154053, + "learning_rate": 6.087325137770242e-07, + "loss": 0.4372, + "mean_token_accuracy": 0.8542483448982239, + "num_tokens": 54839349.0, + "step": 1437 + }, + { + "epoch": 0.18292838061315353, + "ewc_loss": 0.0033496704418212175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3496704418212175e-05, + "grad_norm": 2.719566583633423, + "learning_rate": 6.09156422212802e-07, + "loss": 0.5108, + "mean_token_accuracy": 0.8378374576568604, + "num_tokens": 54881941.0, + "step": 1438 + }, + { + "epoch": 0.18305559089174406, + "ewc_loss": 0.003318155650049448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.318155722809024e-05, + "grad_norm": 2.738734245300293, + "learning_rate": 6.095803306485799e-07, + "loss": 0.4628, + "mean_token_accuracy": 0.8448009490966797, + "num_tokens": 54922927.0, + "step": 1439 + }, + { + "epoch": 0.18318280117033456, + "ewc_loss": 0.0033307387493550777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.330738763906993e-05, + "grad_norm": 2.6811187267303467, + "learning_rate": 6.100042390843577e-07, + "loss": 0.4877, + "mean_token_accuracy": 0.8394704461097717, + "num_tokens": 54963885.0, + "step": 1440 + }, + { + "epoch": 0.1833100114489251, + "ewc_loss": 0.00331709161400795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.31709161400795e-05, + "grad_norm": 2.697726011276245, + "learning_rate": 6.104281475201356e-07, + "loss": 0.4409, + "mean_token_accuracy": 0.8529183864593506, + "num_tokens": 55001284.0, + "step": 1441 + }, + { + "epoch": 0.1834372217275156, + "ewc_loss": 0.003322568256407976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.322568227304146e-05, + "grad_norm": 2.7667829990386963, + "learning_rate": 6.108520559559135e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.8371263742446899, + "num_tokens": 55034813.0, + "step": 1442 + }, + { + "epoch": 
0.1835644320061061, + "ewc_loss": 0.003359914291650057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3599142625462264e-05, + "grad_norm": 2.687413215637207, + "learning_rate": 6.112759643916914e-07, + "loss": 0.4061, + "mean_token_accuracy": 0.8641111254692078, + "num_tokens": 55070620.0, + "step": 1443 + }, + { + "epoch": 0.18369164228469662, + "ewc_loss": 0.00332157826051116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.321578333270736e-05, + "grad_norm": 2.7913002967834473, + "learning_rate": 6.116998728274693e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.8453289270401001, + "num_tokens": 55108394.0, + "step": 1444 + }, + { + "epoch": 0.18381885256328712, + "ewc_loss": 0.0033671606797724962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3671607525320724e-05, + "grad_norm": 2.703580856323242, + "learning_rate": 6.121237812632472e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.8381171822547913, + "num_tokens": 55148170.0, + "step": 1445 + }, + { + "epoch": 0.18394606284187762, + "ewc_loss": 0.0033353185281157494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.335318615427241e-05, + "grad_norm": 2.632754325866699, + "learning_rate": 6.125476896990249e-07, + "loss": 0.4332, + "mean_token_accuracy": 0.8587610721588135, + "num_tokens": 55188102.0, + "step": 1446 + }, + { + "epoch": 0.18407327312046814, + "ewc_loss": 0.0033175856806337833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.317585651529953e-05, + "grad_norm": 2.718319892883301, + "learning_rate": 6.129715981348028e-07, + "loss": 0.471, + "mean_token_accuracy": 0.8493816256523132, + "num_tokens": 55225893.0, + "step": 1447 + }, + { + "epoch": 0.18420048339905865, + "ewc_loss": 0.0033600693568587303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3600692404434085e-05, + "grad_norm": 2.776002883911133, + "learning_rate": 6.133955065705807e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.8372877836227417, + "num_tokens": 55265321.0, + "step": 1448 + }, + { + "epoch": 0.18432769367764915, 
+ "ewc_loss": 0.0033764648251235485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3764648833312094e-05, + "grad_norm": 2.7213828563690186, + "learning_rate": 6.138194150063585e-07, + "loss": 0.4524, + "mean_token_accuracy": 0.8514790534973145, + "num_tokens": 55304179.0, + "step": 1449 + }, + { + "epoch": 0.18445490395623967, + "ewc_loss": 0.00334444222971797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.344442302477546e-05, + "grad_norm": 2.709834098815918, + "learning_rate": 6.142433234421365e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8397307991981506, + "num_tokens": 55343266.0, + "step": 1450 + }, + { + "epoch": 0.18458211423483017, + "ewc_loss": 0.0033481467980891466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3481468562968075e-05, + "grad_norm": 2.732790470123291, + "learning_rate": 6.146672318779143e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.8399524688720703, + "num_tokens": 55381147.0, + "step": 1451 + }, + { + "epoch": 0.18470932451342068, + "ewc_loss": 0.0033571585081517696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3571584935998544e-05, + "grad_norm": 2.8096837997436523, + "learning_rate": 6.150911403136923e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.8424580693244934, + "num_tokens": 55414855.0, + "step": 1452 + }, + { + "epoch": 0.1848365347920112, + "ewc_loss": 0.003389063524082303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.389063567738049e-05, + "grad_norm": 2.717384099960327, + "learning_rate": 6.155150487494701e-07, + "loss": 0.4136, + "mean_token_accuracy": 0.862495481967926, + "num_tokens": 55449699.0, + "step": 1453 + }, + { + "epoch": 0.1849637450706017, + "ewc_loss": 0.0033470219932496548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.347021993249655e-05, + "grad_norm": 2.6387312412261963, + "learning_rate": 6.159389571852479e-07, + "loss": 0.4408, + "mean_token_accuracy": 0.8561514019966125, + "num_tokens": 55493202.0, + "step": 1454 + }, + { + "epoch": 0.1850909553491922, + "ewc_loss": 
0.003340380499139428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.340380499139428e-05, + "grad_norm": 2.6292760372161865, + "learning_rate": 6.163628656210258e-07, + "loss": 0.4598, + "mean_token_accuracy": 0.8502714037895203, + "num_tokens": 55539873.0, + "step": 1455 + }, + { + "epoch": 0.18521816562778273, + "ewc_loss": 0.003358574816957116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.358574758749455e-05, + "grad_norm": 2.7710068225860596, + "learning_rate": 6.167867740568037e-07, + "loss": 0.5209, + "mean_token_accuracy": 0.8322831392288208, + "num_tokens": 55576338.0, + "step": 1456 + }, + { + "epoch": 0.18534537590637323, + "ewc_loss": 0.003410494886338711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.410494900890626e-05, + "grad_norm": 2.678158760070801, + "learning_rate": 6.172106824925815e-07, + "loss": 0.4436, + "mean_token_accuracy": 0.851790189743042, + "num_tokens": 55618330.0, + "step": 1457 + }, + { + "epoch": 0.18547258618496373, + "ewc_loss": 0.003351960564032197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.351960549480282e-05, + "grad_norm": 2.717111110687256, + "learning_rate": 6.176345909283595e-07, + "loss": 0.4602, + "mean_token_accuracy": 0.8480151891708374, + "num_tokens": 55657394.0, + "step": 1458 + }, + { + "epoch": 0.18559979646355426, + "ewc_loss": 0.0033834511414170265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3834512578323483e-05, + "grad_norm": 2.810283899307251, + "learning_rate": 6.180584993641373e-07, + "loss": 0.4447, + "mean_token_accuracy": 0.8534450531005859, + "num_tokens": 55695087.0, + "step": 1459 + }, + { + "epoch": 0.18572700674214476, + "ewc_loss": 0.003416212974116206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.416213075979613e-05, + "grad_norm": 2.7271029949188232, + "learning_rate": 6.184824077999153e-07, + "loss": 0.4292, + "mean_token_accuracy": 0.8568596839904785, + "num_tokens": 55730852.0, + "step": 1460 + }, + { + "epoch": 0.18585421702073526, + "ewc_loss": 0.003368646139279008, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.368646139279008e-05, + "grad_norm": 2.8325908184051514, + "learning_rate": 6.189063162356931e-07, + "loss": 0.5248, + "mean_token_accuracy": 0.8308219313621521, + "num_tokens": 55765678.0, + "step": 1461 + }, + { + "epoch": 0.1859814272993258, + "ewc_loss": 0.003416873048990965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.416873005335219e-05, + "grad_norm": 2.760601282119751, + "learning_rate": 6.193302246714709e-07, + "loss": 0.4517, + "mean_token_accuracy": 0.8532675504684448, + "num_tokens": 55804409.0, + "step": 1462 + }, + { + "epoch": 0.1861086375779163, + "ewc_loss": 0.0033803891856223345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.380389171070419e-05, + "grad_norm": 2.7271523475646973, + "learning_rate": 6.197541331072488e-07, + "loss": 0.4801, + "mean_token_accuracy": 0.8419520854949951, + "num_tokens": 55839520.0, + "step": 1463 + }, + { + "epoch": 0.1862358478565068, + "ewc_loss": 0.0033747972920536995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3747972338460386e-05, + "grad_norm": 2.7035024166107178, + "learning_rate": 6.201780415430267e-07, + "loss": 0.4612, + "mean_token_accuracy": 0.8480790853500366, + "num_tokens": 55879089.0, + "step": 1464 + }, + { + "epoch": 0.18636305813509732, + "ewc_loss": 0.003383421804755926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.383421790204011e-05, + "grad_norm": 2.771054744720459, + "learning_rate": 6.206019499788045e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8364412784576416, + "num_tokens": 55916655.0, + "step": 1465 + }, + { + "epoch": 0.18649026841368782, + "ewc_loss": 0.0034100906923413277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.410090721445158e-05, + "grad_norm": 2.7050259113311768, + "learning_rate": 6.210258584145825e-07, + "loss": 0.4399, + "mean_token_accuracy": 0.8534226417541504, + "num_tokens": 55953828.0, + "step": 1466 + }, + { + "epoch": 0.18661747869227835, + "ewc_loss": 0.003382315393537283, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 3.382315480848774e-05, + "grad_norm": 2.735194683074951, + "learning_rate": 6.214497668503603e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.836838960647583, + "num_tokens": 55992037.0, + "step": 1467 + }, + { + "epoch": 0.18674468897086885, + "ewc_loss": 0.003395653795450926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3956537663470954e-05, + "grad_norm": 2.85321044921875, + "learning_rate": 6.218736752861383e-07, + "loss": 0.4486, + "mean_token_accuracy": 0.8537285327911377, + "num_tokens": 56020424.0, + "step": 1468 + }, + { + "epoch": 0.18687189924945935, + "ewc_loss": 0.0034504758659750223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.450475924182683e-05, + "grad_norm": 2.747770071029663, + "learning_rate": 6.22297583721916e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8467193841934204, + "num_tokens": 56060839.0, + "step": 1469 + }, + { + "epoch": 0.18699910952804988, + "ewc_loss": 0.0034025367349386215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.402536822250113e-05, + "grad_norm": 2.7629148960113525, + "learning_rate": 6.227214921576938e-07, + "loss": 0.5098, + "mean_token_accuracy": 0.8387430906295776, + "num_tokens": 56099095.0, + "step": 1470 + }, + { + "epoch": 0.18712631980664038, + "ewc_loss": 0.0034185131080448627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.418513006181456e-05, + "grad_norm": 2.7216875553131104, + "learning_rate": 6.231454005934718e-07, + "loss": 0.4094, + "mean_token_accuracy": 0.865391731262207, + "num_tokens": 56133805.0, + "step": 1471 + }, + { + "epoch": 0.18725353008523088, + "ewc_loss": 0.0034203222021460533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.420322173042223e-05, + "grad_norm": 2.776916027069092, + "learning_rate": 6.235693090292496e-07, + "loss": 0.5261, + "mean_token_accuracy": 0.8328901529312134, + "num_tokens": 56173064.0, + "step": 1472 + }, + { + "epoch": 0.1873807403638214, + "ewc_loss": 0.00343444780446589, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 3.4344477171543986e-05, + "grad_norm": 2.657139539718628, + "learning_rate": 6.239932174650275e-07, + "loss": 0.4097, + "mean_token_accuracy": 0.8666480779647827, + "num_tokens": 56213285.0, + "step": 1473 + }, + { + "epoch": 0.1875079506424119, + "ewc_loss": 0.003382147289812565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.382147406227887e-05, + "grad_norm": 2.8674559593200684, + "learning_rate": 6.244171259008054e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8378369808197021, + "num_tokens": 56248218.0, + "step": 1474 + }, + { + "epoch": 0.1876351609210024, + "ewc_loss": 0.0034817017149180174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4817017876775935e-05, + "grad_norm": 2.7110798358917236, + "learning_rate": 6.248410343365833e-07, + "loss": 0.4347, + "mean_token_accuracy": 0.8570277094841003, + "num_tokens": 56286098.0, + "step": 1475 + }, + { + "epoch": 0.18776237119959294, + "ewc_loss": 0.003406162140890956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.40616206813138e-05, + "grad_norm": 2.6426196098327637, + "learning_rate": 6.252649427723612e-07, + "loss": 0.4437, + "mean_token_accuracy": 0.8563176989555359, + "num_tokens": 56329732.0, + "step": 1476 + }, + { + "epoch": 0.18788958147818344, + "ewc_loss": 0.003386016469448805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.386016396689229e-05, + "grad_norm": 2.714179277420044, + "learning_rate": 6.25688851208139e-07, + "loss": 0.4473, + "mean_token_accuracy": 0.8542598485946655, + "num_tokens": 56367838.0, + "step": 1477 + }, + { + "epoch": 0.18801679175677394, + "ewc_loss": 0.003430946497246623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4309465263504535e-05, + "grad_norm": 2.72346830368042, + "learning_rate": 6.261127596439168e-07, + "loss": 0.5019, + "mean_token_accuracy": 0.834621787071228, + "num_tokens": 56413059.0, + "step": 1478 + }, + { + "epoch": 0.18814400203536447, + "ewc_loss": 0.003419008804485202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
3.419008862692863e-05, + "grad_norm": 2.707240104675293, + "learning_rate": 6.265366680796948e-07, + "loss": 0.4042, + "mean_token_accuracy": 0.8621392250061035, + "num_tokens": 56453806.0, + "step": 1479 + }, + { + "epoch": 0.18827121231395497, + "ewc_loss": 0.0034092881251126528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.409288183320314e-05, + "grad_norm": 2.7107651233673096, + "learning_rate": 6.269605765154726e-07, + "loss": 0.4515, + "mean_token_accuracy": 0.8505915403366089, + "num_tokens": 56497448.0, + "step": 1480 + }, + { + "epoch": 0.18839842259254547, + "ewc_loss": 0.0034182772506028414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4182772651547566e-05, + "grad_norm": 2.7384848594665527, + "learning_rate": 6.273844849512505e-07, + "loss": 0.4545, + "mean_token_accuracy": 0.8497967720031738, + "num_tokens": 56537731.0, + "step": 1481 + }, + { + "epoch": 0.188525632871136, + "ewc_loss": 0.003425084985792637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.425085014896467e-05, + "grad_norm": 2.6417975425720215, + "learning_rate": 6.278083933870284e-07, + "loss": 0.3936, + "mean_token_accuracy": 0.8700946569442749, + "num_tokens": 56580944.0, + "step": 1482 + }, + { + "epoch": 0.1886528431497265, + "ewc_loss": 0.0033828807063400745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3828808227553964e-05, + "grad_norm": 2.8227903842926025, + "learning_rate": 6.282323018228063e-07, + "loss": 0.4252, + "mean_token_accuracy": 0.8618800640106201, + "num_tokens": 56616306.0, + "step": 1483 + }, + { + "epoch": 0.188780053428317, + "ewc_loss": 0.0034595231991261244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4595232136780396e-05, + "grad_norm": 2.706481456756592, + "learning_rate": 6.286562102585841e-07, + "loss": 0.5034, + "mean_token_accuracy": 0.8393391370773315, + "num_tokens": 56661874.0, + "step": 1484 + }, + { + "epoch": 0.18890726370690752, + "ewc_loss": 0.0033858774695545435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
3.385877425898798e-05, + "grad_norm": 2.7133824825286865, + "learning_rate": 6.29080118694362e-07, + "loss": 0.4006, + "mean_token_accuracy": 0.8658374547958374, + "num_tokens": 56696697.0, + "step": 1485 + }, + { + "epoch": 0.18903447398549802, + "ewc_loss": 0.003401135792955756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.401135836611502e-05, + "grad_norm": 2.76417875289917, + "learning_rate": 6.295040271301398e-07, + "loss": 0.3995, + "mean_token_accuracy": 0.8649924993515015, + "num_tokens": 56732581.0, + "step": 1486 + }, + { + "epoch": 0.18916168426408853, + "ewc_loss": 0.0034279085230082273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.427908450248651e-05, + "grad_norm": 2.6705567836761475, + "learning_rate": 6.299279355659178e-07, + "loss": 0.4483, + "mean_token_accuracy": 0.8536561727523804, + "num_tokens": 56777905.0, + "step": 1487 + }, + { + "epoch": 0.18928889454267905, + "ewc_loss": 0.0033755707554519176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.375570668140426e-05, + "grad_norm": 2.8353817462921143, + "learning_rate": 6.303518440016956e-07, + "loss": 0.4668, + "mean_token_accuracy": 0.8459961414337158, + "num_tokens": 56812915.0, + "step": 1488 + }, + { + "epoch": 0.18941610482126955, + "ewc_loss": 0.0034440113231539726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.444011235842481e-05, + "grad_norm": 2.8552374839782715, + "learning_rate": 6.307757524374735e-07, + "loss": 0.4611, + "mean_token_accuracy": 0.850363552570343, + "num_tokens": 56846907.0, + "step": 1489 + }, + { + "epoch": 0.18954331509986008, + "ewc_loss": 0.0034337444230914116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.433744495850988e-05, + "grad_norm": 2.7901995182037354, + "learning_rate": 6.311996608732514e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.8442445993423462, + "num_tokens": 56884614.0, + "step": 1490 + }, + { + "epoch": 0.18967052537845058, + "ewc_loss": 0.0033994242548942566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
3.399424167582765e-05, + "grad_norm": 2.8065683841705322, + "learning_rate": 6.316235693090292e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.8407475352287292, + "num_tokens": 56920508.0, + "step": 1491 + }, + { + "epoch": 0.18979773565704108, + "ewc_loss": 0.0034237392246723175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.423739326535724e-05, + "grad_norm": 2.768038272857666, + "learning_rate": 6.320474777448071e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.8402115106582642, + "num_tokens": 56957629.0, + "step": 1492 + }, + { + "epoch": 0.1899249459356316, + "ewc_loss": 0.0034148998092859983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4148997656302527e-05, + "grad_norm": 2.708875894546509, + "learning_rate": 6.324713861805849e-07, + "loss": 0.4425, + "mean_token_accuracy": 0.8513811230659485, + "num_tokens": 56997265.0, + "step": 1493 + }, + { + "epoch": 0.1900521562142221, + "ewc_loss": 0.0033941881265491247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.394188024685718e-05, + "grad_norm": 2.8668532371520996, + "learning_rate": 6.328952946163628e-07, + "loss": 0.543, + "mean_token_accuracy": 0.8265952467918396, + "num_tokens": 57031519.0, + "step": 1494 + }, + { + "epoch": 0.1901793664928126, + "ewc_loss": 0.0034711912740021944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.471191303106025e-05, + "grad_norm": 2.71079158782959, + "learning_rate": 6.333192030521407e-07, + "loss": 0.4165, + "mean_token_accuracy": 0.8582815527915955, + "num_tokens": 57067264.0, + "step": 1495 + }, + { + "epoch": 0.19030657677140314, + "ewc_loss": 0.0034050438553094864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.40504375344608e-05, + "grad_norm": 2.7790982723236084, + "learning_rate": 6.337431114879186e-07, + "loss": 0.4692, + "mean_token_accuracy": 0.8475227355957031, + "num_tokens": 57104906.0, + "step": 1496 + }, + { + "epoch": 0.19043378704999364, + "ewc_loss": 0.0034444117918610573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
3.444411777309142e-05, + "grad_norm": 2.7592597007751465, + "learning_rate": 6.341670199236965e-07, + "loss": 0.439, + "mean_token_accuracy": 0.8557177186012268, + "num_tokens": 57142534.0, + "step": 1497 + }, + { + "epoch": 0.19056099732858414, + "ewc_loss": 0.0034396490082144737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4396489354548976e-05, + "grad_norm": 2.6286659240722656, + "learning_rate": 6.345909283594744e-07, + "loss": 0.4185, + "mean_token_accuracy": 0.8608791828155518, + "num_tokens": 57188735.0, + "step": 1498 + }, + { + "epoch": 0.19068820760717467, + "ewc_loss": 0.003392665646970272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.39266553055495e-05, + "grad_norm": 2.7508294582366943, + "learning_rate": 6.350148367952522e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8394598960876465, + "num_tokens": 57224610.0, + "step": 1499 + }, + { + "epoch": 0.19081541788576517, + "ewc_loss": 0.003469737246632576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.46973720297683e-05, + "grad_norm": 2.775742292404175, + "learning_rate": 6.354387452310301e-07, + "loss": 0.4267, + "mean_token_accuracy": 0.8556162118911743, + "num_tokens": 57259553.0, + "step": 1500 + }, + { + "epoch": 0.19094262816435567, + "ewc_loss": 0.0034560817293822765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.456081685726531e-05, + "grad_norm": 2.78696870803833, + "learning_rate": 6.358626536668079e-07, + "loss": 0.4297, + "mean_token_accuracy": 0.8584574460983276, + "num_tokens": 57295819.0, + "step": 1501 + }, + { + "epoch": 0.1910698384429462, + "ewc_loss": 0.00346382986754179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.463829852989875e-05, + "grad_norm": 2.6825764179229736, + "learning_rate": 6.362865621025858e-07, + "loss": 0.4185, + "mean_token_accuracy": 0.8616563081741333, + "num_tokens": 57339110.0, + "step": 1502 + }, + { + "epoch": 0.1911970487215367, + "ewc_loss": 0.0034196535125374794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4196535125374794e-05, 
+ "grad_norm": 2.7391304969787598, + "learning_rate": 6.367104705383637e-07, + "loss": 0.4464, + "mean_token_accuracy": 0.8528807163238525, + "num_tokens": 57377311.0, + "step": 1503 + }, + { + "epoch": 0.1913242590001272, + "ewc_loss": 0.003459816100075841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4598160709720105e-05, + "grad_norm": 2.736649990081787, + "learning_rate": 6.371343789741416e-07, + "loss": 0.4578, + "mean_token_accuracy": 0.8454248905181885, + "num_tokens": 57419281.0, + "step": 1504 + }, + { + "epoch": 0.19145146927871773, + "ewc_loss": 0.003457202110439539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.457202183199115e-05, + "grad_norm": 2.7325053215026855, + "learning_rate": 6.375582874099195e-07, + "loss": 0.4478, + "mean_token_accuracy": 0.8519699573516846, + "num_tokens": 57458134.0, + "step": 1505 + }, + { + "epoch": 0.19157867955730823, + "ewc_loss": 0.0034572810400277376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.457281127339229e-05, + "grad_norm": 2.8777360916137695, + "learning_rate": 6.379821958456974e-07, + "loss": 0.4999, + "mean_token_accuracy": 0.8358391523361206, + "num_tokens": 57492931.0, + "step": 1506 + }, + { + "epoch": 0.19170588983589873, + "ewc_loss": 0.0035070967860519886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.507096698740497e-05, + "grad_norm": 2.7808725833892822, + "learning_rate": 6.384061042814751e-07, + "loss": 0.4714, + "mean_token_accuracy": 0.8456158638000488, + "num_tokens": 57529581.0, + "step": 1507 + }, + { + "epoch": 0.19183310011448926, + "ewc_loss": 0.0034531515557318926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.453151657595299e-05, + "grad_norm": 2.755740165710449, + "learning_rate": 6.38830012717253e-07, + "loss": 0.4613, + "mean_token_accuracy": 0.849429726600647, + "num_tokens": 57570302.0, + "step": 1508 + }, + { + "epoch": 0.19196031039307976, + "ewc_loss": 0.003455419559031725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.45541957358364e-05, + "grad_norm": 
2.689897298812866, + "learning_rate": 6.392539211530309e-07, + "loss": 0.4696, + "mean_token_accuracy": 0.843669056892395, + "num_tokens": 57615974.0, + "step": 1509 + }, + { + "epoch": 0.19208752067167026, + "ewc_loss": 0.0034528118558228016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.452811870374717e-05, + "grad_norm": 2.7428739070892334, + "learning_rate": 6.396778295888087e-07, + "loss": 0.4353, + "mean_token_accuracy": 0.8586139678955078, + "num_tokens": 57653737.0, + "step": 1510 + }, + { + "epoch": 0.1922147309502608, + "ewc_loss": 0.00347345182672143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4734519431367517e-05, + "grad_norm": 2.744009256362915, + "learning_rate": 6.401017380245867e-07, + "loss": 0.4704, + "mean_token_accuracy": 0.8448225855827332, + "num_tokens": 57693431.0, + "step": 1511 + }, + { + "epoch": 0.1923419412288513, + "ewc_loss": 0.0034630128648132086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.463012762949802e-05, + "grad_norm": 2.6666502952575684, + "learning_rate": 6.405256464603645e-07, + "loss": 0.4531, + "mean_token_accuracy": 0.8490257263183594, + "num_tokens": 57736413.0, + "step": 1512 + }, + { + "epoch": 0.1924691515074418, + "ewc_loss": 0.003439371706917882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.439371721469797e-05, + "grad_norm": 2.7803125381469727, + "learning_rate": 6.409495548961425e-07, + "loss": 0.4375, + "mean_token_accuracy": 0.8570227026939392, + "num_tokens": 57773293.0, + "step": 1513 + }, + { + "epoch": 0.19259636178603232, + "ewc_loss": 0.0034908116795122623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.490811650408432e-05, + "grad_norm": 2.803119659423828, + "learning_rate": 6.413734633319203e-07, + "loss": 0.4498, + "mean_token_accuracy": 0.8529976606369019, + "num_tokens": 57809294.0, + "step": 1514 + }, + { + "epoch": 0.19272357206462282, + "ewc_loss": 0.0034856274724006653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.485627530608326e-05, + "grad_norm": 2.7871510982513428, + 
"learning_rate": 6.417973717676981e-07, + "loss": 0.4227, + "mean_token_accuracy": 0.858489990234375, + "num_tokens": 57843610.0, + "step": 1515 + }, + { + "epoch": 0.19285078234321335, + "ewc_loss": 0.0034801731817424297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4801731089828536e-05, + "grad_norm": 2.68735933303833, + "learning_rate": 6.42221280203476e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.838808000087738, + "num_tokens": 57893098.0, + "step": 1516 + }, + { + "epoch": 0.19297799262180385, + "ewc_loss": 0.0034496155567467213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.449615542194806e-05, + "grad_norm": 2.7839107513427734, + "learning_rate": 6.426451886392539e-07, + "loss": 0.509, + "mean_token_accuracy": 0.8383795619010925, + "num_tokens": 57933098.0, + "step": 1517 + }, + { + "epoch": 0.19310520290039435, + "ewc_loss": 0.003500825958326459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.500825914670713e-05, + "grad_norm": 2.7435405254364014, + "learning_rate": 6.430690970750317e-07, + "loss": 0.4401, + "mean_token_accuracy": 0.8540083765983582, + "num_tokens": 57971502.0, + "step": 1518 + }, + { + "epoch": 0.19323241317898487, + "ewc_loss": 0.003483901731669903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.483901673462242e-05, + "grad_norm": 2.898855447769165, + "learning_rate": 6.434930055108097e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.840943455696106, + "num_tokens": 58001557.0, + "step": 1519 + }, + { + "epoch": 0.19335962345757537, + "ewc_loss": 0.003547399304807186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5473993193591014e-05, + "grad_norm": 2.830810785293579, + "learning_rate": 6.439169139465875e-07, + "loss": 0.4968, + "mean_token_accuracy": 0.8363347053527832, + "num_tokens": 58038058.0, + "step": 1520 + }, + { + "epoch": 0.19348683373616588, + "ewc_loss": 0.0035070166923105717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.507016663206741e-05, + "grad_norm": 2.7554750442504883, + "learning_rate": 
6.443408223823655e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.845061182975769, + "num_tokens": 58073138.0, + "step": 1521 + }, + { + "epoch": 0.1936140440147564, + "ewc_loss": 0.0034955674782395363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.495567580102943e-05, + "grad_norm": 2.7308571338653564, + "learning_rate": 6.447647308181432e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8493291139602661, + "num_tokens": 58111793.0, + "step": 1522 + }, + { + "epoch": 0.1937412542933469, + "ewc_loss": 0.003511841408908367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.511841350700706e-05, + "grad_norm": 2.6860885620117188, + "learning_rate": 6.451886392539211e-07, + "loss": 0.4457, + "mean_token_accuracy": 0.8519052267074585, + "num_tokens": 58152911.0, + "step": 1523 + }, + { + "epoch": 0.1938684645719374, + "ewc_loss": 0.003499048762023449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.499048762023449e-05, + "grad_norm": 2.7521166801452637, + "learning_rate": 6.45612547689699e-07, + "loss": 0.4634, + "mean_token_accuracy": 0.847886323928833, + "num_tokens": 58192095.0, + "step": 1524 + }, + { + "epoch": 0.19399567485052793, + "ewc_loss": 0.0035307558719068766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5307559301145375e-05, + "grad_norm": 2.7401621341705322, + "learning_rate": 6.460364561254769e-07, + "loss": 0.4189, + "mean_token_accuracy": 0.860429584980011, + "num_tokens": 58232651.0, + "step": 1525 + }, + { + "epoch": 0.19412288512911843, + "ewc_loss": 0.0035186181776225567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.518618177622557e-05, + "grad_norm": 2.7166388034820557, + "learning_rate": 6.464603645612547e-07, + "loss": 0.4176, + "mean_token_accuracy": 0.86229008436203, + "num_tokens": 58270802.0, + "step": 1526 + }, + { + "epoch": 0.19425009540770893, + "ewc_loss": 0.0035249507054686546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.524950807332061e-05, + "grad_norm": 2.7050955295562744, + "learning_rate": 6.468842729970327e-07, 
+ "loss": 0.4494, + "mean_token_accuracy": 0.8539174795150757, + "num_tokens": 58313179.0, + "step": 1527 + }, + { + "epoch": 0.19437730568629946, + "ewc_loss": 0.0035244959872215986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.524496059981175e-05, + "grad_norm": 2.8203954696655273, + "learning_rate": 6.473081814328105e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.8455443978309631, + "num_tokens": 58354728.0, + "step": 1528 + }, + { + "epoch": 0.19450451596488996, + "ewc_loss": 0.0035667014308273792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.566701343515888e-05, + "grad_norm": 2.7300970554351807, + "learning_rate": 6.477320898685885e-07, + "loss": 0.4706, + "mean_token_accuracy": 0.848668098449707, + "num_tokens": 58398649.0, + "step": 1529 + }, + { + "epoch": 0.19463172624348046, + "ewc_loss": 0.003517462406307459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5174623917555436e-05, + "grad_norm": 2.7893991470336914, + "learning_rate": 6.481559983043662e-07, + "loss": 0.482, + "mean_token_accuracy": 0.841131865978241, + "num_tokens": 58438261.0, + "step": 1530 + }, + { + "epoch": 0.194758936522071, + "ewc_loss": 0.0035511425230652094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.551142435753718e-05, + "grad_norm": 2.7484288215637207, + "learning_rate": 6.48579906740144e-07, + "loss": 0.4279, + "mean_token_accuracy": 0.8572835922241211, + "num_tokens": 58479141.0, + "step": 1531 + }, + { + "epoch": 0.1948861468006615, + "ewc_loss": 0.0035300515592098236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5300516174174845e-05, + "grad_norm": 2.754713535308838, + "learning_rate": 6.49003815175922e-07, + "loss": 0.3971, + "mean_token_accuracy": 0.8686014413833618, + "num_tokens": 58515700.0, + "step": 1532 + }, + { + "epoch": 0.195013357079252, + "ewc_loss": 0.0035303947515785694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.530394678818993e-05, + "grad_norm": 2.809260368347168, + "learning_rate": 6.494277236116998e-07, + "loss": 0.4028, + 
"mean_token_accuracy": 0.8664270043373108, + "num_tokens": 58549825.0, + "step": 1533 + }, + { + "epoch": 0.19514056735784252, + "ewc_loss": 0.0035610992927104235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.561099219950847e-05, + "grad_norm": 2.7389330863952637, + "learning_rate": 6.498516320474777e-07, + "loss": 0.5428, + "mean_token_accuracy": 0.8274203538894653, + "num_tokens": 58596110.0, + "step": 1534 + }, + { + "epoch": 0.19526777763643302, + "ewc_loss": 0.0035201243590563536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5201243008486927e-05, + "grad_norm": 2.783973455429077, + "learning_rate": 6.502755404832556e-07, + "loss": 0.4528, + "mean_token_accuracy": 0.8491329550743103, + "num_tokens": 58632410.0, + "step": 1535 + }, + { + "epoch": 0.19539498791502352, + "ewc_loss": 0.0035459729842841625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.545972867868841e-05, + "grad_norm": 2.8199427127838135, + "learning_rate": 6.506994489190335e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.8371862173080444, + "num_tokens": 58671339.0, + "step": 1536 + }, + { + "epoch": 0.19552219819361405, + "ewc_loss": 0.0035527620930224657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.552762063918635e-05, + "grad_norm": 2.6976053714752197, + "learning_rate": 6.511233573548114e-07, + "loss": 0.415, + "mean_token_accuracy": 0.8616347312927246, + "num_tokens": 58711786.0, + "step": 1537 + }, + { + "epoch": 0.19564940847220455, + "ewc_loss": 0.0035006366670131683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5006367397727445e-05, + "grad_norm": 2.851487874984741, + "learning_rate": 6.515472657905892e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.848604679107666, + "num_tokens": 58743600.0, + "step": 1538 + }, + { + "epoch": 0.19577661875079505, + "ewc_loss": 0.003582267090678215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.582267163437791e-05, + "grad_norm": 2.8499717712402344, + "learning_rate": 6.51971174226367e-07, + "loss": 0.5124, + 
"mean_token_accuracy": 0.8341367244720459, + "num_tokens": 58780622.0, + "step": 1539 + }, + { + "epoch": 0.19590382902938558, + "ewc_loss": 0.003568944986909628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.568944885046221e-05, + "grad_norm": 2.8135101795196533, + "learning_rate": 6.52395082662145e-07, + "loss": 0.5051, + "mean_token_accuracy": 0.8348501920700073, + "num_tokens": 58819370.0, + "step": 1540 + }, + { + "epoch": 0.19603103930797608, + "ewc_loss": 0.003544806968420744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.544806895661168e-05, + "grad_norm": 2.741079807281494, + "learning_rate": 6.528189910979228e-07, + "loss": 0.4316, + "mean_token_accuracy": 0.8573300838470459, + "num_tokens": 58856270.0, + "step": 1541 + }, + { + "epoch": 0.1961582495865666, + "ewc_loss": 0.003542388789355755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.542388731148094e-05, + "grad_norm": 2.6791024208068848, + "learning_rate": 6.532428995337007e-07, + "loss": 0.4491, + "mean_token_accuracy": 0.8517249226570129, + "num_tokens": 58898702.0, + "step": 1542 + }, + { + "epoch": 0.1962854598651571, + "ewc_loss": 0.003526979358866811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5269793443148956e-05, + "grad_norm": 2.8144450187683105, + "learning_rate": 6.536668079694786e-07, + "loss": 0.4697, + "mean_token_accuracy": 0.8433713912963867, + "num_tokens": 58934602.0, + "step": 1543 + }, + { + "epoch": 0.1964126701437476, + "ewc_loss": 0.0035981202963739634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.598120383685455e-05, + "grad_norm": 2.7175683975219727, + "learning_rate": 6.540907164052565e-07, + "loss": 0.4588, + "mean_token_accuracy": 0.8481504917144775, + "num_tokens": 58977766.0, + "step": 1544 + }, + { + "epoch": 0.19653988042233814, + "ewc_loss": 0.0035433487500995398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5433487937552854e-05, + "grad_norm": 2.693854570388794, + "learning_rate": 6.545146248410343e-07, + "loss": 0.4662, + 
"mean_token_accuracy": 0.8461370468139648, + "num_tokens": 59019649.0, + "step": 1545 + }, + { + "epoch": 0.19666709070092864, + "ewc_loss": 0.003546824911609292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5468248825054616e-05, + "grad_norm": 2.7938034534454346, + "learning_rate": 6.549385332768122e-07, + "loss": 0.4778, + "mean_token_accuracy": 0.8447576761245728, + "num_tokens": 59060543.0, + "step": 1546 + }, + { + "epoch": 0.19679430097951914, + "ewc_loss": 0.0035999813117086887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.599981209845282e-05, + "grad_norm": 2.832280158996582, + "learning_rate": 6.5536244171259e-07, + "loss": 0.4781, + "mean_token_accuracy": 0.8419670462608337, + "num_tokens": 59100049.0, + "step": 1547 + }, + { + "epoch": 0.19692151125810967, + "ewc_loss": 0.003606583457440138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.606583413784392e-05, + "grad_norm": 2.7819740772247314, + "learning_rate": 6.55786350148368e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.8412528038024902, + "num_tokens": 59137370.0, + "step": 1548 + }, + { + "epoch": 0.19704872153670017, + "ewc_loss": 0.003580313641577959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.580313568818383e-05, + "grad_norm": 2.8003170490264893, + "learning_rate": 6.562102585841458e-07, + "loss": 0.4366, + "mean_token_accuracy": 0.8505169153213501, + "num_tokens": 59176196.0, + "step": 1549 + }, + { + "epoch": 0.19717593181529067, + "ewc_loss": 0.003596565453335643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.596565511543304e-05, + "grad_norm": 2.763333797454834, + "learning_rate": 6.566341670199236e-07, + "loss": 0.4342, + "mean_token_accuracy": 0.8565431237220764, + "num_tokens": 59216215.0, + "step": 1550 + }, + { + "epoch": 0.1973031420938812, + "ewc_loss": 0.0035826044622808695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5826044040732086e-05, + "grad_norm": 2.8008267879486084, + "learning_rate": 6.570580754557016e-07, + "loss": 0.45, + "mean_token_accuracy": 
0.8512415885925293, + "num_tokens": 59254173.0, + "step": 1551 + }, + { + "epoch": 0.1974303523724717, + "ewc_loss": 0.00360251497477293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6025150620844215e-05, + "grad_norm": 2.85449481010437, + "learning_rate": 6.574819838914794e-07, + "loss": 0.4235, + "mean_token_accuracy": 0.8598707318305969, + "num_tokens": 59288496.0, + "step": 1552 + }, + { + "epoch": 0.1975575626510622, + "ewc_loss": 0.003612527623772621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.612527507357299e-05, + "grad_norm": 2.7497992515563965, + "learning_rate": 6.579058923272573e-07, + "loss": 0.4797, + "mean_token_accuracy": 0.8422501087188721, + "num_tokens": 59329043.0, + "step": 1553 + }, + { + "epoch": 0.19768477292965272, + "ewc_loss": 0.003574987640604377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5749875678448007e-05, + "grad_norm": 2.821547269821167, + "learning_rate": 6.583298007630351e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8452292084693909, + "num_tokens": 59360743.0, + "step": 1554 + }, + { + "epoch": 0.19781198320824323, + "ewc_loss": 0.003615382593125105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.615382593125105e-05, + "grad_norm": 2.714005470275879, + "learning_rate": 6.58753709198813e-07, + "loss": 0.4691, + "mean_token_accuracy": 0.8464149236679077, + "num_tokens": 59404167.0, + "step": 1555 + }, + { + "epoch": 0.19793919348683373, + "ewc_loss": 0.0035688509233295918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5688510251929983e-05, + "grad_norm": 2.844679832458496, + "learning_rate": 6.591776176345909e-07, + "loss": 0.4371, + "mean_token_accuracy": 0.8519245982170105, + "num_tokens": 59447770.0, + "step": 1556 + }, + { + "epoch": 0.19806640376542425, + "ewc_loss": 0.00363222137093544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6322213418316096e-05, + "grad_norm": 2.7203917503356934, + "learning_rate": 6.596015260703688e-07, + "loss": 0.4454, + "mean_token_accuracy": 0.8562108278274536, + 
"num_tokens": 59494989.0, + "step": 1557 + }, + { + "epoch": 0.19819361404401475, + "ewc_loss": 0.0035614098887890577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.561409903340973e-05, + "grad_norm": 2.7572739124298096, + "learning_rate": 6.600254345061466e-07, + "loss": 0.4334, + "mean_token_accuracy": 0.8584423065185547, + "num_tokens": 59534926.0, + "step": 1558 + }, + { + "epoch": 0.19832082432260525, + "ewc_loss": 0.0035917568020522594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.59175683115609e-05, + "grad_norm": 2.777609348297119, + "learning_rate": 6.604493429419246e-07, + "loss": 0.4878, + "mean_token_accuracy": 0.8404383063316345, + "num_tokens": 59577832.0, + "step": 1559 + }, + { + "epoch": 0.19844803460119578, + "ewc_loss": 0.003604600438848138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6046003515366465e-05, + "grad_norm": 2.717327117919922, + "learning_rate": 6.608732513777023e-07, + "loss": 0.4951, + "mean_token_accuracy": 0.8454908132553101, + "num_tokens": 59624995.0, + "step": 1560 + }, + { + "epoch": 0.19857524487978628, + "ewc_loss": 0.00357642094604671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.576420931494795e-05, + "grad_norm": 2.7731683254241943, + "learning_rate": 6.612971598134803e-07, + "loss": 0.4492, + "mean_token_accuracy": 0.8511779308319092, + "num_tokens": 59663000.0, + "step": 1561 + }, + { + "epoch": 0.19870245515837678, + "ewc_loss": 0.0036108256317675114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.610825660871342e-05, + "grad_norm": 2.8567519187927246, + "learning_rate": 6.617210682492581e-07, + "loss": 0.5003, + "mean_token_accuracy": 0.8369277119636536, + "num_tokens": 59700241.0, + "step": 1562 + }, + { + "epoch": 0.1988296654369673, + "ewc_loss": 0.003633949439972639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.633949381764978e-05, + "grad_norm": 2.7945075035095215, + "learning_rate": 6.62144976685036e-07, + "loss": 0.4232, + "mean_token_accuracy": 0.8608075976371765, + "num_tokens": 
59737244.0, + "step": 1563 + }, + { + "epoch": 0.1989568757155578, + "ewc_loss": 0.003593913046643138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.593913061195053e-05, + "grad_norm": 2.791080951690674, + "learning_rate": 6.625688851208139e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.845604658126831, + "num_tokens": 59775538.0, + "step": 1564 + }, + { + "epoch": 0.19908408599414834, + "ewc_loss": 0.0036053520161658525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6053519579581916e-05, + "grad_norm": 2.8666317462921143, + "learning_rate": 6.629927935565918e-07, + "loss": 0.4671, + "mean_token_accuracy": 0.8439714908599854, + "num_tokens": 59810373.0, + "step": 1565 + }, + { + "epoch": 0.19921129627273884, + "ewc_loss": 0.003645101562142372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.645101605798118e-05, + "grad_norm": 2.804206609725952, + "learning_rate": 6.634167019923696e-07, + "loss": 0.499, + "mean_token_accuracy": 0.8410273790359497, + "num_tokens": 59849992.0, + "step": 1566 + }, + { + "epoch": 0.19933850655132934, + "ewc_loss": 0.0036109155043959618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.610915518947877e-05, + "grad_norm": 2.863180160522461, + "learning_rate": 6.638406104281476e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8490558862686157, + "num_tokens": 59882552.0, + "step": 1567 + }, + { + "epoch": 0.19946571682991987, + "ewc_loss": 0.00365075608715415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6507561162579805e-05, + "grad_norm": 2.737119674682617, + "learning_rate": 6.642645188639253e-07, + "loss": 0.4339, + "mean_token_accuracy": 0.8552064895629883, + "num_tokens": 59925090.0, + "step": 1568 + }, + { + "epoch": 0.19959292710851037, + "ewc_loss": 0.003595386864617467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.595386806409806e-05, + "grad_norm": 2.771665096282959, + "learning_rate": 6.646884272997032e-07, + "loss": 0.4806, + "mean_token_accuracy": 0.8429603576660156, + "num_tokens": 59965406.0, + "step": 
1569 + }, + { + "epoch": 0.19972013738710087, + "ewc_loss": 0.0036240704357624054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6240704503143206e-05, + "grad_norm": 2.824439525604248, + "learning_rate": 6.651123357354811e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8464294075965881, + "num_tokens": 60001770.0, + "step": 1570 + }, + { + "epoch": 0.1998473476656914, + "ewc_loss": 0.0036537006963044405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6537006963044405e-05, + "grad_norm": 2.8728067874908447, + "learning_rate": 6.655362441712589e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.8382394313812256, + "num_tokens": 60036911.0, + "step": 1571 + }, + { + "epoch": 0.1999745579442819, + "ewc_loss": 0.003666272386908531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.666272459668107e-05, + "grad_norm": 2.7277114391326904, + "learning_rate": 6.659601526070369e-07, + "loss": 0.5129, + "mean_token_accuracy": 0.8357675671577454, + "num_tokens": 60080210.0, + "step": 1572 + }, + { + "epoch": 0.2001017682228724, + "ewc_loss": 0.003604222321882844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6042223655385897e-05, + "grad_norm": 2.911964178085327, + "learning_rate": 6.663840610428147e-07, + "loss": 0.5453, + "mean_token_accuracy": 0.8282649517059326, + "num_tokens": 60111533.0, + "step": 1573 + }, + { + "epoch": 0.20022897850146293, + "ewc_loss": 0.003701548557728529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.701548484968953e-05, + "grad_norm": 2.7753820419311523, + "learning_rate": 6.668079694785926e-07, + "loss": 0.4266, + "mean_token_accuracy": 0.8586256504058838, + "num_tokens": 60150733.0, + "step": 1574 + }, + { + "epoch": 0.20035618878005343, + "ewc_loss": 0.0036308257840573788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.630825813161209e-05, + "grad_norm": 2.8266615867614746, + "learning_rate": 6.672318779143704e-07, + "loss": 0.4442, + "mean_token_accuracy": 0.8542906641960144, + "num_tokens": 60185034.0, + "step": 1575 + }, + { + 
"epoch": 0.20048339905864393, + "ewc_loss": 0.0036686852108687162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6686851672129706e-05, + "grad_norm": 2.938011407852173, + "learning_rate": 6.676557863501483e-07, + "loss": 0.4748, + "mean_token_accuracy": 0.844824492931366, + "num_tokens": 60216865.0, + "step": 1576 + }, + { + "epoch": 0.20061060933723446, + "ewc_loss": 0.0037150850985199213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7150850403122604e-05, + "grad_norm": 2.7494053840637207, + "learning_rate": 6.680796947859262e-07, + "loss": 0.4618, + "mean_token_accuracy": 0.8477108478546143, + "num_tokens": 60257685.0, + "step": 1577 + }, + { + "epoch": 0.20073781961582496, + "ewc_loss": 0.0036297079641371965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.62970786227379e-05, + "grad_norm": 2.8205180168151855, + "learning_rate": 6.685036032217041e-07, + "loss": 0.4592, + "mean_token_accuracy": 0.849793553352356, + "num_tokens": 60294258.0, + "step": 1578 + }, + { + "epoch": 0.20086502989441546, + "ewc_loss": 0.0036915112286806107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.691511301440187e-05, + "grad_norm": 2.7847602367401123, + "learning_rate": 6.689275116574819e-07, + "loss": 0.4473, + "mean_token_accuracy": 0.8553746342658997, + "num_tokens": 60330319.0, + "step": 1579 + }, + { + "epoch": 0.200992240173006, + "ewc_loss": 0.003678272245451808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6782723327632993e-05, + "grad_norm": 2.760693311691284, + "learning_rate": 6.693514200932599e-07, + "loss": 0.4655, + "mean_token_accuracy": 0.8494447469711304, + "num_tokens": 60370482.0, + "step": 1580 + }, + { + "epoch": 0.2011194504515965, + "ewc_loss": 0.0036730896681547165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6730896681547165e-05, + "grad_norm": 2.756410837173462, + "learning_rate": 6.697753285290377e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.8454535007476807, + "num_tokens": 60412436.0, + "step": 1581 + }, + { + "epoch": 
0.201246660730187, + "ewc_loss": 0.003679055953398347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.679055953398347e-05, + "grad_norm": 2.859168767929077, + "learning_rate": 6.701992369648156e-07, + "loss": 0.3943, + "mean_token_accuracy": 0.8693241477012634, + "num_tokens": 60448982.0, + "step": 1582 + }, + { + "epoch": 0.20137387100877752, + "ewc_loss": 0.0037227014545351267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7227015127427876e-05, + "grad_norm": 2.752687692642212, + "learning_rate": 6.706231454005934e-07, + "loss": 0.411, + "mean_token_accuracy": 0.8615803718566895, + "num_tokens": 60489878.0, + "step": 1583 + }, + { + "epoch": 0.20150108128736802, + "ewc_loss": 0.003654609201475978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.654609099612571e-05, + "grad_norm": 2.839160203933716, + "learning_rate": 6.710470538363713e-07, + "loss": 0.4468, + "mean_token_accuracy": 0.8549766540527344, + "num_tokens": 60528979.0, + "step": 1584 + }, + { + "epoch": 0.20162829156595852, + "ewc_loss": 0.0037050852552056313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7050853279652074e-05, + "grad_norm": 2.8907697200775146, + "learning_rate": 6.714709622721492e-07, + "loss": 0.4756, + "mean_token_accuracy": 0.8445338010787964, + "num_tokens": 60561670.0, + "step": 1585 + }, + { + "epoch": 0.20175550184454905, + "ewc_loss": 0.003719933331012726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7199333746684715e-05, + "grad_norm": 2.7799627780914307, + "learning_rate": 6.718948707079271e-07, + "loss": 0.4487, + "mean_token_accuracy": 0.8506225347518921, + "num_tokens": 60602061.0, + "step": 1586 + }, + { + "epoch": 0.20188271212313955, + "ewc_loss": 0.003654396627098322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.654396641650237e-05, + "grad_norm": 2.7955868244171143, + "learning_rate": 6.723187791437049e-07, + "loss": 0.4514, + "mean_token_accuracy": 0.852515459060669, + "num_tokens": 60639805.0, + "step": 1587 + }, + { + "epoch": 0.20200992240173005, 
+ "ewc_loss": 0.003684466006234288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.684465991682373e-05, + "grad_norm": 2.79433274269104, + "learning_rate": 6.727426875794829e-07, + "loss": 0.4789, + "mean_token_accuracy": 0.8429144620895386, + "num_tokens": 60681816.0, + "step": 1588 + }, + { + "epoch": 0.20213713268032057, + "ewc_loss": 0.003691726364195347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.691726305987686e-05, + "grad_norm": 2.7211716175079346, + "learning_rate": 6.731665960152607e-07, + "loss": 0.4633, + "mean_token_accuracy": 0.8454511165618896, + "num_tokens": 60726343.0, + "step": 1589 + }, + { + "epoch": 0.20226434295891108, + "ewc_loss": 0.0036502471193671227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.650247163022868e-05, + "grad_norm": 2.84523868560791, + "learning_rate": 6.735905044510385e-07, + "loss": 0.5079, + "mean_token_accuracy": 0.8283390998840332, + "num_tokens": 60764814.0, + "step": 1590 + }, + { + "epoch": 0.2023915532375016, + "ewc_loss": 0.0037113691214472055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.711369208758697e-05, + "grad_norm": 2.8987784385681152, + "learning_rate": 6.740144128868164e-07, + "loss": 0.4738, + "mean_token_accuracy": 0.8450742363929749, + "num_tokens": 60802384.0, + "step": 1591 + }, + { + "epoch": 0.2025187635160921, + "ewc_loss": 0.0037149565760046244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.71495661966037e-05, + "grad_norm": 2.7795522212982178, + "learning_rate": 6.744383213225942e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.8363626003265381, + "num_tokens": 60842085.0, + "step": 1592 + }, + { + "epoch": 0.2026459737946826, + "ewc_loss": 0.0036463686265051365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.646368713816628e-05, + "grad_norm": 2.8394436836242676, + "learning_rate": 6.748622297583722e-07, + "loss": 0.4369, + "mean_token_accuracy": 0.8519237041473389, + "num_tokens": 60874835.0, + "step": 1593 + }, + { + "epoch": 0.20277318407327313, + "ewc_loss": 
0.0036930644419044256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.693064354592934e-05, + "grad_norm": 2.797450304031372, + "learning_rate": 6.7528613819415e-07, + "loss": 0.4606, + "mean_token_accuracy": 0.846030592918396, + "num_tokens": 60910682.0, + "step": 1594 + }, + { + "epoch": 0.20290039435186363, + "ewc_loss": 0.0036771497689187527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.677149652503431e-05, + "grad_norm": 2.7696380615234375, + "learning_rate": 6.757100466299279e-07, + "loss": 0.459, + "mean_token_accuracy": 0.850773811340332, + "num_tokens": 60951352.0, + "step": 1595 + }, + { + "epoch": 0.20302760463045413, + "ewc_loss": 0.0036602236796170473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.660223592305556e-05, + "grad_norm": 3.0220947265625, + "learning_rate": 6.761339550657058e-07, + "loss": 0.4839, + "mean_token_accuracy": 0.8397207260131836, + "num_tokens": 60979055.0, + "step": 1596 + }, + { + "epoch": 0.20315481490904466, + "ewc_loss": 0.0037828804925084114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.782880594371818e-05, + "grad_norm": 2.7667131423950195, + "learning_rate": 6.765578635014837e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.8487964272499084, + "num_tokens": 61017437.0, + "step": 1597 + }, + { + "epoch": 0.20328202518763516, + "ewc_loss": 0.0036621803883463144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6621804611058906e-05, + "grad_norm": 2.8264145851135254, + "learning_rate": 6.769817719372614e-07, + "loss": 0.4415, + "mean_token_accuracy": 0.8535003662109375, + "num_tokens": 61052286.0, + "step": 1598 + }, + { + "epoch": 0.20340923546622566, + "ewc_loss": 0.003714136313647032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.714136255439371e-05, + "grad_norm": 2.8744800090789795, + "learning_rate": 6.774056803730394e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.8408651351928711, + "num_tokens": 61086110.0, + "step": 1599 + }, + { + "epoch": 0.2035364457448162, + "ewc_loss": 0.003752462798729539, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.752462725969963e-05, + "grad_norm": 2.7562849521636963, + "learning_rate": 6.778295888088172e-07, + "loss": 0.4587, + "mean_token_accuracy": 0.849744975566864, + "num_tokens": 61124181.0, + "step": 1600 + }, + { + "epoch": 0.2036636560234067, + "ewc_loss": 0.003695329651236534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.69532972399611e-05, + "grad_norm": 2.867601156234741, + "learning_rate": 6.782534972445952e-07, + "loss": 0.4774, + "mean_token_accuracy": 0.8422688245773315, + "num_tokens": 61160881.0, + "step": 1601 + }, + { + "epoch": 0.2037908663019972, + "ewc_loss": 0.003762408858165145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7624089600285515e-05, + "grad_norm": 2.8921148777008057, + "learning_rate": 6.78677405680373e-07, + "loss": 0.4758, + "mean_token_accuracy": 0.8457322120666504, + "num_tokens": 61200755.0, + "step": 1602 + }, + { + "epoch": 0.20391807658058772, + "ewc_loss": 0.0037678193766623735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.767819362110458e-05, + "grad_norm": 2.7849175930023193, + "learning_rate": 6.791013141161509e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8438772559165955, + "num_tokens": 61240045.0, + "step": 1603 + }, + { + "epoch": 0.20404528685917822, + "ewc_loss": 0.0037183137610554695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.718313746503554e-05, + "grad_norm": 2.7976815700531006, + "learning_rate": 6.795252225519288e-07, + "loss": 0.4916, + "mean_token_accuracy": 0.8366366624832153, + "num_tokens": 61282832.0, + "step": 1604 + }, + { + "epoch": 0.20417249713776872, + "ewc_loss": 0.0037299320101737976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7299319956218824e-05, + "grad_norm": 2.7871434688568115, + "learning_rate": 6.799491309877067e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8442143201828003, + "num_tokens": 61319841.0, + "step": 1605 + }, + { + "epoch": 0.20429970741635925, + "ewc_loss": 0.0037369595374912024, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 3.7369594792835414e-05, + "grad_norm": 2.8876054286956787, + "learning_rate": 6.803730394234844e-07, + "loss": 0.434, + "mean_token_accuracy": 0.8569824695587158, + "num_tokens": 61352732.0, + "step": 1606 + }, + { + "epoch": 0.20442691769494975, + "ewc_loss": 0.0037725602742284536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.77256037609186e-05, + "grad_norm": 2.8571560382843018, + "learning_rate": 6.807969478592624e-07, + "loss": 0.4673, + "mean_token_accuracy": 0.8482055068016052, + "num_tokens": 61388324.0, + "step": 1607 + }, + { + "epoch": 0.20455412797354025, + "ewc_loss": 0.0037487014196813107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7487014196813107e-05, + "grad_norm": 2.737333059310913, + "learning_rate": 6.812208562950402e-07, + "loss": 0.4909, + "mean_token_accuracy": 0.8446362018585205, + "num_tokens": 61431222.0, + "step": 1608 + }, + { + "epoch": 0.20468133825213078, + "ewc_loss": 0.0037151407450437546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.715140701388009e-05, + "grad_norm": 2.7656986713409424, + "learning_rate": 6.816447647308182e-07, + "loss": 0.5196, + "mean_token_accuracy": 0.8304646015167236, + "num_tokens": 61475617.0, + "step": 1609 + }, + { + "epoch": 0.20480854853072128, + "ewc_loss": 0.0037488157395273447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.748815652215853e-05, + "grad_norm": 2.8095176219940186, + "learning_rate": 6.82068673166596e-07, + "loss": 0.4491, + "mean_token_accuracy": 0.8527981042861938, + "num_tokens": 61511338.0, + "step": 1610 + }, + { + "epoch": 0.20493575880931178, + "ewc_loss": 0.003761257976293564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.761257903533988e-05, + "grad_norm": 2.82049298286438, + "learning_rate": 6.824925816023738e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8439871072769165, + "num_tokens": 61549093.0, + "step": 1611 + }, + { + "epoch": 0.2050629690879023, + "ewc_loss": 0.003755304729565978, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 3.755304715014063e-05, + "grad_norm": 2.727200746536255, + "learning_rate": 6.829164900381518e-07, + "loss": 0.439, + "mean_token_accuracy": 0.8548356294631958, + "num_tokens": 61593843.0, + "step": 1612 + }, + { + "epoch": 0.2051901793664928, + "ewc_loss": 0.0037243033293634653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.72430331481155e-05, + "grad_norm": 2.8293991088867188, + "learning_rate": 6.833403984739295e-07, + "loss": 0.4495, + "mean_token_accuracy": 0.8511960506439209, + "num_tokens": 61629362.0, + "step": 1613 + }, + { + "epoch": 0.2053173896450833, + "ewc_loss": 0.003779228776693344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7792287912452593e-05, + "grad_norm": 2.866241455078125, + "learning_rate": 6.837643069097074e-07, + "loss": 0.5304, + "mean_token_accuracy": 0.8286471366882324, + "num_tokens": 61667037.0, + "step": 1614 + }, + { + "epoch": 0.20544459992367384, + "ewc_loss": 0.0037774876691401005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.777487654588185e-05, + "grad_norm": 2.884164810180664, + "learning_rate": 6.841882153454853e-07, + "loss": 0.4466, + "mean_token_accuracy": 0.8505418300628662, + "num_tokens": 61701976.0, + "step": 1615 + }, + { + "epoch": 0.20557181020226434, + "ewc_loss": 0.003782645333558321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.782645217142999e-05, + "grad_norm": 2.8337554931640625, + "learning_rate": 6.846121237812632e-07, + "loss": 0.4143, + "mean_token_accuracy": 0.8628145456314087, + "num_tokens": 61739068.0, + "step": 1616 + }, + { + "epoch": 0.20569902048085487, + "ewc_loss": 0.0037598733324557543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7598732888000086e-05, + "grad_norm": 2.778677463531494, + "learning_rate": 6.850360322170411e-07, + "loss": 0.4351, + "mean_token_accuracy": 0.8564536571502686, + "num_tokens": 61781515.0, + "step": 1617 + }, + { + "epoch": 0.20582623075944537, + "ewc_loss": 0.003748974297195673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
3.7489742680918425e-05, + "grad_norm": 2.909014940261841, + "learning_rate": 6.85459940652819e-07, + "loss": 0.4474, + "mean_token_accuracy": 0.8502451181411743, + "num_tokens": 61815201.0, + "step": 1618 + }, + { + "epoch": 0.20595344103803587, + "ewc_loss": 0.0038044946268200874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.804494554060511e-05, + "grad_norm": 2.817138195037842, + "learning_rate": 6.858838490885968e-07, + "loss": 0.5027, + "mean_token_accuracy": 0.8351039886474609, + "num_tokens": 61856639.0, + "step": 1619 + }, + { + "epoch": 0.2060806513166264, + "ewc_loss": 0.0037485146895051003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.748514791368507e-05, + "grad_norm": 2.843804359436035, + "learning_rate": 6.863077575243748e-07, + "loss": 0.5136, + "mean_token_accuracy": 0.8344679474830627, + "num_tokens": 61894313.0, + "step": 1620 + }, + { + "epoch": 0.2062078615952169, + "ewc_loss": 0.003767625894397497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.767625821637921e-05, + "grad_norm": 2.8537638187408447, + "learning_rate": 6.867316659601525e-07, + "loss": 0.4505, + "mean_token_accuracy": 0.8502371311187744, + "num_tokens": 61930748.0, + "step": 1621 + }, + { + "epoch": 0.2063350718738074, + "ewc_loss": 0.0037757176905870438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.775717777898535e-05, + "grad_norm": 2.758979082107544, + "learning_rate": 6.871555743959304e-07, + "loss": 0.4664, + "mean_token_accuracy": 0.8481485843658447, + "num_tokens": 61972002.0, + "step": 1622 + }, + { + "epoch": 0.20646228215239792, + "ewc_loss": 0.0037364906165748835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.736490543815307e-05, + "grad_norm": 2.786036968231201, + "learning_rate": 6.875794828317083e-07, + "loss": 0.4423, + "mean_token_accuracy": 0.856001615524292, + "num_tokens": 62010141.0, + "step": 1623 + }, + { + "epoch": 0.20658949243098843, + "ewc_loss": 0.0037655106279999018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.765510700759478e-05, 
+ "grad_norm": 2.7049663066864014, + "learning_rate": 6.880033912674862e-07, + "loss": 0.4299, + "mean_token_accuracy": 0.8585425615310669, + "num_tokens": 62055907.0, + "step": 1624 + }, + { + "epoch": 0.20671670270957893, + "ewc_loss": 0.00373729201965034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7372919905465096e-05, + "grad_norm": 2.8719027042388916, + "learning_rate": 6.884272997032641e-07, + "loss": 0.5361, + "mean_token_accuracy": 0.8248215913772583, + "num_tokens": 62095885.0, + "step": 1625 + }, + { + "epoch": 0.20684391298816945, + "ewc_loss": 0.003816179698333144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.81617974198889e-05, + "grad_norm": 2.8325183391571045, + "learning_rate": 6.88851208139042e-07, + "loss": 0.4289, + "mean_token_accuracy": 0.8604057431221008, + "num_tokens": 62130497.0, + "step": 1626 + }, + { + "epoch": 0.20697112326675995, + "ewc_loss": 0.003782362909987569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.782362909987569e-05, + "grad_norm": 2.8423829078674316, + "learning_rate": 6.892751165748198e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8389374017715454, + "num_tokens": 62170296.0, + "step": 1627 + }, + { + "epoch": 0.20709833354535045, + "ewc_loss": 0.0037836784031242132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.783678403124213e-05, + "grad_norm": 2.7789218425750732, + "learning_rate": 6.896990250105978e-07, + "loss": 0.4281, + "mean_token_accuracy": 0.8607546091079712, + "num_tokens": 62210168.0, + "step": 1628 + }, + { + "epoch": 0.20722554382394098, + "ewc_loss": 0.003767388639971614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7673886254196987e-05, + "grad_norm": 2.866753101348877, + "learning_rate": 6.901229334463755e-07, + "loss": 0.4502, + "mean_token_accuracy": 0.8502635955810547, + "num_tokens": 62245251.0, + "step": 1629 + }, + { + "epoch": 0.20735275410253148, + "ewc_loss": 0.0038089423906058073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8089423469500616e-05, + "grad_norm": 
2.8067727088928223, + "learning_rate": 6.905468418821534e-07, + "loss": 0.4509, + "mean_token_accuracy": 0.8563734889030457, + "num_tokens": 62286124.0, + "step": 1630 + }, + { + "epoch": 0.20747996438112198, + "ewc_loss": 0.0037762951105833054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7762951251352206e-05, + "grad_norm": 2.865694046020508, + "learning_rate": 6.909707503179313e-07, + "loss": 0.4544, + "mean_token_accuracy": 0.8470796346664429, + "num_tokens": 62325604.0, + "step": 1631 + }, + { + "epoch": 0.2076071746597125, + "ewc_loss": 0.003803611733019352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.80361161660403e-05, + "grad_norm": 2.7879984378814697, + "learning_rate": 6.913946587537091e-07, + "loss": 0.4497, + "mean_token_accuracy": 0.8519208431243896, + "num_tokens": 62364467.0, + "step": 1632 + }, + { + "epoch": 0.207734384938303, + "ewc_loss": 0.0037627166602760553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7627167330356315e-05, + "grad_norm": 2.8669490814208984, + "learning_rate": 6.918185671894871e-07, + "loss": 0.5245, + "mean_token_accuracy": 0.829012393951416, + "num_tokens": 62402075.0, + "step": 1633 + }, + { + "epoch": 0.2078615952168935, + "ewc_loss": 0.003812442533671856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8124424463603646e-05, + "grad_norm": 2.892643690109253, + "learning_rate": 6.922424756252649e-07, + "loss": 0.4929, + "mean_token_accuracy": 0.8459091186523438, + "num_tokens": 62437802.0, + "step": 1634 + }, + { + "epoch": 0.20798880549548404, + "ewc_loss": 0.0038106597494333982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8106598367448896e-05, + "grad_norm": 2.7806761264801025, + "learning_rate": 6.926663840610428e-07, + "loss": 0.4112, + "mean_token_accuracy": 0.8633135557174683, + "num_tokens": 62476176.0, + "step": 1635 + }, + { + "epoch": 0.20811601577407454, + "ewc_loss": 0.0037609112914651632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.760911204153672e-05, + "grad_norm": 2.831864356994629, 
+ "learning_rate": 6.930902924968206e-07, + "loss": 0.4547, + "mean_token_accuracy": 0.847022294998169, + "num_tokens": 62514378.0, + "step": 1636 + }, + { + "epoch": 0.20824322605266504, + "ewc_loss": 0.0038069901056587696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.806990207522176e-05, + "grad_norm": 2.8648416996002197, + "learning_rate": 6.935142009325985e-07, + "loss": 0.4759, + "mean_token_accuracy": 0.8436460494995117, + "num_tokens": 62555302.0, + "step": 1637 + }, + { + "epoch": 0.20837043633125557, + "ewc_loss": 0.003810175694525242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.810175621765666e-05, + "grad_norm": 2.8256311416625977, + "learning_rate": 6.939381093683764e-07, + "loss": 0.494, + "mean_token_accuracy": 0.8392285108566284, + "num_tokens": 62594356.0, + "step": 1638 + }, + { + "epoch": 0.20849764660984607, + "ewc_loss": 0.0037896959111094475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7896959838690236e-05, + "grad_norm": 2.790229558944702, + "learning_rate": 6.943620178041543e-07, + "loss": 0.4129, + "mean_token_accuracy": 0.8647676706314087, + "num_tokens": 62632903.0, + "step": 1639 + }, + { + "epoch": 0.2086248568884366, + "ewc_loss": 0.003773305332288146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7733054341515526e-05, + "grad_norm": 2.876241683959961, + "learning_rate": 6.947859262399321e-07, + "loss": 0.4622, + "mean_token_accuracy": 0.8550119400024414, + "num_tokens": 62668979.0, + "step": 1640 + }, + { + "epoch": 0.2087520671670271, + "ewc_loss": 0.00382938701659441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8293870602501556e-05, + "grad_norm": 2.8724441528320312, + "learning_rate": 6.952098346757101e-07, + "loss": 0.4487, + "mean_token_accuracy": 0.8515326976776123, + "num_tokens": 62705867.0, + "step": 1641 + }, + { + "epoch": 0.2088792774456176, + "ewc_loss": 0.0038118697702884674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.811869828496128e-05, + "grad_norm": 2.84088134765625, + "learning_rate": 
6.956337431114879e-07, + "loss": 0.4429, + "mean_token_accuracy": 0.855402410030365, + "num_tokens": 62744559.0, + "step": 1642 + }, + { + "epoch": 0.20900648772420813, + "ewc_loss": 0.0037951332051306963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.795133306994103e-05, + "grad_norm": 2.8177902698516846, + "learning_rate": 6.960576515472658e-07, + "loss": 0.4754, + "mean_token_accuracy": 0.8450357913970947, + "num_tokens": 62785221.0, + "step": 1643 + }, + { + "epoch": 0.20913369800279863, + "ewc_loss": 0.0038014629390090704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.801463026320562e-05, + "grad_norm": 2.896042585372925, + "learning_rate": 6.964815599830436e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8480542898178101, + "num_tokens": 62820095.0, + "step": 1644 + }, + { + "epoch": 0.20926090828138913, + "ewc_loss": 0.003837775904685259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8377758755814284e-05, + "grad_norm": 2.8299055099487305, + "learning_rate": 6.969054684188215e-07, + "loss": 0.5142, + "mean_token_accuracy": 0.829332709312439, + "num_tokens": 62863414.0, + "step": 1645 + }, + { + "epoch": 0.20938811855997966, + "ewc_loss": 0.0038016955368220806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.801695493166335e-05, + "grad_norm": 2.8096163272857666, + "learning_rate": 6.973293768545994e-07, + "loss": 0.4257, + "mean_token_accuracy": 0.8595168590545654, + "num_tokens": 62904647.0, + "step": 1646 + }, + { + "epoch": 0.20951532883857016, + "ewc_loss": 0.003804815700277686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8048157875891775e-05, + "grad_norm": 2.747795343399048, + "learning_rate": 6.977532852903773e-07, + "loss": 0.4586, + "mean_token_accuracy": 0.8479932546615601, + "num_tokens": 62949476.0, + "step": 1647 + }, + { + "epoch": 0.20964253911716066, + "ewc_loss": 0.0037928966339677572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.792896677623503e-05, + "grad_norm": 2.910889148712158, + "learning_rate": 
6.981771937261551e-07, + "loss": 0.5087, + "mean_token_accuracy": 0.8352086544036865, + "num_tokens": 62984784.0, + "step": 1648 + }, + { + "epoch": 0.2097697493957512, + "ewc_loss": 0.0038708036299794912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.870803629979491e-05, + "grad_norm": 2.8073809146881104, + "learning_rate": 6.986011021619331e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.8460830450057983, + "num_tokens": 63024395.0, + "step": 1649 + }, + { + "epoch": 0.2098969596743417, + "ewc_loss": 0.003810514695942402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8105146813904867e-05, + "grad_norm": 2.8786351680755615, + "learning_rate": 6.990250105977109e-07, + "loss": 0.4814, + "mean_token_accuracy": 0.8442769050598145, + "num_tokens": 63061886.0, + "step": 1650 + }, + { + "epoch": 0.2100241699529322, + "ewc_loss": 0.0038571306504309177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.857130650430918e-05, + "grad_norm": 2.8030643463134766, + "learning_rate": 6.994489190334886e-07, + "loss": 0.4482, + "mean_token_accuracy": 0.8505963087081909, + "num_tokens": 63104150.0, + "step": 1651 + }, + { + "epoch": 0.21015138023152272, + "ewc_loss": 0.003816103097051382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.81610298063606e-05, + "grad_norm": 2.7524466514587402, + "learning_rate": 6.998728274692666e-07, + "loss": 0.4143, + "mean_token_accuracy": 0.8621180057525635, + "num_tokens": 63144346.0, + "step": 1652 + }, + { + "epoch": 0.21027859051011322, + "ewc_loss": 0.0038195729721337557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.819572884822264e-05, + "grad_norm": 2.893354892730713, + "learning_rate": 7.002967359050444e-07, + "loss": 0.4545, + "mean_token_accuracy": 0.8561534881591797, + "num_tokens": 63180042.0, + "step": 1653 + }, + { + "epoch": 0.21040580078870372, + "ewc_loss": 0.0038835955783724785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.883595491060987e-05, + "grad_norm": 2.81748366355896, + "learning_rate": 
7.007206443408224e-07, + "loss": 0.4055, + "mean_token_accuracy": 0.8643299341201782, + "num_tokens": 63216570.0, + "step": 1654 + }, + { + "epoch": 0.21053301106729425, + "ewc_loss": 0.0038203289732337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.820328856818378e-05, + "grad_norm": 2.830120801925659, + "learning_rate": 7.011445527766002e-07, + "loss": 0.457, + "mean_token_accuracy": 0.8488441705703735, + "num_tokens": 63253457.0, + "step": 1655 + }, + { + "epoch": 0.21066022134588475, + "ewc_loss": 0.003842439502477646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.842439400614239e-05, + "grad_norm": 2.7838802337646484, + "learning_rate": 7.015684612123781e-07, + "loss": 0.443, + "mean_token_accuracy": 0.8522419929504395, + "num_tokens": 63295430.0, + "step": 1656 + }, + { + "epoch": 0.21078743162447525, + "ewc_loss": 0.003837413154542446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8374131690943614e-05, + "grad_norm": 2.837578058242798, + "learning_rate": 7.01992369648156e-07, + "loss": 0.4648, + "mean_token_accuracy": 0.8488112688064575, + "num_tokens": 63340118.0, + "step": 1657 + }, + { + "epoch": 0.21091464190306577, + "ewc_loss": 0.003856141120195389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.856141120195389e-05, + "grad_norm": 2.9833271503448486, + "learning_rate": 7.024162780839339e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.8304646015167236, + "num_tokens": 63372037.0, + "step": 1658 + }, + { + "epoch": 0.21104185218165628, + "ewc_loss": 0.003914572764188051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.914572880603373e-05, + "grad_norm": 2.80997896194458, + "learning_rate": 7.028401865197116e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8398298621177673, + "num_tokens": 63415508.0, + "step": 1659 + }, + { + "epoch": 0.21116906246024678, + "ewc_loss": 0.0038212849758565426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.821284917648882e-05, + "grad_norm": 2.7612509727478027, + "learning_rate": 7.032640949554896e-07, + 
"loss": 0.3973, + "mean_token_accuracy": 0.8637509346008301, + "num_tokens": 63459320.0, + "step": 1660 + }, + { + "epoch": 0.2112962727388373, + "ewc_loss": 0.0038222861476242542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.822286089416593e-05, + "grad_norm": 2.869405746459961, + "learning_rate": 7.036880033912674e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8396762013435364, + "num_tokens": 63499624.0, + "step": 1661 + }, + { + "epoch": 0.2114234830174278, + "ewc_loss": 0.0038907513953745365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8907513953745365e-05, + "grad_norm": 2.845991611480713, + "learning_rate": 7.041119118270454e-07, + "loss": 0.4495, + "mean_token_accuracy": 0.8500905632972717, + "num_tokens": 63536507.0, + "step": 1662 + }, + { + "epoch": 0.2115506932960183, + "ewc_loss": 0.003858644049614668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.858644049614668e-05, + "grad_norm": 2.8729143142700195, + "learning_rate": 7.045358202628232e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.8416796326637268, + "num_tokens": 63576483.0, + "step": 1663 + }, + { + "epoch": 0.21167790357460883, + "ewc_loss": 0.003873582696542144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8735826819902286e-05, + "grad_norm": 2.840501308441162, + "learning_rate": 7.049597286986011e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.8402286767959595, + "num_tokens": 63615568.0, + "step": 1664 + }, + { + "epoch": 0.21180511385319933, + "ewc_loss": 0.0038565141148865223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.856514013023116e-05, + "grad_norm": 2.8749654293060303, + "learning_rate": 7.05383637134379e-07, + "loss": 0.4417, + "mean_token_accuracy": 0.8542664647102356, + "num_tokens": 63650023.0, + "step": 1665 + }, + { + "epoch": 0.21193232413178986, + "ewc_loss": 0.0038718467112630606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8718466385034844e-05, + "grad_norm": 2.8438587188720703, + "learning_rate": 7.058075455701568e-07, + "loss": 0.4732, + 
"mean_token_accuracy": 0.8395144939422607, + "num_tokens": 63690229.0, + "step": 1666 + }, + { + "epoch": 0.21205953441038036, + "ewc_loss": 0.0038573232013732195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.857323099509813e-05, + "grad_norm": 2.8184845447540283, + "learning_rate": 7.062314540059346e-07, + "loss": 0.4184, + "mean_token_accuracy": 0.864544153213501, + "num_tokens": 63728169.0, + "step": 1667 + }, + { + "epoch": 0.21218674468897086, + "ewc_loss": 0.003850943874567747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8509439036715776e-05, + "grad_norm": 2.8940937519073486, + "learning_rate": 7.066553624417126e-07, + "loss": 0.4412, + "mean_token_accuracy": 0.852378785610199, + "num_tokens": 63762816.0, + "step": 1668 + }, + { + "epoch": 0.2123139549675614, + "ewc_loss": 0.0038917737547308207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.891773667419329e-05, + "grad_norm": 2.8399181365966797, + "learning_rate": 7.070792708774904e-07, + "loss": 0.4633, + "mean_token_accuracy": 0.8472385406494141, + "num_tokens": 63804288.0, + "step": 1669 + }, + { + "epoch": 0.2124411652461519, + "ewc_loss": 0.003853430738672614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8534308259841055e-05, + "grad_norm": 2.75482439994812, + "learning_rate": 7.075031793132684e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.8446781635284424, + "num_tokens": 63851365.0, + "step": 1670 + }, + { + "epoch": 0.2125683755247424, + "ewc_loss": 0.0038350180257111788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.835017923847772e-05, + "grad_norm": 2.907454490661621, + "learning_rate": 7.079270877490462e-07, + "loss": 0.4463, + "mean_token_accuracy": 0.8545204401016235, + "num_tokens": 63886187.0, + "step": 1671 + }, + { + "epoch": 0.21269558580333292, + "ewc_loss": 0.003909214865416288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9092148654162884e-05, + "grad_norm": 2.866839647293091, + "learning_rate": 7.08350996184824e-07, + "loss": 0.5188, + "mean_token_accuracy": 
0.8310661315917969, + "num_tokens": 63927187.0, + "step": 1672 + }, + { + "epoch": 0.21282279608192342, + "ewc_loss": 0.0038629984483122826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.862998346448876e-05, + "grad_norm": 2.875960111618042, + "learning_rate": 7.08774904620602e-07, + "loss": 0.4404, + "mean_token_accuracy": 0.8572103381156921, + "num_tokens": 63964088.0, + "step": 1673 + }, + { + "epoch": 0.21295000636051392, + "ewc_loss": 0.0038740956224501133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8740956370020285e-05, + "grad_norm": 2.8605895042419434, + "learning_rate": 7.091988130563797e-07, + "loss": 0.4831, + "mean_token_accuracy": 0.841131329536438, + "num_tokens": 64004931.0, + "step": 1674 + }, + { + "epoch": 0.21307721663910445, + "ewc_loss": 0.0038803918287158012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.880391886923462e-05, + "grad_norm": 2.839844226837158, + "learning_rate": 7.096227214921576e-07, + "loss": 0.4776, + "mean_token_accuracy": 0.8420664072036743, + "num_tokens": 64043966.0, + "step": 1675 + }, + { + "epoch": 0.21320442691769495, + "ewc_loss": 0.0038645772729068995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.864577229251154e-05, + "grad_norm": 2.854274272918701, + "learning_rate": 7.100466299279355e-07, + "loss": 0.4613, + "mean_token_accuracy": 0.8499404788017273, + "num_tokens": 64078886.0, + "step": 1676 + }, + { + "epoch": 0.21333163719628545, + "ewc_loss": 0.0038858132902532816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.88581320294179e-05, + "grad_norm": 2.8806934356689453, + "learning_rate": 7.104705383637134e-07, + "loss": 0.4833, + "mean_token_accuracy": 0.8463665246963501, + "num_tokens": 64117291.0, + "step": 1677 + }, + { + "epoch": 0.21345884747487598, + "ewc_loss": 0.0038979172240942717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.897917122230865e-05, + "grad_norm": 2.8164565563201904, + "learning_rate": 7.108944467994913e-07, + "loss": 0.4432, + "mean_token_accuracy": 
0.8530411124229431, + "num_tokens": 64156369.0, + "step": 1678 + }, + { + "epoch": 0.21358605775346648, + "ewc_loss": 0.003864758647978306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8647587643936276e-05, + "grad_norm": 2.941047191619873, + "learning_rate": 7.113183552352692e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.8286195993423462, + "num_tokens": 64193646.0, + "step": 1679 + }, + { + "epoch": 0.21371326803205698, + "ewc_loss": 0.003931865096092224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9318649214692414e-05, + "grad_norm": 2.801640033721924, + "learning_rate": 7.11742263671047e-07, + "loss": 0.4093, + "mean_token_accuracy": 0.8631559014320374, + "num_tokens": 64231933.0, + "step": 1680 + }, + { + "epoch": 0.2138404783106475, + "ewc_loss": 0.0038600494153797626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8600494008278474e-05, + "grad_norm": 2.9775631427764893, + "learning_rate": 7.12166172106825e-07, + "loss": 0.5388, + "mean_token_accuracy": 0.8232932090759277, + "num_tokens": 64267418.0, + "step": 1681 + }, + { + "epoch": 0.213967688589238, + "ewc_loss": 0.0039550564251840115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9550563087686896e-05, + "grad_norm": 2.8488357067108154, + "learning_rate": 7.125900805426027e-07, + "loss": 0.4219, + "mean_token_accuracy": 0.8592504262924194, + "num_tokens": 64302440.0, + "step": 1682 + }, + { + "epoch": 0.2140948988678285, + "ewc_loss": 0.0038882663939148188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.888266292051412e-05, + "grad_norm": 2.8809657096862793, + "learning_rate": 7.130139889783806e-07, + "loss": 0.4584, + "mean_token_accuracy": 0.849420964717865, + "num_tokens": 64338240.0, + "step": 1683 + }, + { + "epoch": 0.21422210914641904, + "ewc_loss": 0.003918520174920559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9185200876090676e-05, + "grad_norm": 2.8684844970703125, + "learning_rate": 7.134378974141585e-07, + "loss": 0.4507, + "mean_token_accuracy": 0.8499874472618103, 
+ "num_tokens": 64375734.0, + "step": 1684 + }, + { + "epoch": 0.21434931942500954, + "ewc_loss": 0.00391762051731348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.917620415450074e-05, + "grad_norm": 2.9822566509246826, + "learning_rate": 7.138618058499364e-07, + "loss": 0.445, + "mean_token_accuracy": 0.8533477187156677, + "num_tokens": 64411401.0, + "step": 1685 + }, + { + "epoch": 0.21447652970360004, + "ewc_loss": 0.003972597420215607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.972597551182844e-05, + "grad_norm": 2.8444831371307373, + "learning_rate": 7.142857142857143e-07, + "loss": 0.4196, + "mean_token_accuracy": 0.8610152006149292, + "num_tokens": 64454719.0, + "step": 1686 + }, + { + "epoch": 0.21460373998219057, + "ewc_loss": 0.003905062796548009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9050628402037546e-05, + "grad_norm": 2.8582065105438232, + "learning_rate": 7.147096227214922e-07, + "loss": 0.445, + "mean_token_accuracy": 0.8549978733062744, + "num_tokens": 64495736.0, + "step": 1687 + }, + { + "epoch": 0.21473095026078107, + "ewc_loss": 0.00393517455086112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9351747545879334e-05, + "grad_norm": 2.873913526535034, + "learning_rate": 7.1513353115727e-07, + "loss": 0.4248, + "mean_token_accuracy": 0.8582130074501038, + "num_tokens": 64532843.0, + "step": 1688 + }, + { + "epoch": 0.21485816053937157, + "ewc_loss": 0.003944380208849907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.944380296161398e-05, + "grad_norm": 2.8739097118377686, + "learning_rate": 7.155574395930479e-07, + "loss": 0.4692, + "mean_token_accuracy": 0.8426101207733154, + "num_tokens": 64568950.0, + "step": 1689 + }, + { + "epoch": 0.2149853708179621, + "ewc_loss": 0.003940986469388008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9409864257322624e-05, + "grad_norm": 2.8664562702178955, + "learning_rate": 7.159813480288257e-07, + "loss": 0.4478, + "mean_token_accuracy": 0.8497192859649658, + "num_tokens": 
64606878.0, + "step": 1690 + }, + { + "epoch": 0.2151125810965526, + "ewc_loss": 0.003942285198718309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.942285184166394e-05, + "grad_norm": 2.902604579925537, + "learning_rate": 7.164052564646035e-07, + "loss": 0.4168, + "mean_token_accuracy": 0.8607449531555176, + "num_tokens": 64643012.0, + "step": 1691 + }, + { + "epoch": 0.21523979137514312, + "ewc_loss": 0.003964252769947052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.964252755395137e-05, + "grad_norm": 2.823897123336792, + "learning_rate": 7.168291649003815e-07, + "loss": 0.4379, + "mean_token_accuracy": 0.8579258918762207, + "num_tokens": 64684444.0, + "step": 1692 + }, + { + "epoch": 0.21536700165373363, + "ewc_loss": 0.003932296298444271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9322963857557625e-05, + "grad_norm": 2.8741159439086914, + "learning_rate": 7.172530733361593e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.8434642553329468, + "num_tokens": 64722068.0, + "step": 1693 + }, + { + "epoch": 0.21549421193232413, + "ewc_loss": 0.003955174703150988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.95517454307992e-05, + "grad_norm": 2.81611704826355, + "learning_rate": 7.176769817719373e-07, + "loss": 0.3965, + "mean_token_accuracy": 0.8731588125228882, + "num_tokens": 64761829.0, + "step": 1694 + }, + { + "epoch": 0.21562142221091465, + "ewc_loss": 0.003933992236852646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9339924114756286e-05, + "grad_norm": 2.8709616661071777, + "learning_rate": 7.181008902077151e-07, + "loss": 0.4465, + "mean_token_accuracy": 0.8532216548919678, + "num_tokens": 64799185.0, + "step": 1695 + }, + { + "epoch": 0.21574863248950515, + "ewc_loss": 0.003958495799452066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.958495653932914e-05, + "grad_norm": 2.9152729511260986, + "learning_rate": 7.18524798643493e-07, + "loss": 0.4724, + "mean_token_accuracy": 0.8466026186943054, + "num_tokens": 64839306.0, + "step": 
1696 + }, + { + "epoch": 0.21587584276809565, + "ewc_loss": 0.003972672391682863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.97267249354627e-05, + "grad_norm": 2.883661985397339, + "learning_rate": 7.189487070792708e-07, + "loss": 0.4505, + "mean_token_accuracy": 0.8534218072891235, + "num_tokens": 64876011.0, + "step": 1697 + }, + { + "epoch": 0.21600305304668618, + "ewc_loss": 0.003952380735427141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9523809391539544e-05, + "grad_norm": 2.7976338863372803, + "learning_rate": 7.193726155150487e-07, + "loss": 0.4487, + "mean_token_accuracy": 0.8547486662864685, + "num_tokens": 64918510.0, + "step": 1698 + }, + { + "epoch": 0.21613026332527668, + "ewc_loss": 0.003926094621419907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.926094723283313e-05, + "grad_norm": 2.880192995071411, + "learning_rate": 7.197965239508265e-07, + "loss": 0.454, + "mean_token_accuracy": 0.8506190776824951, + "num_tokens": 64957156.0, + "step": 1699 + }, + { + "epoch": 0.21625747360386718, + "ewc_loss": 0.0039675417356193066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9675418520346284e-05, + "grad_norm": 2.8226027488708496, + "learning_rate": 7.202204323866045e-07, + "loss": 0.4523, + "mean_token_accuracy": 0.8526307344436646, + "num_tokens": 64998761.0, + "step": 1700 + }, + { + "epoch": 0.2163846838824577, + "ewc_loss": 0.003927989397197962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9279893826460466e-05, + "grad_norm": 2.856001853942871, + "learning_rate": 7.206443408223823e-07, + "loss": 0.4553, + "mean_token_accuracy": 0.8482931852340698, + "num_tokens": 65038849.0, + "step": 1701 + }, + { + "epoch": 0.2165118941610482, + "ewc_loss": 0.003945446107536554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.945446223951876e-05, + "grad_norm": 2.9154305458068848, + "learning_rate": 7.210682492581603e-07, + "loss": 0.4351, + "mean_token_accuracy": 0.8536106944084167, + "num_tokens": 65071130.0, + "step": 1702 + }, + { + 
"epoch": 0.2166391044396387, + "ewc_loss": 0.00396865326911211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.968653254560195e-05, + "grad_norm": 2.8286116123199463, + "learning_rate": 7.214921576939381e-07, + "loss": 0.4762, + "mean_token_accuracy": 0.8404980897903442, + "num_tokens": 65111619.0, + "step": 1703 + }, + { + "epoch": 0.21676631471822924, + "ewc_loss": 0.003924875054508448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.924875272787176e-05, + "grad_norm": 2.8827154636383057, + "learning_rate": 7.219160661297159e-07, + "loss": 0.4242, + "mean_token_accuracy": 0.857033371925354, + "num_tokens": 65147537.0, + "step": 1704 + }, + { + "epoch": 0.21689352499681974, + "ewc_loss": 0.003958908375352621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.958908200729638e-05, + "grad_norm": 2.9165732860565186, + "learning_rate": 7.223399745654938e-07, + "loss": 0.4626, + "mean_token_accuracy": 0.8481716513633728, + "num_tokens": 65187254.0, + "step": 1705 + }, + { + "epoch": 0.21702073527541024, + "ewc_loss": 0.0039587607607245445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.958760862587951e-05, + "grad_norm": 2.888732433319092, + "learning_rate": 7.227638830012717e-07, + "loss": 0.4737, + "mean_token_accuracy": 0.841379702091217, + "num_tokens": 65226130.0, + "step": 1706 + }, + { + "epoch": 0.21714794555400077, + "ewc_loss": 0.003942968789488077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.942968760384247e-05, + "grad_norm": 2.9500954151153564, + "learning_rate": 7.231877914370495e-07, + "loss": 0.4351, + "mean_token_accuracy": 0.8568947911262512, + "num_tokens": 65261282.0, + "step": 1707 + }, + { + "epoch": 0.21727515583259127, + "ewc_loss": 0.003986389376223087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.986389492638409e-05, + "grad_norm": 2.9358153343200684, + "learning_rate": 7.236116998728275e-07, + "loss": 0.4877, + "mean_token_accuracy": 0.8401767015457153, + "num_tokens": 65294850.0, + "step": 1708 + }, + { + "epoch": 
0.21740236611118177, + "ewc_loss": 0.003971281927078962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9712820580461994e-05, + "grad_norm": 2.8226118087768555, + "learning_rate": 7.240356083086053e-07, + "loss": 0.382, + "mean_token_accuracy": 0.8722143769264221, + "num_tokens": 65332323.0, + "step": 1709 + }, + { + "epoch": 0.2175295763897723, + "ewc_loss": 0.003929679282009602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.929679223801941e-05, + "grad_norm": 2.8200955390930176, + "learning_rate": 7.244595167443833e-07, + "loss": 0.4735, + "mean_token_accuracy": 0.84517502784729, + "num_tokens": 65376033.0, + "step": 1710 + }, + { + "epoch": 0.2176567866683628, + "ewc_loss": 0.003951895050704479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9518952689832076e-05, + "grad_norm": 2.8206443786621094, + "learning_rate": 7.248834251801611e-07, + "loss": 0.4404, + "mean_token_accuracy": 0.8560343384742737, + "num_tokens": 65417875.0, + "step": 1711 + }, + { + "epoch": 0.2177839969469533, + "ewc_loss": 0.003964395262300968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.964395364164375e-05, + "grad_norm": 2.850398302078247, + "learning_rate": 7.253073336159388e-07, + "loss": 0.418, + "mean_token_accuracy": 0.8610669374465942, + "num_tokens": 65456675.0, + "step": 1712 + }, + { + "epoch": 0.21791120722554383, + "ewc_loss": 0.003970439080148935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.970439138356596e-05, + "grad_norm": 2.928280830383301, + "learning_rate": 7.257312420517168e-07, + "loss": 0.4079, + "mean_token_accuracy": 0.8640474081039429, + "num_tokens": 65487699.0, + "step": 1713 + }, + { + "epoch": 0.21803841750413433, + "ewc_loss": 0.003997492603957653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.997492603957653e-05, + "grad_norm": 2.883333683013916, + "learning_rate": 7.261551504874946e-07, + "loss": 0.4356, + "mean_token_accuracy": 0.8517802357673645, + "num_tokens": 65526143.0, + "step": 1714 + }, + { + "epoch": 0.21816562778272486, + 
"ewc_loss": 0.003975568804889917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9755686884745955e-05, + "grad_norm": 2.8589534759521484, + "learning_rate": 7.265790589232725e-07, + "loss": 0.4334, + "mean_token_accuracy": 0.857984721660614, + "num_tokens": 65562969.0, + "step": 1715 + }, + { + "epoch": 0.21829283806131536, + "ewc_loss": 0.003976666834205389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9766669942764565e-05, + "grad_norm": 2.8382909297943115, + "learning_rate": 7.270029673590504e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.848432183265686, + "num_tokens": 65605999.0, + "step": 1716 + }, + { + "epoch": 0.21842004833990586, + "ewc_loss": 0.003970364574342966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.970364559791051e-05, + "grad_norm": 2.9429104328155518, + "learning_rate": 7.274268757948283e-07, + "loss": 0.51, + "mean_token_accuracy": 0.8345011472702026, + "num_tokens": 65644877.0, + "step": 1717 + }, + { + "epoch": 0.2185472586184964, + "ewc_loss": 0.004028817638754845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.028817784273997e-05, + "grad_norm": 2.913344144821167, + "learning_rate": 7.278507842306062e-07, + "loss": 0.4318, + "mean_token_accuracy": 0.8573217391967773, + "num_tokens": 65678823.0, + "step": 1718 + }, + { + "epoch": 0.2186744688970869, + "ewc_loss": 0.004005684982985258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.005684968433343e-05, + "grad_norm": 2.901624917984009, + "learning_rate": 7.282746926663841e-07, + "loss": 0.5101, + "mean_token_accuracy": 0.8351765275001526, + "num_tokens": 65717185.0, + "step": 1719 + }, + { + "epoch": 0.2188016791756774, + "ewc_loss": 0.004003731068223715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.003731010016054e-05, + "grad_norm": 2.8411285877227783, + "learning_rate": 7.286986011021618e-07, + "loss": 0.4303, + "mean_token_accuracy": 0.8574602603912354, + "num_tokens": 65754422.0, + "step": 1720 + }, + { + "epoch": 0.21892888945426792, + "ewc_loss": 
0.003982866648584604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9828664739616215e-05, + "grad_norm": 2.832127809524536, + "learning_rate": 7.291225095379398e-07, + "loss": 0.4674, + "mean_token_accuracy": 0.8487647771835327, + "num_tokens": 65799390.0, + "step": 1721 + }, + { + "epoch": 0.21905609973285842, + "ewc_loss": 0.003996793646365404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.996793748228811e-05, + "grad_norm": 2.897442579269409, + "learning_rate": 7.295464179737176e-07, + "loss": 0.4574, + "mean_token_accuracy": 0.8522840142250061, + "num_tokens": 65837566.0, + "step": 1722 + }, + { + "epoch": 0.21918331001144892, + "ewc_loss": 0.004014450125396252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.014449950773269e-05, + "grad_norm": 2.880206346511841, + "learning_rate": 7.299703264094955e-07, + "loss": 0.443, + "mean_token_accuracy": 0.8532048463821411, + "num_tokens": 65874537.0, + "step": 1723 + }, + { + "epoch": 0.21931052029003945, + "ewc_loss": 0.004006985574960709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.006985545856878e-05, + "grad_norm": 2.8656182289123535, + "learning_rate": 7.303942348452734e-07, + "loss": 0.4686, + "mean_token_accuracy": 0.8453948497772217, + "num_tokens": 65915616.0, + "step": 1724 + }, + { + "epoch": 0.21943773056862995, + "ewc_loss": 0.004012050107121468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.012050339952111e-05, + "grad_norm": 2.8468434810638428, + "learning_rate": 7.308181432810513e-07, + "loss": 0.4188, + "mean_token_accuracy": 0.8596143126487732, + "num_tokens": 65954263.0, + "step": 1725 + }, + { + "epoch": 0.21956494084722045, + "ewc_loss": 0.004004789050668478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.004788934253156e-05, + "grad_norm": 2.8869521617889404, + "learning_rate": 7.312420517168292e-07, + "loss": 0.4516, + "mean_token_accuracy": 0.848670482635498, + "num_tokens": 65992880.0, + "step": 1726 + }, + { + "epoch": 0.21969215112581097, + "ewc_loss": 0.004035142250359058, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.035142046632245e-05, + "grad_norm": 2.904280424118042, + "learning_rate": 7.31665960152607e-07, + "loss": 0.4703, + "mean_token_accuracy": 0.8430546522140503, + "num_tokens": 66033958.0, + "step": 1727 + }, + { + "epoch": 0.21981936140440148, + "ewc_loss": 0.004025349859148264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0253496990771964e-05, + "grad_norm": 2.9170637130737305, + "learning_rate": 7.320898685883848e-07, + "loss": 0.4411, + "mean_token_accuracy": 0.8531795144081116, + "num_tokens": 66072448.0, + "step": 1728 + }, + { + "epoch": 0.21994657168299198, + "ewc_loss": 0.00402738805860281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.02738805860281e-05, + "grad_norm": 2.92929744720459, + "learning_rate": 7.325137770241628e-07, + "loss": 0.5521, + "mean_token_accuracy": 0.8223456740379333, + "num_tokens": 66112648.0, + "step": 1729 + }, + { + "epoch": 0.2200737819615825, + "ewc_loss": 0.004037567880004644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.037567850900814e-05, + "grad_norm": 2.834709644317627, + "learning_rate": 7.329376854599406e-07, + "loss": 0.4634, + "mean_token_accuracy": 0.848706841468811, + "num_tokens": 66154791.0, + "step": 1730 + }, + { + "epoch": 0.220200992240173, + "ewc_loss": 0.003987327218055725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.987326999776997e-05, + "grad_norm": 2.8783257007598877, + "learning_rate": 7.333615938957184e-07, + "loss": 0.4485, + "mean_token_accuracy": 0.8511673808097839, + "num_tokens": 66192450.0, + "step": 1731 + }, + { + "epoch": 0.2203282025187635, + "ewc_loss": 0.004034041427075863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.034041194245219e-05, + "grad_norm": 2.8809380531311035, + "learning_rate": 7.337855023314964e-07, + "loss": 0.4464, + "mean_token_accuracy": 0.8512054681777954, + "num_tokens": 66231173.0, + "step": 1732 + }, + { + "epoch": 0.22045541279735403, + "ewc_loss": 0.004027289338409901, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 4.027289469377138e-05, + "grad_norm": 2.8959109783172607, + "learning_rate": 7.342094107672742e-07, + "loss": 0.4998, + "mean_token_accuracy": 0.8357114791870117, + "num_tokens": 66272956.0, + "step": 1733 + }, + { + "epoch": 0.22058262307594453, + "ewc_loss": 0.00403626961633563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.036269456264563e-05, + "grad_norm": 2.9423317909240723, + "learning_rate": 7.346333192030522e-07, + "loss": 0.4537, + "mean_token_accuracy": 0.8548440337181091, + "num_tokens": 66309450.0, + "step": 1734 + }, + { + "epoch": 0.22070983335453503, + "ewc_loss": 0.004059437662363052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.059437560499646e-05, + "grad_norm": 2.919264078140259, + "learning_rate": 7.350572276388299e-07, + "loss": 0.5025, + "mean_token_accuracy": 0.834864616394043, + "num_tokens": 66350947.0, + "step": 1735 + }, + { + "epoch": 0.22083704363312556, + "ewc_loss": 0.0040437025018036366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.043702574563213e-05, + "grad_norm": 2.9194960594177246, + "learning_rate": 7.354811360746078e-07, + "loss": 0.4608, + "mean_token_accuracy": 0.8465800285339355, + "num_tokens": 66388364.0, + "step": 1736 + }, + { + "epoch": 0.22096425391171606, + "ewc_loss": 0.004046924877911806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0469250961905345e-05, + "grad_norm": 2.908874750137329, + "learning_rate": 7.359050445103857e-07, + "loss": 0.4756, + "mean_token_accuracy": 0.8423044681549072, + "num_tokens": 66425029.0, + "step": 1737 + }, + { + "epoch": 0.22109146419030656, + "ewc_loss": 0.004044305998831987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.044306115247309e-05, + "grad_norm": 2.878891706466675, + "learning_rate": 7.363289529461636e-07, + "loss": 0.4513, + "mean_token_accuracy": 0.8521472215652466, + "num_tokens": 66463924.0, + "step": 1738 + }, + { + "epoch": 0.2212186744688971, + "ewc_loss": 0.00403341930359602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
4.033419463667087e-05, + "grad_norm": 2.871363878250122, + "learning_rate": 7.367528613819415e-07, + "loss": 0.4064, + "mean_token_accuracy": 0.8644102811813354, + "num_tokens": 66504824.0, + "step": 1739 + }, + { + "epoch": 0.2213458847474876, + "ewc_loss": 0.004040294326841831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.040294516016729e-05, + "grad_norm": 2.9259233474731445, + "learning_rate": 7.371767698177194e-07, + "loss": 0.4268, + "mean_token_accuracy": 0.8572056889533997, + "num_tokens": 66539308.0, + "step": 1740 + }, + { + "epoch": 0.22147309502607812, + "ewc_loss": 0.00407009432092309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0700942918192595e-05, + "grad_norm": 2.9397435188293457, + "learning_rate": 7.376006782534972e-07, + "loss": 0.4212, + "mean_token_accuracy": 0.86085045337677, + "num_tokens": 66572785.0, + "step": 1741 + }, + { + "epoch": 0.22160030530466862, + "ewc_loss": 0.00406480161473155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0648017602507025e-05, + "grad_norm": 2.9434597492218018, + "learning_rate": 7.380245866892751e-07, + "loss": 0.439, + "mean_token_accuracy": 0.8538400530815125, + "num_tokens": 66607890.0, + "step": 1742 + }, + { + "epoch": 0.22172751558325912, + "ewc_loss": 0.0040710619650781155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.071061994181946e-05, + "grad_norm": 2.838221549987793, + "learning_rate": 7.384484951250529e-07, + "loss": 0.4405, + "mean_token_accuracy": 0.856767475605011, + "num_tokens": 66650553.0, + "step": 1743 + }, + { + "epoch": 0.22185472586184965, + "ewc_loss": 0.004033057950437069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0330578485736623e-05, + "grad_norm": 2.9342846870422363, + "learning_rate": 7.388724035608308e-07, + "loss": 0.4374, + "mean_token_accuracy": 0.8569978475570679, + "num_tokens": 66685330.0, + "step": 1744 + }, + { + "epoch": 0.22198193614044015, + "ewc_loss": 0.004093253053724766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.093252937309444e-05, + 
"grad_norm": 2.9427924156188965, + "learning_rate": 7.392963119966087e-07, + "loss": 0.4207, + "mean_token_accuracy": 0.8595468997955322, + "num_tokens": 66716816.0, + "step": 1745 + }, + { + "epoch": 0.22210914641903065, + "ewc_loss": 0.00408385181799531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0838516724761575e-05, + "grad_norm": 2.926454782485962, + "learning_rate": 7.397202204323866e-07, + "loss": 0.4633, + "mean_token_accuracy": 0.8471636772155762, + "num_tokens": 66754030.0, + "step": 1746 + }, + { + "epoch": 0.22223635669762118, + "ewc_loss": 0.004074114840477705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.074114985996857e-05, + "grad_norm": 2.917055130004883, + "learning_rate": 7.401441288681645e-07, + "loss": 0.4228, + "mean_token_accuracy": 0.8602197766304016, + "num_tokens": 66793650.0, + "step": 1747 + }, + { + "epoch": 0.22236356697621168, + "ewc_loss": 0.004091030452400446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0910304960561916e-05, + "grad_norm": 2.839139938354492, + "learning_rate": 7.405680373039424e-07, + "loss": 0.4077, + "mean_token_accuracy": 0.867523193359375, + "num_tokens": 66833709.0, + "step": 1748 + }, + { + "epoch": 0.22249077725480218, + "ewc_loss": 0.004053209908306599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.053210068377666e-05, + "grad_norm": 2.985247850418091, + "learning_rate": 7.409919457397202e-07, + "loss": 0.4725, + "mean_token_accuracy": 0.8433130383491516, + "num_tokens": 66866456.0, + "step": 1749 + }, + { + "epoch": 0.2226179875333927, + "ewc_loss": 0.004133011680096388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.133011680096388e-05, + "grad_norm": 2.9034194946289062, + "learning_rate": 7.414158541754981e-07, + "loss": 0.5004, + "mean_token_accuracy": 0.8342932462692261, + "num_tokens": 66905875.0, + "step": 1750 + }, + { + "epoch": 0.2227451978119832, + "ewc_loss": 0.004071736708283424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.071736475452781e-05, + "grad_norm": 
3.096118927001953, + "learning_rate": 7.418397626112759e-07, + "loss": 0.4552, + "mean_token_accuracy": 0.8569157719612122, + "num_tokens": 66936009.0, + "step": 1751 + }, + { + "epoch": 0.2228724080905737, + "ewc_loss": 0.004173906520009041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1739065636647865e-05, + "grad_norm": 2.8877336978912354, + "learning_rate": 7.422636710470537e-07, + "loss": 0.4576, + "mean_token_accuracy": 0.8496835231781006, + "num_tokens": 66978111.0, + "step": 1752 + }, + { + "epoch": 0.22299961836916424, + "ewc_loss": 0.004070883151143789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.070883005624637e-05, + "grad_norm": 2.9510462284088135, + "learning_rate": 7.426875794828317e-07, + "loss": 0.4691, + "mean_token_accuracy": 0.8487292528152466, + "num_tokens": 67015938.0, + "step": 1753 + }, + { + "epoch": 0.22312682864775474, + "ewc_loss": 0.004113931208848953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.113931208848953e-05, + "grad_norm": 2.8846724033355713, + "learning_rate": 7.431114879186095e-07, + "loss": 0.4925, + "mean_token_accuracy": 0.8395072221755981, + "num_tokens": 67055456.0, + "step": 1754 + }, + { + "epoch": 0.22325403892634524, + "ewc_loss": 0.00409501139074564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0950111724669114e-05, + "grad_norm": 2.9446117877960205, + "learning_rate": 7.435353963543875e-07, + "loss": 0.4513, + "mean_token_accuracy": 0.8504273891448975, + "num_tokens": 67096298.0, + "step": 1755 + }, + { + "epoch": 0.22338124920493577, + "ewc_loss": 0.004132403992116451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1324041376356035e-05, + "grad_norm": 2.891402244567871, + "learning_rate": 7.439593047901653e-07, + "loss": 0.4079, + "mean_token_accuracy": 0.8639705181121826, + "num_tokens": 67135280.0, + "step": 1756 + }, + { + "epoch": 0.22350845948352627, + "ewc_loss": 0.004099811892956495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0998118493007496e-05, + "grad_norm": 2.9270687103271484, 
+ "learning_rate": 7.443832132259431e-07, + "loss": 0.4711, + "mean_token_accuracy": 0.8431713581085205, + "num_tokens": 67171936.0, + "step": 1757 + }, + { + "epoch": 0.22363566976211677, + "ewc_loss": 0.0041217077523469925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1217077523469925e-05, + "grad_norm": 2.943946361541748, + "learning_rate": 7.44807121661721e-07, + "loss": 0.4569, + "mean_token_accuracy": 0.8481761813163757, + "num_tokens": 67209854.0, + "step": 1758 + }, + { + "epoch": 0.2237628800407073, + "ewc_loss": 0.00413758447393775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.137584619456902e-05, + "grad_norm": 2.9393680095672607, + "learning_rate": 7.452310300974989e-07, + "loss": 0.4148, + "mean_token_accuracy": 0.8580160140991211, + "num_tokens": 67244369.0, + "step": 1759 + }, + { + "epoch": 0.2238900903192978, + "ewc_loss": 0.004125332925468683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1253329982282594e-05, + "grad_norm": 2.8777549266815186, + "learning_rate": 7.456549385332767e-07, + "loss": 0.4919, + "mean_token_accuracy": 0.8394014835357666, + "num_tokens": 67283994.0, + "step": 1760 + }, + { + "epoch": 0.2240173005978883, + "ewc_loss": 0.004106338135898113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1063380194827914e-05, + "grad_norm": 2.839759111404419, + "learning_rate": 7.460788469690547e-07, + "loss": 0.3784, + "mean_token_accuracy": 0.869986355304718, + "num_tokens": 67323230.0, + "step": 1761 + }, + { + "epoch": 0.22414451087647883, + "ewc_loss": 0.004102170933037996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1021710785571486e-05, + "grad_norm": 2.8863725662231445, + "learning_rate": 7.465027554048325e-07, + "loss": 0.5287, + "mean_token_accuracy": 0.8299508094787598, + "num_tokens": 67366198.0, + "step": 1762 + }, + { + "epoch": 0.22427172115506933, + "ewc_loss": 0.004131015855818987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1310158849228173e-05, + "grad_norm": 2.8961732387542725, + "learning_rate": 
7.469266638406105e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.834324836730957, + "num_tokens": 67410685.0, + "step": 1763 + }, + { + "epoch": 0.22439893143365983, + "ewc_loss": 0.004123153630644083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1231538489228114e-05, + "grad_norm": 2.926325798034668, + "learning_rate": 7.473505722763883e-07, + "loss": 0.4371, + "mean_token_accuracy": 0.8562997579574585, + "num_tokens": 67445818.0, + "step": 1764 + }, + { + "epoch": 0.22452614171225035, + "ewc_loss": 0.004141820594668388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.14182068197988e-05, + "grad_norm": 2.9114577770233154, + "learning_rate": 7.477744807121661e-07, + "loss": 0.4789, + "mean_token_accuracy": 0.8432471752166748, + "num_tokens": 67483873.0, + "step": 1765 + }, + { + "epoch": 0.22465335199084085, + "ewc_loss": 0.004128989763557911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.128989530727267e-05, + "grad_norm": 2.9648430347442627, + "learning_rate": 7.48198389147944e-07, + "loss": 0.4561, + "mean_token_accuracy": 0.8525389432907104, + "num_tokens": 67519524.0, + "step": 1766 + }, + { + "epoch": 0.22478056226943138, + "ewc_loss": 0.004163190256804228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1631901694927365e-05, + "grad_norm": 2.8945775032043457, + "learning_rate": 7.486222975837219e-07, + "loss": 0.4412, + "mean_token_accuracy": 0.8521919250488281, + "num_tokens": 67560920.0, + "step": 1767 + }, + { + "epoch": 0.22490777254802188, + "ewc_loss": 0.004123117309063673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1231174691347405e-05, + "grad_norm": 2.939671039581299, + "learning_rate": 7.490462060194997e-07, + "loss": 0.4133, + "mean_token_accuracy": 0.8587444424629211, + "num_tokens": 67594948.0, + "step": 1768 + }, + { + "epoch": 0.22503498282661238, + "ewc_loss": 0.004153229296207428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1532293835189193e-05, + "grad_norm": 2.9917731285095215, + "learning_rate": 
7.494701144552777e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.8369859457015991, + "num_tokens": 67628867.0, + "step": 1769 + }, + { + "epoch": 0.2251621931052029, + "ewc_loss": 0.004163476638495922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1634764784248546e-05, + "grad_norm": 2.984261989593506, + "learning_rate": 7.498940228910555e-07, + "loss": 0.4609, + "mean_token_accuracy": 0.8477252721786499, + "num_tokens": 67669762.0, + "step": 1770 + }, + { + "epoch": 0.2252894033837934, + "ewc_loss": 0.004145098850131035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.14509886468295e-05, + "grad_norm": 2.9425547122955322, + "learning_rate": 7.503179313268335e-07, + "loss": 0.4462, + "mean_token_accuracy": 0.8491823673248291, + "num_tokens": 67704750.0, + "step": 1771 + }, + { + "epoch": 0.2254166136623839, + "ewc_loss": 0.004140446428209543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.140446617384441e-05, + "grad_norm": 2.8798985481262207, + "learning_rate": 7.507418397626113e-07, + "loss": 0.483, + "mean_token_accuracy": 0.8414660692214966, + "num_tokens": 67745181.0, + "step": 1772 + }, + { + "epoch": 0.22554382394097444, + "ewc_loss": 0.004127993248403072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.127993088332005e-05, + "grad_norm": 2.9180266857147217, + "learning_rate": 7.51165748198389e-07, + "loss": 0.4709, + "mean_token_accuracy": 0.8426035642623901, + "num_tokens": 67785929.0, + "step": 1773 + }, + { + "epoch": 0.22567103421956494, + "ewc_loss": 0.004149607848376036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1496077756164595e-05, + "grad_norm": 2.9033429622650146, + "learning_rate": 7.51589656634167e-07, + "loss": 0.4284, + "mean_token_accuracy": 0.8543392419815063, + "num_tokens": 67821887.0, + "step": 1774 + }, + { + "epoch": 0.22579824449815544, + "ewc_loss": 0.004145124927163124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.145125058130361e-05, + "grad_norm": 2.909797191619873, + "learning_rate": 7.520135650699448e-07, + 
"loss": 0.4626, + "mean_token_accuracy": 0.8472915291786194, + "num_tokens": 67859608.0, + "step": 1775 + }, + { + "epoch": 0.22592545477674597, + "ewc_loss": 0.004154928494244814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.154928683419712e-05, + "grad_norm": 2.9465267658233643, + "learning_rate": 7.524374735057227e-07, + "loss": 0.4704, + "mean_token_accuracy": 0.8462629914283752, + "num_tokens": 67895287.0, + "step": 1776 + }, + { + "epoch": 0.22605266505533647, + "ewc_loss": 0.004170980304479599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.170980173512362e-05, + "grad_norm": 2.9340529441833496, + "learning_rate": 7.528613819415006e-07, + "loss": 0.4317, + "mean_token_accuracy": 0.8607879877090454, + "num_tokens": 67930358.0, + "step": 1777 + }, + { + "epoch": 0.22617987533392697, + "ewc_loss": 0.004160103388130665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.160103344474919e-05, + "grad_norm": 2.9565412998199463, + "learning_rate": 7.532852903772785e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8446541428565979, + "num_tokens": 67966112.0, + "step": 1778 + }, + { + "epoch": 0.2263070856125175, + "ewc_loss": 0.004180910065770149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.180910036666319e-05, + "grad_norm": 3.0285017490386963, + "learning_rate": 7.537091988130564e-07, + "loss": 0.4765, + "mean_token_accuracy": 0.841189980506897, + "num_tokens": 68001297.0, + "step": 1779 + }, + { + "epoch": 0.226434295891108, + "ewc_loss": 0.0042200842872262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.220084520056844e-05, + "grad_norm": 2.961940050125122, + "learning_rate": 7.541331072488342e-07, + "loss": 0.4591, + "mean_token_accuracy": 0.8491368293762207, + "num_tokens": 68037346.0, + "step": 1780 + }, + { + "epoch": 0.2265615061696985, + "ewc_loss": 0.004173923749476671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.17392366216518e-05, + "grad_norm": 2.9029383659362793, + "learning_rate": 7.54557015684612e-07, + "loss": 0.4583, + 
"mean_token_accuracy": 0.8472837209701538, + "num_tokens": 68076566.0, + "step": 1781 + }, + { + "epoch": 0.22668871644828903, + "ewc_loss": 0.004166214261204004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.166214057477191e-05, + "grad_norm": 2.919806480407715, + "learning_rate": 7.5498092412039e-07, + "loss": 0.4375, + "mean_token_accuracy": 0.8564333915710449, + "num_tokens": 68116834.0, + "step": 1782 + }, + { + "epoch": 0.22681592672687953, + "ewc_loss": 0.0041826446540653706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.18264462496154e-05, + "grad_norm": 2.8748393058776855, + "learning_rate": 7.554048325561678e-07, + "loss": 0.4091, + "mean_token_accuracy": 0.865179181098938, + "num_tokens": 68155573.0, + "step": 1783 + }, + { + "epoch": 0.22694313700547003, + "ewc_loss": 0.004163146018981934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.16314578615129e-05, + "grad_norm": 2.9709434509277344, + "learning_rate": 7.558287409919457e-07, + "loss": 0.502, + "mean_token_accuracy": 0.8394783139228821, + "num_tokens": 68193378.0, + "step": 1784 + }, + { + "epoch": 0.22707034728406056, + "ewc_loss": 0.004216045141220093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.216044908389449e-05, + "grad_norm": 2.994569778442383, + "learning_rate": 7.562526494277236e-07, + "loss": 0.4784, + "mean_token_accuracy": 0.8399266600608826, + "num_tokens": 68230183.0, + "step": 1785 + }, + { + "epoch": 0.22719755756265106, + "ewc_loss": 0.004203001037240028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.203000935376622e-05, + "grad_norm": 2.8934545516967773, + "learning_rate": 7.566765578635015e-07, + "loss": 0.4456, + "mean_token_accuracy": 0.8552557229995728, + "num_tokens": 68270526.0, + "step": 1786 + }, + { + "epoch": 0.22732476784124156, + "ewc_loss": 0.004158779978752136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.15878021158278e-05, + "grad_norm": 2.907780647277832, + "learning_rate": 7.571004662992794e-07, + "loss": 0.4722, + "mean_token_accuracy": 
0.8486136198043823, + "num_tokens": 68308690.0, + "step": 1787 + }, + { + "epoch": 0.2274519781198321, + "ewc_loss": 0.004188591614365578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.188591628917493e-05, + "grad_norm": 2.944974184036255, + "learning_rate": 7.575243747350572e-07, + "loss": 0.5142, + "mean_token_accuracy": 0.8352912068367004, + "num_tokens": 68348660.0, + "step": 1788 + }, + { + "epoch": 0.2275791883984226, + "ewc_loss": 0.004215392749756575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2153926187893376e-05, + "grad_norm": 2.8978312015533447, + "learning_rate": 7.57948283170835e-07, + "loss": 0.4621, + "mean_token_accuracy": 0.8512542247772217, + "num_tokens": 68389129.0, + "step": 1789 + }, + { + "epoch": 0.2277063986770131, + "ewc_loss": 0.004183659795671701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.183659621048719e-05, + "grad_norm": 3.0051870346069336, + "learning_rate": 7.58372191606613e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.851096510887146, + "num_tokens": 68423234.0, + "step": 1790 + }, + { + "epoch": 0.22783360895560362, + "ewc_loss": 0.004237276501953602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.237276516505517e-05, + "grad_norm": 2.9198038578033447, + "learning_rate": 7.587961000423908e-07, + "loss": 0.4621, + "mean_token_accuracy": 0.8499133586883545, + "num_tokens": 68460834.0, + "step": 1791 + }, + { + "epoch": 0.22796081923419412, + "ewc_loss": 0.004190202336758375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1902021621353924e-05, + "grad_norm": 3.008521795272827, + "learning_rate": 7.592200084781686e-07, + "loss": 0.5067, + "mean_token_accuracy": 0.8390456438064575, + "num_tokens": 68495605.0, + "step": 1792 + }, + { + "epoch": 0.22808802951278465, + "ewc_loss": 0.004246074240654707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.246074240654707e-05, + "grad_norm": 2.956279754638672, + "learning_rate": 7.596439169139466e-07, + "loss": 0.4536, + "mean_token_accuracy": 0.8497900366783142, + 
"num_tokens": 68530838.0, + "step": 1793 + }, + { + "epoch": 0.22821523979137515, + "ewc_loss": 0.004212815780192614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2128158384002745e-05, + "grad_norm": 2.923204183578491, + "learning_rate": 7.600678253497244e-07, + "loss": 0.4473, + "mean_token_accuracy": 0.8548172116279602, + "num_tokens": 68571225.0, + "step": 1794 + }, + { + "epoch": 0.22834245006996565, + "ewc_loss": 0.004198502749204636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1985029383795336e-05, + "grad_norm": 2.8799779415130615, + "learning_rate": 7.604917337855023e-07, + "loss": 0.4274, + "mean_token_accuracy": 0.8596692085266113, + "num_tokens": 68608284.0, + "step": 1795 + }, + { + "epoch": 0.22846966034855618, + "ewc_loss": 0.004198400769382715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.198400711175054e-05, + "grad_norm": 2.9019699096679688, + "learning_rate": 7.609156422212801e-07, + "loss": 0.4104, + "mean_token_accuracy": 0.862909197807312, + "num_tokens": 68645943.0, + "step": 1796 + }, + { + "epoch": 0.22859687062714668, + "ewc_loss": 0.004214154090732336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.214153887005523e-05, + "grad_norm": 2.895627737045288, + "learning_rate": 7.61339550657058e-07, + "loss": 0.4326, + "mean_token_accuracy": 0.8559397459030151, + "num_tokens": 68684245.0, + "step": 1797 + }, + { + "epoch": 0.22872408090573718, + "ewc_loss": 0.004210410173982382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.210410043015145e-05, + "grad_norm": 3.0499322414398193, + "learning_rate": 7.617634590928359e-07, + "loss": 0.4466, + "mean_token_accuracy": 0.8546608686447144, + "num_tokens": 68717532.0, + "step": 1798 + }, + { + "epoch": 0.2288512911843277, + "ewc_loss": 0.004282988607883453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.282988447812386e-05, + "grad_norm": 2.910050630569458, + "learning_rate": 7.621873675286138e-07, + "loss": 0.4662, + "mean_token_accuracy": 0.8483315706253052, + "num_tokens": 
68759515.0, + "step": 1799 + }, + { + "epoch": 0.2289785014629182, + "ewc_loss": 0.004188910126686096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.188910315860994e-05, + "grad_norm": 2.9369025230407715, + "learning_rate": 7.626112759643916e-07, + "loss": 0.4716, + "mean_token_accuracy": 0.8442397117614746, + "num_tokens": 68796969.0, + "step": 1800 + }, + { + "epoch": 0.2291057117415087, + "ewc_loss": 0.004207996651530266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.20799660787452e-05, + "grad_norm": 2.9726064205169678, + "learning_rate": 7.630351844001696e-07, + "loss": 0.4534, + "mean_token_accuracy": 0.8495659828186035, + "num_tokens": 68831138.0, + "step": 1801 + }, + { + "epoch": 0.22923292202009923, + "ewc_loss": 0.004241569899022579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.241569695295766e-05, + "grad_norm": 2.9153389930725098, + "learning_rate": 7.634590928359474e-07, + "loss": 0.4402, + "mean_token_accuracy": 0.8561837077140808, + "num_tokens": 68868741.0, + "step": 1802 + }, + { + "epoch": 0.22936013229868973, + "ewc_loss": 0.004197509493678808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.197509406367317e-05, + "grad_norm": 2.900239944458008, + "learning_rate": 7.638830012717253e-07, + "loss": 0.4659, + "mean_token_accuracy": 0.8447542190551758, + "num_tokens": 68909191.0, + "step": 1803 + }, + { + "epoch": 0.22948734257728023, + "ewc_loss": 0.004202307667583227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.20230753661599e-05, + "grad_norm": 2.9453365802764893, + "learning_rate": 7.643069097075031e-07, + "loss": 0.4404, + "mean_token_accuracy": 0.8529195785522461, + "num_tokens": 68949261.0, + "step": 1804 + }, + { + "epoch": 0.22961455285587076, + "ewc_loss": 0.004234664607793093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.234664447722025e-05, + "grad_norm": 2.890302896499634, + "learning_rate": 7.64730818143281e-07, + "loss": 0.4128, + "mean_token_accuracy": 0.8589433431625366, + "num_tokens": 68990434.0, + "step": 1805 
+ }, + { + "epoch": 0.22974176313446126, + "ewc_loss": 0.004193686414510012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.193686254438944e-05, + "grad_norm": 2.9459261894226074, + "learning_rate": 7.651547265790589e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.8395919799804688, + "num_tokens": 69029482.0, + "step": 1806 + }, + { + "epoch": 0.22986897341305176, + "ewc_loss": 0.004227600526064634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.227600584272295e-05, + "grad_norm": 2.927295446395874, + "learning_rate": 7.655786350148368e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8440313935279846, + "num_tokens": 69069766.0, + "step": 1807 + }, + { + "epoch": 0.2299961836916423, + "ewc_loss": 0.004207773599773645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2077735997736454e-05, + "grad_norm": 2.887620210647583, + "learning_rate": 7.660025434506146e-07, + "loss": 0.4614, + "mean_token_accuracy": 0.848221480846405, + "num_tokens": 69114592.0, + "step": 1808 + }, + { + "epoch": 0.2301233939702328, + "ewc_loss": 0.004187928978353739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1879291529767215e-05, + "grad_norm": 2.909736394882202, + "learning_rate": 7.664264518863926e-07, + "loss": 0.4515, + "mean_token_accuracy": 0.8515720963478088, + "num_tokens": 69152599.0, + "step": 1809 + }, + { + "epoch": 0.2302506042488233, + "ewc_loss": 0.004209999460726976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.209999315207824e-05, + "grad_norm": 2.92669677734375, + "learning_rate": 7.668503603221704e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8322082161903381, + "num_tokens": 69193157.0, + "step": 1810 + }, + { + "epoch": 0.23037781452741382, + "ewc_loss": 0.004214937333017588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2149375076405704e-05, + "grad_norm": 2.8929615020751953, + "learning_rate": 7.672742687579483e-07, + "loss": 0.4142, + "mean_token_accuracy": 0.8606045246124268, + "num_tokens": 69230681.0, + "step": 1811 + }, + { + "epoch": 
0.23050502480600432, + "ewc_loss": 0.00420018145814538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.200181501801126e-05, + "grad_norm": 3.0125298500061035, + "learning_rate": 7.676981771937261e-07, + "loss": 0.3952, + "mean_token_accuracy": 0.8650467395782471, + "num_tokens": 69261292.0, + "step": 1812 + }, + { + "epoch": 0.23063223508459482, + "ewc_loss": 0.004261806141585112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2618059524102136e-05, + "grad_norm": 2.923784017562866, + "learning_rate": 7.681220856295039e-07, + "loss": 0.4244, + "mean_token_accuracy": 0.8593848943710327, + "num_tokens": 69301795.0, + "step": 1813 + }, + { + "epoch": 0.23075944536318535, + "ewc_loss": 0.00419482309371233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.19482312281616e-05, + "grad_norm": 2.964486837387085, + "learning_rate": 7.685459940652819e-07, + "loss": 0.4487, + "mean_token_accuracy": 0.8483719229698181, + "num_tokens": 69337038.0, + "step": 1814 + }, + { + "epoch": 0.23088665564177585, + "ewc_loss": 0.004230956081300974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.230955892126076e-05, + "grad_norm": 2.8993141651153564, + "learning_rate": 7.689699025010597e-07, + "loss": 0.4437, + "mean_token_accuracy": 0.8517938852310181, + "num_tokens": 69379871.0, + "step": 1815 + }, + { + "epoch": 0.23101386592036638, + "ewc_loss": 0.004204639699310064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2046398448292166e-05, + "grad_norm": 3.034445285797119, + "learning_rate": 7.693938109368376e-07, + "loss": 0.4648, + "mean_token_accuracy": 0.8457632064819336, + "num_tokens": 69418021.0, + "step": 1816 + }, + { + "epoch": 0.23114107619895688, + "ewc_loss": 0.004262496251612902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.262496076989919e-05, + "grad_norm": 2.936338424682617, + "learning_rate": 7.698177193726155e-07, + "loss": 0.4468, + "mean_token_accuracy": 0.8505645990371704, + "num_tokens": 69455574.0, + "step": 1817 + }, + { + "epoch": 0.23126828647754738, + 
"ewc_loss": 0.004210089333355427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.210089173284359e-05, + "grad_norm": 2.9693100452423096, + "learning_rate": 7.702416278083933e-07, + "loss": 0.4697, + "mean_token_accuracy": 0.8459906578063965, + "num_tokens": 69489822.0, + "step": 1818 + }, + { + "epoch": 0.2313954967561379, + "ewc_loss": 0.004233773797750473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.233773870510049e-05, + "grad_norm": 2.889209747314453, + "learning_rate": 7.706655362441712e-07, + "loss": 0.4394, + "mean_token_accuracy": 0.8539695739746094, + "num_tokens": 69532821.0, + "step": 1819 + }, + { + "epoch": 0.2315227070347284, + "ewc_loss": 0.004216745495796204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.216745583107695e-05, + "grad_norm": 3.007970094680786, + "learning_rate": 7.710894446799491e-07, + "loss": 0.4242, + "mean_token_accuracy": 0.8575384020805359, + "num_tokens": 69565674.0, + "step": 1820 + }, + { + "epoch": 0.2316499173133189, + "ewc_loss": 0.004285634960979223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.285635077394545e-05, + "grad_norm": 2.9468939304351807, + "learning_rate": 7.715133531157269e-07, + "loss": 0.4256, + "mean_token_accuracy": 0.8610424399375916, + "num_tokens": 69606118.0, + "step": 1821 + }, + { + "epoch": 0.23177712759190944, + "ewc_loss": 0.004234421998262405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.234422158333473e-05, + "grad_norm": 2.925307035446167, + "learning_rate": 7.719372615515049e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.8384698629379272, + "num_tokens": 69650102.0, + "step": 1822 + }, + { + "epoch": 0.23190433787049994, + "ewc_loss": 0.004240716807544231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.240716589265503e-05, + "grad_norm": 2.917694091796875, + "learning_rate": 7.723611699872827e-07, + "loss": 0.4383, + "mean_token_accuracy": 0.8528302907943726, + "num_tokens": 69693440.0, + "step": 1823 + }, + { + "epoch": 0.23203154814909044, + "ewc_loss": 
0.004246033728122711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2460334952920675e-05, + "grad_norm": 3.0025503635406494, + "learning_rate": 7.727850784230606e-07, + "loss": 0.4584, + "mean_token_accuracy": 0.8450372815132141, + "num_tokens": 69732651.0, + "step": 1824 + }, + { + "epoch": 0.23215875842768097, + "ewc_loss": 0.004276903811842203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.276903928257525e-05, + "grad_norm": 2.984618663787842, + "learning_rate": 7.732089868588385e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.845689594745636, + "num_tokens": 69769915.0, + "step": 1825 + }, + { + "epoch": 0.23228596870627147, + "ewc_loss": 0.004257597960531712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.257597902324051e-05, + "grad_norm": 2.8675332069396973, + "learning_rate": 7.736328952946163e-07, + "loss": 0.397, + "mean_token_accuracy": 0.8690222501754761, + "num_tokens": 69813943.0, + "step": 1826 + }, + { + "epoch": 0.23241317898486197, + "ewc_loss": 0.004222701769322157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2227016820106655e-05, + "grad_norm": 3.0113351345062256, + "learning_rate": 7.740568037303942e-07, + "loss": 0.4263, + "mean_token_accuracy": 0.8563600778579712, + "num_tokens": 69847039.0, + "step": 1827 + }, + { + "epoch": 0.2325403892634525, + "ewc_loss": 0.0043093119747936726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3093117710668594e-05, + "grad_norm": 2.9137792587280273, + "learning_rate": 7.744807121661721e-07, + "loss": 0.3905, + "mean_token_accuracy": 0.871464729309082, + "num_tokens": 69884588.0, + "step": 1828 + }, + { + "epoch": 0.232667599542043, + "ewc_loss": 0.004238424822688103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.238424662617035e-05, + "grad_norm": 2.9563612937927246, + "learning_rate": 7.749046206019499e-07, + "loss": 0.4541, + "mean_token_accuracy": 0.8508943319320679, + "num_tokens": 69925552.0, + "step": 1829 + }, + { + "epoch": 0.2327948098206335, + "ewc_loss": 0.004268956370651722, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2689563997555524e-05, + "grad_norm": 2.942996025085449, + "learning_rate": 7.753285290377279e-07, + "loss": 0.39, + "mean_token_accuracy": 0.8684296607971191, + "num_tokens": 69961776.0, + "step": 1830 + }, + { + "epoch": 0.23292202009922403, + "ewc_loss": 0.004259047564119101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.259047636878677e-05, + "grad_norm": 2.96085524559021, + "learning_rate": 7.757524374735057e-07, + "loss": 0.4964, + "mean_token_accuracy": 0.8359610438346863, + "num_tokens": 69999775.0, + "step": 1831 + }, + { + "epoch": 0.23304923037781453, + "ewc_loss": 0.00426153838634491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.261538197170012e-05, + "grad_norm": 2.9337210655212402, + "learning_rate": 7.761763459092836e-07, + "loss": 0.4057, + "mean_token_accuracy": 0.8656321167945862, + "num_tokens": 70037294.0, + "step": 1832 + }, + { + "epoch": 0.23317644065640503, + "ewc_loss": 0.004256606567651033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.256606553099118e-05, + "grad_norm": 2.991358518600464, + "learning_rate": 7.766002543450614e-07, + "loss": 0.4408, + "mean_token_accuracy": 0.8549709916114807, + "num_tokens": 70075014.0, + "step": 1833 + }, + { + "epoch": 0.23330365093499555, + "ewc_loss": 0.004281162284314632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.281162182451226e-05, + "grad_norm": 2.944545269012451, + "learning_rate": 7.770241627808392e-07, + "loss": 0.3993, + "mean_token_accuracy": 0.8660063743591309, + "num_tokens": 70112186.0, + "step": 1834 + }, + { + "epoch": 0.23343086121358606, + "ewc_loss": 0.004252230748534203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.252230792189948e-05, + "grad_norm": 2.959895610809326, + "learning_rate": 7.774480712166172e-07, + "loss": 0.4738, + "mean_token_accuracy": 0.8442232608795166, + "num_tokens": 70153285.0, + "step": 1835 + }, + { + "epoch": 0.23355807149217656, + "ewc_loss": 0.004269691184163094, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 4.269691271474585e-05, + "grad_norm": 2.9876351356506348, + "learning_rate": 7.77871979652395e-07, + "loss": 0.4574, + "mean_token_accuracy": 0.8493019342422485, + "num_tokens": 70193382.0, + "step": 1836 + }, + { + "epoch": 0.23368528177076708, + "ewc_loss": 0.0042763990350067616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2763989767991006e-05, + "grad_norm": 3.012924909591675, + "learning_rate": 7.782958880881729e-07, + "loss": 0.4465, + "mean_token_accuracy": 0.852616548538208, + "num_tokens": 70227279.0, + "step": 1837 + }, + { + "epoch": 0.23381249204935758, + "ewc_loss": 0.004288758151233196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.288758282200433e-05, + "grad_norm": 2.984229803085327, + "learning_rate": 7.787197965239508e-07, + "loss": 0.4114, + "mean_token_accuracy": 0.8599795699119568, + "num_tokens": 70264395.0, + "step": 1838 + }, + { + "epoch": 0.23393970232794808, + "ewc_loss": 0.004275170620530844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.275170431355946e-05, + "grad_norm": 3.147784948348999, + "learning_rate": 7.791437049597287e-07, + "loss": 0.523, + "mean_token_accuracy": 0.8291987180709839, + "num_tokens": 70295016.0, + "step": 1839 + }, + { + "epoch": 0.2340669126065386, + "ewc_loss": 0.004361055325716734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3610551074380055e-05, + "grad_norm": 2.965376138687134, + "learning_rate": 7.795676133955065e-07, + "loss": 0.4669, + "mean_token_accuracy": 0.846555233001709, + "num_tokens": 70332774.0, + "step": 1840 + }, + { + "epoch": 0.2341941228851291, + "ewc_loss": 0.004253419581800699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.253419683664106e-05, + "grad_norm": 2.9640402793884277, + "learning_rate": 7.799915218312844e-07, + "loss": 0.4538, + "mean_token_accuracy": 0.8506002426147461, + "num_tokens": 70366880.0, + "step": 1841 + }, + { + "epoch": 0.23432133316371964, + "ewc_loss": 0.0042950063943862915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
4.295006510801613e-05, + "grad_norm": 2.9085140228271484, + "learning_rate": 7.804154302670622e-07, + "loss": 0.4292, + "mean_token_accuracy": 0.8580639362335205, + "num_tokens": 70405235.0, + "step": 1842 + }, + { + "epoch": 0.23444854344231014, + "ewc_loss": 0.004288213327527046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.288213312975131e-05, + "grad_norm": 2.8947806358337402, + "learning_rate": 7.808393387028402e-07, + "loss": 0.4373, + "mean_token_accuracy": 0.8540514707565308, + "num_tokens": 70446503.0, + "step": 1843 + }, + { + "epoch": 0.23457575372090064, + "ewc_loss": 0.004295022692531347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2950225179083645e-05, + "grad_norm": 3.002983808517456, + "learning_rate": 7.81263247138618e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.8387118577957153, + "num_tokens": 70483514.0, + "step": 1844 + }, + { + "epoch": 0.23470296399949117, + "ewc_loss": 0.004346147645264864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.346147761680186e-05, + "grad_norm": 2.9561424255371094, + "learning_rate": 7.816871555743959e-07, + "loss": 0.4506, + "mean_token_accuracy": 0.8503196239471436, + "num_tokens": 70520944.0, + "step": 1845 + }, + { + "epoch": 0.23483017427808167, + "ewc_loss": 0.004312889184802771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3128893594257534e-05, + "grad_norm": 2.940321207046509, + "learning_rate": 7.821110640101738e-07, + "loss": 0.4735, + "mean_token_accuracy": 0.840915322303772, + "num_tokens": 70564399.0, + "step": 1846 + }, + { + "epoch": 0.23495738455667217, + "ewc_loss": 0.00431443378329277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.314433681429364e-05, + "grad_norm": 2.960376024246216, + "learning_rate": 7.825349724459517e-07, + "loss": 0.4477, + "mean_token_accuracy": 0.8537997603416443, + "num_tokens": 70604000.0, + "step": 1847 + }, + { + "epoch": 0.2350845948352627, + "ewc_loss": 0.004335231613367796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3352316424716264e-05, 
+ "grad_norm": 2.9856512546539307, + "learning_rate": 7.829588808817294e-07, + "loss": 0.4413, + "mean_token_accuracy": 0.8526970744132996, + "num_tokens": 70645133.0, + "step": 1848 + }, + { + "epoch": 0.2352118051138532, + "ewc_loss": 0.004329973831772804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.329973671701737e-05, + "grad_norm": 3.0839755535125732, + "learning_rate": 7.833827893175074e-07, + "loss": 0.4738, + "mean_token_accuracy": 0.8379768133163452, + "num_tokens": 70675824.0, + "step": 1849 + }, + { + "epoch": 0.2353390153924437, + "ewc_loss": 0.004365837201476097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3658372305799276e-05, + "grad_norm": 2.9572575092315674, + "learning_rate": 7.838066977532852e-07, + "loss": 0.4369, + "mean_token_accuracy": 0.8535094261169434, + "num_tokens": 70714103.0, + "step": 1850 + }, + { + "epoch": 0.23546622567103423, + "ewc_loss": 0.00429685041308403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2968502384610474e-05, + "grad_norm": 2.9503931999206543, + "learning_rate": 7.842306061890632e-07, + "loss": 0.4838, + "mean_token_accuracy": 0.8427160978317261, + "num_tokens": 70754265.0, + "step": 1851 + }, + { + "epoch": 0.23559343594962473, + "ewc_loss": 0.004314727615565062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.314727630116977e-05, + "grad_norm": 2.963898181915283, + "learning_rate": 7.84654514624841e-07, + "loss": 0.4555, + "mean_token_accuracy": 0.8476438522338867, + "num_tokens": 70791368.0, + "step": 1852 + }, + { + "epoch": 0.23572064622821523, + "ewc_loss": 0.0043313344940543175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.33133463957347e-05, + "grad_norm": 3.029003381729126, + "learning_rate": 7.850784230606188e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.8395150899887085, + "num_tokens": 70827397.0, + "step": 1853 + }, + { + "epoch": 0.23584785650680576, + "ewc_loss": 0.004353638738393784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3536387238418683e-05, + "grad_norm": 
2.8860433101654053, + "learning_rate": 7.855023314963968e-07, + "loss": 0.4689, + "mean_token_accuracy": 0.8472123146057129, + "num_tokens": 70872503.0, + "step": 1854 + }, + { + "epoch": 0.23597506678539626, + "ewc_loss": 0.004286422859877348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.286423063604161e-05, + "grad_norm": 2.9561312198638916, + "learning_rate": 7.859262399321746e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.8436963558197021, + "num_tokens": 70914376.0, + "step": 1855 + }, + { + "epoch": 0.23610227706398676, + "ewc_loss": 0.004349611233919859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.349611117504537e-05, + "grad_norm": 2.938922882080078, + "learning_rate": 7.863501483679524e-07, + "loss": 0.4278, + "mean_token_accuracy": 0.8576191663742065, + "num_tokens": 70954004.0, + "step": 1856 + }, + { + "epoch": 0.2362294873425773, + "ewc_loss": 0.004330415278673172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.330415322328918e-05, + "grad_norm": 2.9789462089538574, + "learning_rate": 7.867740568037303e-07, + "loss": 0.426, + "mean_token_accuracy": 0.854119062423706, + "num_tokens": 70991765.0, + "step": 1857 + }, + { + "epoch": 0.2363566976211678, + "ewc_loss": 0.004350300878286362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.350300878286362e-05, + "grad_norm": 3.028707504272461, + "learning_rate": 7.871979652395082e-07, + "loss": 0.4555, + "mean_token_accuracy": 0.8465870022773743, + "num_tokens": 71025478.0, + "step": 1858 + }, + { + "epoch": 0.2364839078997583, + "ewc_loss": 0.004373463802039623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3734638893511146e-05, + "grad_norm": 2.9331347942352295, + "learning_rate": 7.876218736752861e-07, + "loss": 0.4105, + "mean_token_accuracy": 0.865663468837738, + "num_tokens": 71067287.0, + "step": 1859 + }, + { + "epoch": 0.23661111817834882, + "ewc_loss": 0.004324525594711304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.324525434640236e-05, + "grad_norm": 2.9671123027801514, + 
"learning_rate": 7.88045782111064e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.843101978302002, + "num_tokens": 71108095.0, + "step": 1860 + }, + { + "epoch": 0.23673832845693932, + "ewc_loss": 0.004351419396698475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3514195567695424e-05, + "grad_norm": 2.9063613414764404, + "learning_rate": 7.884696905468418e-07, + "loss": 0.4142, + "mean_token_accuracy": 0.8632662892341614, + "num_tokens": 71150297.0, + "step": 1861 + }, + { + "epoch": 0.23686553873552982, + "ewc_loss": 0.004324308130890131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.324308247305453e-05, + "grad_norm": 3.0154123306274414, + "learning_rate": 7.888935989826198e-07, + "loss": 0.4621, + "mean_token_accuracy": 0.844885528087616, + "num_tokens": 71189205.0, + "step": 1862 + }, + { + "epoch": 0.23699274901412035, + "ewc_loss": 0.004378700163215399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.378700032248162e-05, + "grad_norm": 2.983741521835327, + "learning_rate": 7.893175074183976e-07, + "loss": 0.4504, + "mean_token_accuracy": 0.851473867893219, + "num_tokens": 71228141.0, + "step": 1863 + }, + { + "epoch": 0.23711995929271085, + "ewc_loss": 0.004344868939369917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.344869012129493e-05, + "grad_norm": 2.949228048324585, + "learning_rate": 7.897414158541754e-07, + "loss": 0.4493, + "mean_token_accuracy": 0.8504139184951782, + "num_tokens": 71267854.0, + "step": 1864 + }, + { + "epoch": 0.23724716957130135, + "ewc_loss": 0.004329659976065159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3296600779285654e-05, + "grad_norm": 2.9841156005859375, + "learning_rate": 7.901653242899533e-07, + "loss": 0.4856, + "mean_token_accuracy": 0.8451001644134521, + "num_tokens": 71305030.0, + "step": 1865 + }, + { + "epoch": 0.23737437984989188, + "ewc_loss": 0.004361093509942293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.361093670013361e-05, + "grad_norm": 2.9869043827056885, + "learning_rate": 
7.905892327257312e-07, + "loss": 0.4421, + "mean_token_accuracy": 0.8516974449157715, + "num_tokens": 71342202.0, + "step": 1866 + }, + { + "epoch": 0.23750159012848238, + "ewc_loss": 0.004354448057711124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.354448174126446e-05, + "grad_norm": 2.997558355331421, + "learning_rate": 7.910131411615091e-07, + "loss": 0.4432, + "mean_token_accuracy": 0.8557561635971069, + "num_tokens": 71379702.0, + "step": 1867 + }, + { + "epoch": 0.2376288004070729, + "ewc_loss": 0.004358107689768076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3581076170085e-05, + "grad_norm": 2.9388608932495117, + "learning_rate": 7.91437049597287e-07, + "loss": 0.4434, + "mean_token_accuracy": 0.851580798625946, + "num_tokens": 71417785.0, + "step": 1868 + }, + { + "epoch": 0.2377560106856634, + "ewc_loss": 0.0043320669792592525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.332066964707337e-05, + "grad_norm": 2.957970142364502, + "learning_rate": 7.918609580330648e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.8395408391952515, + "num_tokens": 71461358.0, + "step": 1869 + }, + { + "epoch": 0.2378832209642539, + "ewc_loss": 0.004354081116616726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.354081102064811e-05, + "grad_norm": 2.9799115657806396, + "learning_rate": 7.922848664688428e-07, + "loss": 0.4845, + "mean_token_accuracy": 0.8433574438095093, + "num_tokens": 71500059.0, + "step": 1870 + }, + { + "epoch": 0.23801043124284443, + "ewc_loss": 0.004354868549853563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3548683606786653e-05, + "grad_norm": 2.960685968399048, + "learning_rate": 7.927087749046205e-07, + "loss": 0.3971, + "mean_token_accuracy": 0.867215633392334, + "num_tokens": 71537043.0, + "step": 1871 + }, + { + "epoch": 0.23813764152143493, + "ewc_loss": 0.004349385388195515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3493855628184974e-05, + "grad_norm": 3.0098352432250977, + "learning_rate": 7.931326833403983e-07, + 
"loss": 0.4321, + "mean_token_accuracy": 0.8562926054000854, + "num_tokens": 71570671.0, + "step": 1872 + }, + { + "epoch": 0.23826485180002543, + "ewc_loss": 0.0043783350847661495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3783351429738104e-05, + "grad_norm": 2.9664063453674316, + "learning_rate": 7.935565917761763e-07, + "loss": 0.4401, + "mean_token_accuracy": 0.854374885559082, + "num_tokens": 71614813.0, + "step": 1873 + }, + { + "epoch": 0.23839206207861596, + "ewc_loss": 0.004353295546025038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3532956624403596e-05, + "grad_norm": 3.069856643676758, + "learning_rate": 7.939805002119541e-07, + "loss": 0.4566, + "mean_token_accuracy": 0.8487370014190674, + "num_tokens": 71650441.0, + "step": 1874 + }, + { + "epoch": 0.23851927235720646, + "ewc_loss": 0.004421243444085121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.4212432840140536e-05, + "grad_norm": 2.9786813259124756, + "learning_rate": 7.944044086477321e-07, + "loss": 0.4626, + "mean_token_accuracy": 0.8480560183525085, + "num_tokens": 71690425.0, + "step": 1875 + }, + { + "epoch": 0.23864648263579696, + "ewc_loss": 0.0043622953817248344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.362295294413343e-05, + "grad_norm": 2.946767807006836, + "learning_rate": 7.948283170835099e-07, + "loss": 0.4097, + "mean_token_accuracy": 0.8624258041381836, + "num_tokens": 71727349.0, + "step": 1876 + }, + { + "epoch": 0.2387736929143875, + "ewc_loss": 0.00436739157885313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.367391375126317e-05, + "grad_norm": 2.8731067180633545, + "learning_rate": 7.952522255192878e-07, + "loss": 0.379, + "mean_token_accuracy": 0.8721876740455627, + "num_tokens": 71770386.0, + "step": 1877 + }, + { + "epoch": 0.238900903192978, + "ewc_loss": 0.004351366311311722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.351366078481078e-05, + "grad_norm": 2.98911452293396, + "learning_rate": 7.956761339550657e-07, + "loss": 0.4066, + 
"mean_token_accuracy": 0.8638932108879089, + "num_tokens": 71808946.0, + "step": 1878 + }, + { + "epoch": 0.2390281134715685, + "ewc_loss": 0.004413968417793512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.4139684177935123e-05, + "grad_norm": 2.992084503173828, + "learning_rate": 7.961000423908435e-07, + "loss": 0.4497, + "mean_token_accuracy": 0.8571196794509888, + "num_tokens": 71848605.0, + "step": 1879 + }, + { + "epoch": 0.23915532375015902, + "ewc_loss": 0.004389825277030468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.389825335238129e-05, + "grad_norm": 2.9854977130889893, + "learning_rate": 7.965239508266214e-07, + "loss": 0.4276, + "mean_token_accuracy": 0.8565584421157837, + "num_tokens": 71886771.0, + "step": 1880 + }, + { + "epoch": 0.23928253402874952, + "ewc_loss": 0.004387648310512304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.387648368719965e-05, + "grad_norm": 2.9387319087982178, + "learning_rate": 7.969478592623993e-07, + "loss": 0.4057, + "mean_token_accuracy": 0.8630518317222595, + "num_tokens": 71924905.0, + "step": 1881 + }, + { + "epoch": 0.23940974430734002, + "ewc_loss": 0.004373578354716301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.373578121885657e-05, + "grad_norm": 2.947173595428467, + "learning_rate": 7.973717676981771e-07, + "loss": 0.433, + "mean_token_accuracy": 0.8576891422271729, + "num_tokens": 71964513.0, + "step": 1882 + }, + { + "epoch": 0.23953695458593055, + "ewc_loss": 0.004390072543174028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3900723539991304e-05, + "grad_norm": 2.982381582260132, + "learning_rate": 7.977956761339551e-07, + "loss": 0.405, + "mean_token_accuracy": 0.8652280569076538, + "num_tokens": 72000918.0, + "step": 1883 + }, + { + "epoch": 0.23966416486452105, + "ewc_loss": 0.004397173877805471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.397174052428454e-05, + "grad_norm": 3.014709711074829, + "learning_rate": 7.982195845697329e-07, + "loss": 0.5321, + "mean_token_accuracy": 
0.8255732655525208, + "num_tokens": 72040840.0, + "step": 1884 + }, + { + "epoch": 0.23979137514311155, + "ewc_loss": 0.004406212829053402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.406212974572554e-05, + "grad_norm": 3.0035672187805176, + "learning_rate": 7.986434930055108e-07, + "loss": 0.4065, + "mean_token_accuracy": 0.8626512289047241, + "num_tokens": 72076757.0, + "step": 1885 + }, + { + "epoch": 0.23991858542170208, + "ewc_loss": 0.004385648760944605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.385648571769707e-05, + "grad_norm": 2.9255356788635254, + "learning_rate": 7.990674014412886e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.8484358787536621, + "num_tokens": 72120760.0, + "step": 1886 + }, + { + "epoch": 0.24004579570029258, + "ewc_loss": 0.004358289297670126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3582895159488544e-05, + "grad_norm": 3.008910894393921, + "learning_rate": 7.994913098770665e-07, + "loss": 0.4489, + "mean_token_accuracy": 0.8511099815368652, + "num_tokens": 72159985.0, + "step": 1887 + }, + { + "epoch": 0.24017300597888308, + "ewc_loss": 0.004405147861689329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.405147774377838e-05, + "grad_norm": 3.018820285797119, + "learning_rate": 7.999152183128444e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.8406476974487305, + "num_tokens": 72196377.0, + "step": 1888 + }, + { + "epoch": 0.2403002162574736, + "ewc_loss": 0.004397016018629074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.397015800350346e-05, + "grad_norm": 2.9458608627319336, + "learning_rate": 8.003391267486223e-07, + "loss": 0.4281, + "mean_token_accuracy": 0.8580771088600159, + "num_tokens": 72236585.0, + "step": 1889 + }, + { + "epoch": 0.2404274265360641, + "ewc_loss": 0.004358159378170967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3581592763075605e-05, + "grad_norm": 3.0839345455169678, + "learning_rate": 8.007630351844001e-07, + "loss": 0.4407, + "mean_token_accuracy": 0.8535094261169434, + 
"num_tokens": 72268450.0, + "step": 1890 + }, + { + "epoch": 0.24055463681465464, + "ewc_loss": 0.0044507551938295364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.450755295692943e-05, + "grad_norm": 3.0122599601745605, + "learning_rate": 8.011869436201781e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.857448935508728, + "num_tokens": 72304092.0, + "step": 1891 + }, + { + "epoch": 0.24068184709324514, + "ewc_loss": 0.004397217184305191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3972173443762586e-05, + "grad_norm": 2.96453595161438, + "learning_rate": 8.016108520559559e-07, + "loss": 0.4524, + "mean_token_accuracy": 0.8499065637588501, + "num_tokens": 72347312.0, + "step": 1892 + }, + { + "epoch": 0.24080905737183564, + "ewc_loss": 0.004384761676192284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.384761632536538e-05, + "grad_norm": 3.0388331413269043, + "learning_rate": 8.020347604917338e-07, + "loss": 0.4178, + "mean_token_accuracy": 0.8603392839431763, + "num_tokens": 72382135.0, + "step": 1893 + }, + { + "epoch": 0.24093626765042617, + "ewc_loss": 0.004438117146492004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.438117321114987e-05, + "grad_norm": 3.019235610961914, + "learning_rate": 8.024586689275116e-07, + "loss": 0.4414, + "mean_token_accuracy": 0.8521899580955505, + "num_tokens": 72421526.0, + "step": 1894 + }, + { + "epoch": 0.24106347792901667, + "ewc_loss": 0.004406174644827843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.406174411997199e-05, + "grad_norm": 3.0013363361358643, + "learning_rate": 8.028825773632894e-07, + "loss": 0.4455, + "mean_token_accuracy": 0.8551778197288513, + "num_tokens": 72458592.0, + "step": 1895 + }, + { + "epoch": 0.24119068820760717, + "ewc_loss": 0.004407237283885479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.407237429404631e-05, + "grad_norm": 3.0180656909942627, + "learning_rate": 8.033064857990674e-07, + "loss": 0.3774, + "mean_token_accuracy": 0.8731735348701477, + "num_tokens": 
72494734.0, + "step": 1896 + }, + { + "epoch": 0.2413178984861977, + "ewc_loss": 0.004422290716320276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.4222906581126153e-05, + "grad_norm": 2.969348907470703, + "learning_rate": 8.037303942348452e-07, + "loss": 0.4259, + "mean_token_accuracy": 0.8588980436325073, + "num_tokens": 72531560.0, + "step": 1897 + }, + { + "epoch": 0.2414451087647882, + "ewc_loss": 0.0043947575613856316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.394757706904784e-05, + "grad_norm": 2.9868738651275635, + "learning_rate": 8.041543026706231e-07, + "loss": 0.4237, + "mean_token_accuracy": 0.8592119216918945, + "num_tokens": 72570144.0, + "step": 1898 + }, + { + "epoch": 0.2415723190433787, + "ewc_loss": 0.0044253007508814335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.425300721777603e-05, + "grad_norm": 2.9733712673187256, + "learning_rate": 8.04578211106401e-07, + "loss": 0.4318, + "mean_token_accuracy": 0.854512095451355, + "num_tokens": 72606738.0, + "step": 1899 + }, + { + "epoch": 0.24169952932196923, + "ewc_loss": 0.004421014804393053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.421014818944968e-05, + "grad_norm": 2.9750494956970215, + "learning_rate": 8.050021195421789e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.8431766629219055, + "num_tokens": 72646230.0, + "step": 1900 + }, + { + "epoch": 0.24182673960055973, + "ewc_loss": 0.0044320556335151196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.432055720826611e-05, + "grad_norm": 3.0203475952148438, + "learning_rate": 8.054260279779567e-07, + "loss": 0.4449, + "mean_token_accuracy": 0.8498172163963318, + "num_tokens": 72681811.0, + "step": 1901 + }, + { + "epoch": 0.24195394987915023, + "ewc_loss": 0.004461208824068308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.461209027795121e-05, + "grad_norm": 3.0098814964294434, + "learning_rate": 8.058499364137346e-07, + "loss": 0.4808, + "mean_token_accuracy": 0.8423203229904175, + "num_tokens": 72721805.0, + "step": 
1902 + }, + { + "epoch": 0.24208116015774075, + "ewc_loss": 0.004450703971087933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.450704000191763e-05, + "grad_norm": 2.936657667160034, + "learning_rate": 8.062738448495124e-07, + "loss": 0.3963, + "mean_token_accuracy": 0.8672752380371094, + "num_tokens": 72763142.0, + "step": 1903 + }, + { + "epoch": 0.24220837043633126, + "ewc_loss": 0.004418639000505209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.4186392187839374e-05, + "grad_norm": 2.9592154026031494, + "learning_rate": 8.066977532852904e-07, + "loss": 0.3874, + "mean_token_accuracy": 0.8740651607513428, + "num_tokens": 72802047.0, + "step": 1904 + }, + { + "epoch": 0.24233558071492176, + "ewc_loss": 0.00445984024554491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.459840420167893e-05, + "grad_norm": 3.0614516735076904, + "learning_rate": 8.071216617210682e-07, + "loss": 0.4624, + "mean_token_accuracy": 0.8485411405563354, + "num_tokens": 72834470.0, + "step": 1905 + }, + { + "epoch": 0.24246279099351228, + "ewc_loss": 0.004507057834416628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5070577471051365e-05, + "grad_norm": 2.9436511993408203, + "learning_rate": 8.075455701568461e-07, + "loss": 0.4335, + "mean_token_accuracy": 0.8560736179351807, + "num_tokens": 72873101.0, + "step": 1906 + }, + { + "epoch": 0.24259000127210278, + "ewc_loss": 0.0044304258190095425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.430425906321034e-05, + "grad_norm": 2.939239501953125, + "learning_rate": 8.07969478592624e-07, + "loss": 0.4158, + "mean_token_accuracy": 0.8637203574180603, + "num_tokens": 72915504.0, + "step": 1907 + }, + { + "epoch": 0.24271721155069328, + "ewc_loss": 0.004458141513168812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.4581414840649813e-05, + "grad_norm": 3.031447649002075, + "learning_rate": 8.083933870284019e-07, + "loss": 0.39, + "mean_token_accuracy": 0.8713687658309937, + "num_tokens": 72947179.0, + "step": 1908 + }, + { + 
"epoch": 0.2428444218292838, + "ewc_loss": 0.004501634743064642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.501634612097405e-05, + "grad_norm": 3.0191667079925537, + "learning_rate": 8.088172954641796e-07, + "loss": 0.4284, + "mean_token_accuracy": 0.8577234745025635, + "num_tokens": 72983313.0, + "step": 1909 + }, + { + "epoch": 0.2429716321078743, + "ewc_loss": 0.004473499488085508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.473499575397e-05, + "grad_norm": 3.130064010620117, + "learning_rate": 8.092412038999576e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.8441039323806763, + "num_tokens": 73014218.0, + "step": 1910 + }, + { + "epoch": 0.24309884238646481, + "ewc_loss": 0.004533783998340368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.533784158411436e-05, + "grad_norm": 3.131991147994995, + "learning_rate": 8.096651123357354e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.8369956016540527, + "num_tokens": 73048282.0, + "step": 1911 + }, + { + "epoch": 0.24322605266505534, + "ewc_loss": 0.00452252384275198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5225238864077255e-05, + "grad_norm": 3.0334627628326416, + "learning_rate": 8.100890207715134e-07, + "loss": 0.4747, + "mean_token_accuracy": 0.8433821797370911, + "num_tokens": 73084390.0, + "step": 1912 + }, + { + "epoch": 0.24335326294364584, + "ewc_loss": 0.004478521179407835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.4785210775444284e-05, + "grad_norm": 3.036689519882202, + "learning_rate": 8.105129292072912e-07, + "loss": 0.442, + "mean_token_accuracy": 0.8534478545188904, + "num_tokens": 73123296.0, + "step": 1913 + }, + { + "epoch": 0.24348047322223634, + "ewc_loss": 0.004504176788032055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5041768316878006e-05, + "grad_norm": 3.0706467628479004, + "learning_rate": 8.10936837643069e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.8414695262908936, + "num_tokens": 73157533.0, + "step": 1914 + }, + { + "epoch": 
0.24360768350082687, + "ewc_loss": 0.004536750726401806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5367505663307384e-05, + "grad_norm": 2.9926578998565674, + "learning_rate": 8.11360746078847e-07, + "loss": 0.4212, + "mean_token_accuracy": 0.8576868176460266, + "num_tokens": 73195314.0, + "step": 1915 + }, + { + "epoch": 0.24373489377941737, + "ewc_loss": 0.004490530584007502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.4905307731823996e-05, + "grad_norm": 2.9207522869110107, + "learning_rate": 8.117846545146248e-07, + "loss": 0.3896, + "mean_token_accuracy": 0.8672627210617065, + "num_tokens": 73236763.0, + "step": 1916 + }, + { + "epoch": 0.2438621040580079, + "ewc_loss": 0.004482286516577005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.482286385609768e-05, + "grad_norm": 3.0090255737304688, + "learning_rate": 8.122085629504026e-07, + "loss": 0.475, + "mean_token_accuracy": 0.8412278890609741, + "num_tokens": 73276719.0, + "step": 1917 + }, + { + "epoch": 0.2439893143365984, + "ewc_loss": 0.004548987839370966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.548987635644153e-05, + "grad_norm": 2.9715819358825684, + "learning_rate": 8.126324713861805e-07, + "loss": 0.4425, + "mean_token_accuracy": 0.8556733131408691, + "num_tokens": 73318594.0, + "step": 1918 + }, + { + "epoch": 0.2441165246151889, + "ewc_loss": 0.004502166993916035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.502167212194763e-05, + "grad_norm": 2.996856689453125, + "learning_rate": 8.130563798219584e-07, + "loss": 0.4204, + "mean_token_accuracy": 0.8622363805770874, + "num_tokens": 73358760.0, + "step": 1919 + }, + { + "epoch": 0.24424373489377943, + "ewc_loss": 0.00452799629420042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5279964979272336e-05, + "grad_norm": 3.029111385345459, + "learning_rate": 8.134802882577363e-07, + "loss": 0.4614, + "mean_token_accuracy": 0.8516700267791748, + "num_tokens": 73398623.0, + "step": 1920 + }, + { + "epoch": 0.24437094517236993, + 
"ewc_loss": 0.004545877687633038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.545877891359851e-05, + "grad_norm": 2.939908981323242, + "learning_rate": 8.139041966935142e-07, + "loss": 0.4442, + "mean_token_accuracy": 0.85491943359375, + "num_tokens": 73446369.0, + "step": 1921 + }, + { + "epoch": 0.24449815545096043, + "ewc_loss": 0.004495411645621061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.495411485549994e-05, + "grad_norm": 3.0572919845581055, + "learning_rate": 8.14328105129292e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8397401571273804, + "num_tokens": 73483152.0, + "step": 1922 + }, + { + "epoch": 0.24462536572955096, + "ewc_loss": 0.004558220040053129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.55822009826079e-05, + "grad_norm": 3.0462965965270996, + "learning_rate": 8.1475201356507e-07, + "loss": 0.4568, + "mean_token_accuracy": 0.848883867263794, + "num_tokens": 73519179.0, + "step": 1923 + }, + { + "epoch": 0.24475257600814146, + "ewc_loss": 0.004544846713542938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.54484652436804e-05, + "grad_norm": 2.983365774154663, + "learning_rate": 8.151759220008477e-07, + "loss": 0.4294, + "mean_token_accuracy": 0.8559073805809021, + "num_tokens": 73561947.0, + "step": 1924 + }, + { + "epoch": 0.24487978628673196, + "ewc_loss": 0.004509058780968189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.509058635449037e-05, + "grad_norm": 3.0586166381835938, + "learning_rate": 8.155998304366256e-07, + "loss": 0.4442, + "mean_token_accuracy": 0.8564474582672119, + "num_tokens": 73595841.0, + "step": 1925 + }, + { + "epoch": 0.2450069965653225, + "ewc_loss": 0.0045598410069942474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.55984118161723e-05, + "grad_norm": 3.092376947402954, + "learning_rate": 8.160237388724035e-07, + "loss": 0.4701, + "mean_token_accuracy": 0.842795193195343, + "num_tokens": 73629716.0, + "step": 1926 + }, + { + "epoch": 0.245134206843913, + "ewc_loss": 0.004545783158391714, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5457833039108664e-05, + "grad_norm": 2.995704174041748, + "learning_rate": 8.164476473081814e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.8504292368888855, + "num_tokens": 73667285.0, + "step": 1927 + }, + { + "epoch": 0.2452614171225035, + "ewc_loss": 0.004503568168729544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.503568197833374e-05, + "grad_norm": 2.941922426223755, + "learning_rate": 8.168715557439593e-07, + "loss": 0.4028, + "mean_token_accuracy": 0.865437388420105, + "num_tokens": 73708059.0, + "step": 1928 + }, + { + "epoch": 0.24538862740109402, + "ewc_loss": 0.004504541400820017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.504541357164271e-05, + "grad_norm": 2.9506776332855225, + "learning_rate": 8.172954641797372e-07, + "loss": 0.4158, + "mean_token_accuracy": 0.8607168197631836, + "num_tokens": 73748553.0, + "step": 1929 + }, + { + "epoch": 0.24551583767968452, + "ewc_loss": 0.004519561305642128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.51956111646723e-05, + "grad_norm": 2.971179485321045, + "learning_rate": 8.17719372615515e-07, + "loss": 0.5141, + "mean_token_accuracy": 0.8313612937927246, + "num_tokens": 73796295.0, + "step": 1930 + }, + { + "epoch": 0.24564304795827502, + "ewc_loss": 0.004515409469604492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.515409455052577e-05, + "grad_norm": 3.0144236087799072, + "learning_rate": 8.18143281051293e-07, + "loss": 0.4917, + "mean_token_accuracy": 0.8403318524360657, + "num_tokens": 73835960.0, + "step": 1931 + }, + { + "epoch": 0.24577025823686555, + "ewc_loss": 0.004547533579170704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.547533535514958e-05, + "grad_norm": 3.063098192214966, + "learning_rate": 8.185671894870707e-07, + "loss": 0.508, + "mean_token_accuracy": 0.8331562280654907, + "num_tokens": 73875302.0, + "step": 1932 + }, + { + "epoch": 0.24589746851545605, + "ewc_loss": 0.004554660525172949, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 4.5546606997959316e-05, + "grad_norm": 3.0549399852752686, + "learning_rate": 8.189910979228485e-07, + "loss": 0.4011, + "mean_token_accuracy": 0.8649572134017944, + "num_tokens": 73907703.0, + "step": 1933 + }, + { + "epoch": 0.24602467879404655, + "ewc_loss": 0.00453984085470438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.539840665529482e-05, + "grad_norm": 3.049861192703247, + "learning_rate": 8.194150063586265e-07, + "loss": 0.496, + "mean_token_accuracy": 0.8398622274398804, + "num_tokens": 73945449.0, + "step": 1934 + }, + { + "epoch": 0.24615188907263708, + "ewc_loss": 0.004543420393019915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.54342043667566e-05, + "grad_norm": 3.048422336578369, + "learning_rate": 8.198389147944043e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8386869430541992, + "num_tokens": 73982461.0, + "step": 1935 + }, + { + "epoch": 0.24627909935122758, + "ewc_loss": 0.00453975610435009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5397562644211575e-05, + "grad_norm": 3.002840995788574, + "learning_rate": 8.202628232301823e-07, + "loss": 0.4554, + "mean_token_accuracy": 0.8534068465232849, + "num_tokens": 74018252.0, + "step": 1936 + }, + { + "epoch": 0.24640630962981808, + "ewc_loss": 0.004535509739071131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5355096517596394e-05, + "grad_norm": 2.9301164150238037, + "learning_rate": 8.206867316659601e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8411704301834106, + "num_tokens": 74065843.0, + "step": 1937 + }, + { + "epoch": 0.2465335199084086, + "ewc_loss": 0.004509340040385723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.509339851210825e-05, + "grad_norm": 2.930253028869629, + "learning_rate": 8.21110640101738e-07, + "loss": 0.3685, + "mean_token_accuracy": 0.8733335733413696, + "num_tokens": 74106389.0, + "step": 1938 + }, + { + "epoch": 0.2466607301869991, + "ewc_loss": 0.004533562809228897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
4.533562605502084e-05, + "grad_norm": 2.991746187210083, + "learning_rate": 8.215345485375159e-07, + "loss": 0.488, + "mean_token_accuracy": 0.840005099773407, + "num_tokens": 74150786.0, + "step": 1939 + }, + { + "epoch": 0.2467879404655896, + "ewc_loss": 0.004553422797471285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.553422695607878e-05, + "grad_norm": 3.0330207347869873, + "learning_rate": 8.219584569732937e-07, + "loss": 0.4582, + "mean_token_accuracy": 0.8452449440956116, + "num_tokens": 74190474.0, + "step": 1940 + }, + { + "epoch": 0.24691515074418013, + "ewc_loss": 0.004555812105536461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.555812120088376e-05, + "grad_norm": 3.0274970531463623, + "learning_rate": 8.223823654090715e-07, + "loss": 0.4088, + "mean_token_accuracy": 0.8637588024139404, + "num_tokens": 74226437.0, + "step": 1941 + }, + { + "epoch": 0.24704236102277063, + "ewc_loss": 0.004548135679215193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.548135621007532e-05, + "grad_norm": 3.009885549545288, + "learning_rate": 8.228062738448495e-07, + "loss": 0.4345, + "mean_token_accuracy": 0.8529659509658813, + "num_tokens": 74267292.0, + "step": 1942 + }, + { + "epoch": 0.24716957130136116, + "ewc_loss": 0.004535796120762825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5357959606917575e-05, + "grad_norm": 3.0051231384277344, + "learning_rate": 8.232301822806273e-07, + "loss": 0.4618, + "mean_token_accuracy": 0.8484575152397156, + "num_tokens": 74306738.0, + "step": 1943 + }, + { + "epoch": 0.24729678157995166, + "ewc_loss": 0.004527769051492214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5277691242517903e-05, + "grad_norm": 3.0983054637908936, + "learning_rate": 8.236540907164053e-07, + "loss": 0.4494, + "mean_token_accuracy": 0.8530910015106201, + "num_tokens": 74343151.0, + "step": 1944 + }, + { + "epoch": 0.24742399185854216, + "ewc_loss": 0.004585307091474533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.585307033266872e-05, 
+ "grad_norm": 3.014162302017212, + "learning_rate": 8.240779991521831e-07, + "loss": 0.4276, + "mean_token_accuracy": 0.8566784262657166, + "num_tokens": 74382755.0, + "step": 1945 + }, + { + "epoch": 0.2475512021371327, + "ewc_loss": 0.004515490494668484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.515490581979975e-05, + "grad_norm": 2.990908145904541, + "learning_rate": 8.24501907587961e-07, + "loss": 0.4626, + "mean_token_accuracy": 0.8469697833061218, + "num_tokens": 74426129.0, + "step": 1946 + }, + { + "epoch": 0.2476784124157232, + "ewc_loss": 0.0045254770666360855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.525476833805442e-05, + "grad_norm": 3.084700584411621, + "learning_rate": 8.249258160237388e-07, + "loss": 0.4727, + "mean_token_accuracy": 0.8495079278945923, + "num_tokens": 74460904.0, + "step": 1947 + }, + { + "epoch": 0.2478056226943137, + "ewc_loss": 0.004578349646180868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.57834976259619e-05, + "grad_norm": 3.0640928745269775, + "learning_rate": 8.253497244595167e-07, + "loss": 0.462, + "mean_token_accuracy": 0.8476190567016602, + "num_tokens": 74497993.0, + "step": 1948 + }, + { + "epoch": 0.24793283297290422, + "ewc_loss": 0.004554070997983217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.554070983431302e-05, + "grad_norm": 3.0002591609954834, + "learning_rate": 8.257736328952945e-07, + "loss": 0.4831, + "mean_token_accuracy": 0.8422501087188721, + "num_tokens": 74541108.0, + "step": 1949 + }, + { + "epoch": 0.24806004325149472, + "ewc_loss": 0.004535992629826069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.535992775345221e-05, + "grad_norm": 3.016263484954834, + "learning_rate": 8.261975413310725e-07, + "loss": 0.4521, + "mean_token_accuracy": 0.8516656160354614, + "num_tokens": 74579389.0, + "step": 1950 + }, + { + "epoch": 0.24818725353008522, + "ewc_loss": 0.004557340871542692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.557340798783116e-05, + "grad_norm": 
2.9777743816375732, + "learning_rate": 8.266214497668503e-07, + "loss": 0.4067, + "mean_token_accuracy": 0.865898072719574, + "num_tokens": 74621546.0, + "step": 1951 + }, + { + "epoch": 0.24831446380867575, + "ewc_loss": 0.004537421278655529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.537421409622766e-05, + "grad_norm": 3.0485525131225586, + "learning_rate": 8.270453582026283e-07, + "loss": 0.4847, + "mean_token_accuracy": 0.8382793664932251, + "num_tokens": 74660178.0, + "step": 1952 + }, + { + "epoch": 0.24844167408726625, + "ewc_loss": 0.004581517539918423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5815177145414054e-05, + "grad_norm": 3.0703039169311523, + "learning_rate": 8.274692666384061e-07, + "loss": 0.4316, + "mean_token_accuracy": 0.8568589687347412, + "num_tokens": 74693867.0, + "step": 1953 + }, + { + "epoch": 0.24856888436585675, + "ewc_loss": 0.004582024645060301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.582024848787114e-05, + "grad_norm": 3.067573070526123, + "learning_rate": 8.27893175074184e-07, + "loss": 0.4455, + "mean_token_accuracy": 0.8504310846328735, + "num_tokens": 74727561.0, + "step": 1954 + }, + { + "epoch": 0.24869609464444728, + "ewc_loss": 0.00458163907751441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5816388592356816e-05, + "grad_norm": 3.0773661136627197, + "learning_rate": 8.283170835099618e-07, + "loss": 0.4488, + "mean_token_accuracy": 0.8523881435394287, + "num_tokens": 74763719.0, + "step": 1955 + }, + { + "epoch": 0.24882330492303778, + "ewc_loss": 0.0045917038805782795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5917036914033815e-05, + "grad_norm": 2.966240167617798, + "learning_rate": 8.287409919457396e-07, + "loss": 0.4114, + "mean_token_accuracy": 0.8615858554840088, + "num_tokens": 74802666.0, + "step": 1956 + }, + { + "epoch": 0.24895051520162828, + "ewc_loss": 0.004542745184153318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.542745227809064e-05, + "grad_norm": 3.0518128871917725, 
+ "learning_rate": 8.291649003815175e-07, + "loss": 0.4634, + "mean_token_accuracy": 0.848051130771637, + "num_tokens": 74840307.0, + "step": 1957 + }, + { + "epoch": 0.2490777254802188, + "ewc_loss": 0.004613980650901794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.613980490830727e-05, + "grad_norm": 3.014496326446533, + "learning_rate": 8.295888088172954e-07, + "loss": 0.4752, + "mean_token_accuracy": 0.8460327386856079, + "num_tokens": 74880181.0, + "step": 1958 + }, + { + "epoch": 0.2492049357588093, + "ewc_loss": 0.0045818486250936985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.58184840681497e-05, + "grad_norm": 3.0221738815307617, + "learning_rate": 8.300127172530733e-07, + "loss": 0.3827, + "mean_token_accuracy": 0.8704062700271606, + "num_tokens": 74917148.0, + "step": 1959 + }, + { + "epoch": 0.2493321460373998, + "ewc_loss": 0.004594980273395777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.594980055117048e-05, + "grad_norm": 3.0525991916656494, + "learning_rate": 8.304366256888512e-07, + "loss": 0.4203, + "mean_token_accuracy": 0.8593621850013733, + "num_tokens": 74953124.0, + "step": 1960 + }, + { + "epoch": 0.24945935631599034, + "ewc_loss": 0.004603900481015444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.603900379152037e-05, + "grad_norm": 3.0262317657470703, + "learning_rate": 8.308605341246291e-07, + "loss": 0.4324, + "mean_token_accuracy": 0.855492115020752, + "num_tokens": 74989726.0, + "step": 1961 + }, + { + "epoch": 0.24958656659458084, + "ewc_loss": 0.004592969082295895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.592968980432488e-05, + "grad_norm": 3.0407423973083496, + "learning_rate": 8.312844425604068e-07, + "loss": 0.4348, + "mean_token_accuracy": 0.8617980480194092, + "num_tokens": 75024849.0, + "step": 1962 + }, + { + "epoch": 0.24971377687317134, + "ewc_loss": 0.004604938440024853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.604938658303581e-05, + "grad_norm": 2.9913763999938965, + "learning_rate": 
8.317083509961848e-07, + "loss": 0.4604, + "mean_token_accuracy": 0.8486639261245728, + "num_tokens": 75066464.0, + "step": 1963 + }, + { + "epoch": 0.24984098715176187, + "ewc_loss": 0.004582024645060301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.582024484989233e-05, + "grad_norm": 2.9947593212127686, + "learning_rate": 8.321322594319626e-07, + "loss": 0.413, + "mean_token_accuracy": 0.8609355688095093, + "num_tokens": 75106829.0, + "step": 1964 + }, + { + "epoch": 0.24996819743035237, + "ewc_loss": 0.0045903529971838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5903529098723084e-05, + "grad_norm": 3.0208678245544434, + "learning_rate": 8.325561678677405e-07, + "loss": 0.468, + "mean_token_accuracy": 0.8473638892173767, + "num_tokens": 75148227.0, + "step": 1965 + }, + { + "epoch": 0.2500954077089429, + "ewc_loss": 0.004605711903423071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.605711728800088e-05, + "grad_norm": 3.022249937057495, + "learning_rate": 8.329800763035184e-07, + "loss": 0.4799, + "mean_token_accuracy": 0.8402934670448303, + "num_tokens": 75188719.0, + "step": 1966 + }, + { + "epoch": 0.25022261798753337, + "ewc_loss": 0.0045972708612680435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.597270890371874e-05, + "grad_norm": 3.0995869636535645, + "learning_rate": 8.334039847392963e-07, + "loss": 0.4306, + "mean_token_accuracy": 0.8553210496902466, + "num_tokens": 75222260.0, + "step": 1967 + }, + { + "epoch": 0.2503498282661239, + "ewc_loss": 0.004639317747205496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.639317558030598e-05, + "grad_norm": 3.073042392730713, + "learning_rate": 8.338278931750742e-07, + "loss": 0.4078, + "mean_token_accuracy": 0.8643484115600586, + "num_tokens": 75255489.0, + "step": 1968 + }, + { + "epoch": 0.2504770385447144, + "ewc_loss": 0.004615381825715303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.6153818402672186e-05, + "grad_norm": 3.0424082279205322, + "learning_rate": 8.342518016108521e-07, + 
"loss": 0.4308, + "mean_token_accuracy": 0.8544955849647522, + "num_tokens": 75290548.0, + "step": 1969 + }, + { + "epoch": 0.2506042488233049, + "ewc_loss": 0.0046079098246991634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.607909795595333e-05, + "grad_norm": 3.120908260345459, + "learning_rate": 8.346757100466298e-07, + "loss": 0.4337, + "mean_token_accuracy": 0.8566597104072571, + "num_tokens": 75323912.0, + "step": 1970 + }, + { + "epoch": 0.2507314591018954, + "ewc_loss": 0.004655860364437103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.655860175262205e-05, + "grad_norm": 3.0741419792175293, + "learning_rate": 8.350996184824078e-07, + "loss": 0.4727, + "mean_token_accuracy": 0.8410658836364746, + "num_tokens": 75361206.0, + "step": 1971 + }, + { + "epoch": 0.25085866938048595, + "ewc_loss": 0.0046167029067873955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.6167027903720737e-05, + "grad_norm": 3.069033145904541, + "learning_rate": 8.355235269181856e-07, + "loss": 0.4532, + "mean_token_accuracy": 0.8496741056442261, + "num_tokens": 75397292.0, + "step": 1972 + }, + { + "epoch": 0.2509858796590764, + "ewc_loss": 0.004629893694072962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.6298937377287075e-05, + "grad_norm": 3.027247428894043, + "learning_rate": 8.359474353539635e-07, + "loss": 0.4162, + "mean_token_accuracy": 0.862551748752594, + "num_tokens": 75437616.0, + "step": 1973 + }, + { + "epoch": 0.25111308993766696, + "ewc_loss": 0.004616234917193651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.6162349462974817e-05, + "grad_norm": 2.986201286315918, + "learning_rate": 8.363713437897414e-07, + "loss": 0.5031, + "mean_token_accuracy": 0.8370251655578613, + "num_tokens": 75486172.0, + "step": 1974 + }, + { + "epoch": 0.2512403002162575, + "ewc_loss": 0.004604898393154144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.6048982767388225e-05, + "grad_norm": 3.02143931388855, + "learning_rate": 8.367952522255193e-07, + "loss": 0.439, + 
"mean_token_accuracy": 0.8513941168785095, + "num_tokens": 75526637.0, + "step": 1975 + }, + { + "epoch": 0.25136751049484796, + "ewc_loss": 0.004639604594558477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.6396045945584774e-05, + "grad_norm": 3.0793325901031494, + "learning_rate": 8.372191606612972e-07, + "loss": 0.4413, + "mean_token_accuracy": 0.8559368848800659, + "num_tokens": 75564632.0, + "step": 1976 + }, + { + "epoch": 0.2514947207734385, + "ewc_loss": 0.0046525634825229645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.652563438867219e-05, + "grad_norm": 3.08530592918396, + "learning_rate": 8.376430690970749e-07, + "loss": 0.4091, + "mean_token_accuracy": 0.8654592037200928, + "num_tokens": 75595553.0, + "step": 1977 + }, + { + "epoch": 0.251621931052029, + "ewc_loss": 0.004639413673430681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.639413600671105e-05, + "grad_norm": 3.0032660961151123, + "learning_rate": 8.380669775328528e-07, + "loss": 0.4035, + "mean_token_accuracy": 0.8661726713180542, + "num_tokens": 75635847.0, + "step": 1978 + }, + { + "epoch": 0.25174914133061954, + "ewc_loss": 0.004606310278177261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.606310176313855e-05, + "grad_norm": 3.0020902156829834, + "learning_rate": 8.384908859686307e-07, + "loss": 0.402, + "mean_token_accuracy": 0.8669146299362183, + "num_tokens": 75677403.0, + "step": 1979 + }, + { + "epoch": 0.25187635160921, + "ewc_loss": 0.004620215855538845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.6202156227082014e-05, + "grad_norm": 2.992706298828125, + "learning_rate": 8.389147944044086e-07, + "loss": 0.4417, + "mean_token_accuracy": 0.8526202440261841, + "num_tokens": 75720703.0, + "step": 1980 + }, + { + "epoch": 0.25200356188780054, + "ewc_loss": 0.004614000208675861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.614000135916285e-05, + "grad_norm": 3.436403512954712, + "learning_rate": 8.393387028401864e-07, + "loss": 0.4675, + "mean_token_accuracy": 
0.8468507528305054, + "num_tokens": 75758000.0, + "step": 1981 + }, + { + "epoch": 0.25213077216639107, + "ewc_loss": 0.0048211440443992615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8211441026069224e-05, + "grad_norm": 3.1041419506073, + "learning_rate": 8.397626112759644e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.8389058113098145, + "num_tokens": 75791113.0, + "step": 1982 + }, + { + "epoch": 0.25225798244498154, + "ewc_loss": 0.004581352695822716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5813525503035635e-05, + "grad_norm": 3.122068405151367, + "learning_rate": 8.401865197117422e-07, + "loss": 0.4382, + "mean_token_accuracy": 0.855758011341095, + "num_tokens": 75821733.0, + "step": 1983 + }, + { + "epoch": 0.25238519272357207, + "ewc_loss": 0.004653264302760363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.6532644773833454e-05, + "grad_norm": 3.0127792358398438, + "learning_rate": 8.406104281475202e-07, + "loss": 0.4179, + "mean_token_accuracy": 0.8596774339675903, + "num_tokens": 75861404.0, + "step": 1984 + }, + { + "epoch": 0.2525124030021626, + "ewc_loss": 0.004615528043359518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.615528087015264e-05, + "grad_norm": 2.9831912517547607, + "learning_rate": 8.410343365832979e-07, + "loss": 0.4199, + "mean_token_accuracy": 0.859196662902832, + "num_tokens": 75903059.0, + "step": 1985 + }, + { + "epoch": 0.2526396132807531, + "ewc_loss": 0.0046193040907382965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.619304309017025e-05, + "grad_norm": 2.9712071418762207, + "learning_rate": 8.414582450190758e-07, + "loss": 0.431, + "mean_token_accuracy": 0.8574212789535522, + "num_tokens": 75947125.0, + "step": 1986 + }, + { + "epoch": 0.2527668235593436, + "ewc_loss": 0.004623305518180132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.623305721906945e-05, + "grad_norm": 3.1174027919769287, + "learning_rate": 8.418821534548537e-07, + "loss": 0.5144, + "mean_token_accuracy": 0.8326593637466431, + 
"num_tokens": 75987519.0, + "step": 1987 + }, + { + "epoch": 0.25289403383793413, + "ewc_loss": 0.004703324753791094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.703324884758331e-05, + "grad_norm": 3.0693156719207764, + "learning_rate": 8.423060618906316e-07, + "loss": 0.4394, + "mean_token_accuracy": 0.8546327948570251, + "num_tokens": 76027728.0, + "step": 1988 + }, + { + "epoch": 0.2530212441165246, + "ewc_loss": 0.004648299887776375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.648299727705307e-05, + "grad_norm": 2.9927289485931396, + "learning_rate": 8.427299703264095e-07, + "loss": 0.4399, + "mean_token_accuracy": 0.8565789461135864, + "num_tokens": 76070942.0, + "step": 1989 + }, + { + "epoch": 0.25314845439511513, + "ewc_loss": 0.004640716128051281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.640715997084044e-05, + "grad_norm": 3.0551323890686035, + "learning_rate": 8.431538787621874e-07, + "loss": 0.4638, + "mean_token_accuracy": 0.8480456471443176, + "num_tokens": 76113330.0, + "step": 1990 + }, + { + "epoch": 0.25327566467370566, + "ewc_loss": 0.004676200449466705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.676200478570536e-05, + "grad_norm": 3.0537893772125244, + "learning_rate": 8.435777871979652e-07, + "loss": 0.4192, + "mean_token_accuracy": 0.8613781332969666, + "num_tokens": 76153511.0, + "step": 1991 + }, + { + "epoch": 0.25340287495229613, + "ewc_loss": 0.004660151898860931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.6601518988609314e-05, + "grad_norm": 3.066451072692871, + "learning_rate": 8.440016956337432e-07, + "loss": 0.4443, + "mean_token_accuracy": 0.8539905548095703, + "num_tokens": 76191505.0, + "step": 1992 + }, + { + "epoch": 0.25353008523088666, + "ewc_loss": 0.0046518053859472275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.651805284083821e-05, + "grad_norm": 3.0770585536956787, + "learning_rate": 8.444256040695209e-07, + "loss": 0.4602, + "mean_token_accuracy": 0.8508251905441284, + "num_tokens": 
76232479.0, + "step": 1993 + }, + { + "epoch": 0.2536572955094772, + "ewc_loss": 0.004670294933021069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.670294947572984e-05, + "grad_norm": 3.2818660736083984, + "learning_rate": 8.448495125052988e-07, + "loss": 0.4531, + "mean_token_accuracy": 0.8515911102294922, + "num_tokens": 76270192.0, + "step": 1994 + }, + { + "epoch": 0.25378450578806766, + "ewc_loss": 0.0047500901855528355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.750090010929853e-05, + "grad_norm": 3.0861551761627197, + "learning_rate": 8.452734209410767e-07, + "loss": 0.3933, + "mean_token_accuracy": 0.868547260761261, + "num_tokens": 76303981.0, + "step": 1995 + }, + { + "epoch": 0.2539117160666582, + "ewc_loss": 0.004628659691661596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.628659735317342e-05, + "grad_norm": 3.038127899169922, + "learning_rate": 8.456973293768545e-07, + "loss": 0.417, + "mean_token_accuracy": 0.8603746294975281, + "num_tokens": 76338496.0, + "step": 1996 + }, + { + "epoch": 0.2540389263452487, + "ewc_loss": 0.004634351935237646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.634351716958918e-05, + "grad_norm": 3.0188329219818115, + "learning_rate": 8.461212378126325e-07, + "loss": 0.4706, + "mean_token_accuracy": 0.8449209928512573, + "num_tokens": 76377904.0, + "step": 1997 + }, + { + "epoch": 0.2541661366238392, + "ewc_loss": 0.004663241095840931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.663240906666033e-05, + "grad_norm": 3.085143804550171, + "learning_rate": 8.465451462484103e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.8399407863616943, + "num_tokens": 76413905.0, + "step": 1998 + }, + { + "epoch": 0.2542933469024297, + "ewc_loss": 0.004691240377724171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.691240610554814e-05, + "grad_norm": 3.078423500061035, + "learning_rate": 8.469690546841882e-07, + "loss": 0.4432, + "mean_token_accuracy": 0.8536561727523804, + "num_tokens": 76451591.0, + "step": 1999 + 
}, + { + "epoch": 0.25442055718102025, + "ewc_loss": 0.004676149692386389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.6761495468672365e-05, + "grad_norm": 3.0976359844207764, + "learning_rate": 8.47392963119966e-07, + "loss": 0.423, + "mean_token_accuracy": 0.8598698377609253, + "num_tokens": 76486191.0, + "step": 2000 + }, + { + "epoch": 0.2545477674596107, + "ewc_loss": 0.004696748219430447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.6967481466708705e-05, + "grad_norm": 3.0895910263061523, + "learning_rate": 8.478168715557439e-07, + "loss": 0.4432, + "mean_token_accuracy": 0.8560335636138916, + "num_tokens": 76523317.0, + "step": 2001 + }, + { + "epoch": 0.25467497773820125, + "ewc_loss": 0.004693263676017523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.693263690569438e-05, + "grad_norm": 3.028690814971924, + "learning_rate": 8.482407799915217e-07, + "loss": 0.45, + "mean_token_accuracy": 0.8510209918022156, + "num_tokens": 76562661.0, + "step": 2002 + }, + { + "epoch": 0.2548021880167918, + "ewc_loss": 0.0046773492358624935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.6773493522778153e-05, + "grad_norm": 3.0431137084960938, + "learning_rate": 8.486646884272997e-07, + "loss": 0.4445, + "mean_token_accuracy": 0.8514100313186646, + "num_tokens": 76600914.0, + "step": 2003 + }, + { + "epoch": 0.25492939829538225, + "ewc_loss": 0.004704915918409824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.704915772890672e-05, + "grad_norm": 3.1549453735351562, + "learning_rate": 8.490885968630775e-07, + "loss": 0.4653, + "mean_token_accuracy": 0.8446307182312012, + "num_tokens": 76636310.0, + "step": 2004 + }, + { + "epoch": 0.2550566085739728, + "ewc_loss": 0.004760687239468098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.760687443194911e-05, + "grad_norm": 3.0125787258148193, + "learning_rate": 8.495125052988555e-07, + "loss": 0.4513, + "mean_token_accuracy": 0.8542855978012085, + "num_tokens": 76680889.0, + "step": 2005 + }, + { + "epoch": 
0.2551838188525633, + "ewc_loss": 0.004670527298003435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.670527414418757e-05, + "grad_norm": 3.0294575691223145, + "learning_rate": 8.499364137346333e-07, + "loss": 0.4415, + "mean_token_accuracy": 0.8574588894844055, + "num_tokens": 76722469.0, + "step": 2006 + }, + { + "epoch": 0.2553110291311538, + "ewc_loss": 0.004719285294413567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.719285061582923e-05, + "grad_norm": 3.107468605041504, + "learning_rate": 8.503603221704112e-07, + "loss": 0.4755, + "mean_token_accuracy": 0.8437293767929077, + "num_tokens": 76757668.0, + "step": 2007 + }, + { + "epoch": 0.2554382394097443, + "ewc_loss": 0.004754418972879648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.754418841912411e-05, + "grad_norm": 3.0427212715148926, + "learning_rate": 8.50784230606189e-07, + "loss": 0.4385, + "mean_token_accuracy": 0.8557454347610474, + "num_tokens": 76795607.0, + "step": 2008 + }, + { + "epoch": 0.25556544968833483, + "ewc_loss": 0.004710816778242588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7108169383136556e-05, + "grad_norm": 3.0429327487945557, + "learning_rate": 8.512081390419669e-07, + "loss": 0.4657, + "mean_token_accuracy": 0.8546940684318542, + "num_tokens": 76837161.0, + "step": 2009 + }, + { + "epoch": 0.2556926599669253, + "ewc_loss": 0.00472576729953289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.725767212221399e-05, + "grad_norm": 3.0920565128326416, + "learning_rate": 8.516320474777447e-07, + "loss": 0.4467, + "mean_token_accuracy": 0.8539543151855469, + "num_tokens": 76872198.0, + "step": 2010 + }, + { + "epoch": 0.25581987024551583, + "ewc_loss": 0.0047400351613759995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.740035001304932e-05, + "grad_norm": 3.0511467456817627, + "learning_rate": 8.520559559135227e-07, + "loss": 0.4116, + "mean_token_accuracy": 0.8660683035850525, + "num_tokens": 76908976.0, + "step": 2011 + }, + { + "epoch": 0.25594708052410636, + 
"ewc_loss": 0.0047219255939126015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.72192550660111e-05, + "grad_norm": 3.052283525466919, + "learning_rate": 8.524798643493005e-07, + "loss": 0.4619, + "mean_token_accuracy": 0.8510104417800903, + "num_tokens": 76947298.0, + "step": 2012 + }, + { + "epoch": 0.25607429080269684, + "ewc_loss": 0.004728233441710472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7282333980547264e-05, + "grad_norm": 2.989647388458252, + "learning_rate": 8.529037727850785e-07, + "loss": 0.4182, + "mean_token_accuracy": 0.8601190447807312, + "num_tokens": 76985112.0, + "step": 2013 + }, + { + "epoch": 0.25620150108128736, + "ewc_loss": 0.004711938090622425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.711938163382001e-05, + "grad_norm": 3.0941498279571533, + "learning_rate": 8.533276812208563e-07, + "loss": 0.3914, + "mean_token_accuracy": 0.8693497180938721, + "num_tokens": 77020309.0, + "step": 2014 + }, + { + "epoch": 0.2563287113598779, + "ewc_loss": 0.004777589347213507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.777589492732659e-05, + "grad_norm": 3.0512731075286865, + "learning_rate": 8.53751589656634e-07, + "loss": 0.4296, + "mean_token_accuracy": 0.854705810546875, + "num_tokens": 77063245.0, + "step": 2015 + }, + { + "epoch": 0.25645592163846836, + "ewc_loss": 0.0047301617451012135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.730161526822485e-05, + "grad_norm": 3.1689600944519043, + "learning_rate": 8.54175498092412e-07, + "loss": 0.4276, + "mean_token_accuracy": 0.8589402437210083, + "num_tokens": 77096791.0, + "step": 2016 + }, + { + "epoch": 0.2565831319170589, + "ewc_loss": 0.004791875835508108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.791875835508108e-05, + "grad_norm": 3.1340038776397705, + "learning_rate": 8.545994065281898e-07, + "loss": 0.4083, + "mean_token_accuracy": 0.8645654916763306, + "num_tokens": 77129281.0, + "step": 2017 + }, + { + "epoch": 0.2567103421956494, + "ewc_loss": 
0.004757177550345659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.757177521241829e-05, + "grad_norm": 3.1152467727661133, + "learning_rate": 8.550233149639677e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8360375165939331, + "num_tokens": 77168785.0, + "step": 2018 + }, + { + "epoch": 0.2568375524742399, + "ewc_loss": 0.0047531696036458015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.753169559990056e-05, + "grad_norm": 3.064687728881836, + "learning_rate": 8.554472233997456e-07, + "loss": 0.463, + "mean_token_accuracy": 0.8447012305259705, + "num_tokens": 77208360.0, + "step": 2019 + }, + { + "epoch": 0.2569647627528304, + "ewc_loss": 0.004737695213407278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7376950533362105e-05, + "grad_norm": 3.1638102531433105, + "learning_rate": 8.558711318355235e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.835605800151825, + "num_tokens": 77245539.0, + "step": 2020 + }, + { + "epoch": 0.25709197303142095, + "ewc_loss": 0.004793244414031506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.793244443135336e-05, + "grad_norm": 3.1023826599121094, + "learning_rate": 8.562950402713014e-07, + "loss": 0.3974, + "mean_token_accuracy": 0.871871292591095, + "num_tokens": 77279118.0, + "step": 2021 + }, + { + "epoch": 0.2572191833100114, + "ewc_loss": 0.0047455960884690285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.745596015709452e-05, + "grad_norm": 3.0906662940979004, + "learning_rate": 8.567189487070793e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.8435905575752258, + "num_tokens": 77316259.0, + "step": 2022 + }, + { + "epoch": 0.25734639358860195, + "ewc_loss": 0.004761821590363979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7618214011890814e-05, + "grad_norm": 3.068634033203125, + "learning_rate": 8.57142857142857e-07, + "loss": 0.4999, + "mean_token_accuracy": 0.8372783660888672, + "num_tokens": 77359712.0, + "step": 2023 + }, + { + "epoch": 0.2574736038671925, + "ewc_loss": 0.004757708869874477, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.757709029945545e-05, + "grad_norm": 3.015908718109131, + "learning_rate": 8.57566765578635e-07, + "loss": 0.4065, + "mean_token_accuracy": 0.8678199052810669, + "num_tokens": 77402269.0, + "step": 2024 + }, + { + "epoch": 0.25760081414578295, + "ewc_loss": 0.004738345742225647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7383455239469185e-05, + "grad_norm": 3.0289571285247803, + "learning_rate": 8.579906740144128e-07, + "loss": 0.435, + "mean_token_accuracy": 0.8576188087463379, + "num_tokens": 77445791.0, + "step": 2025 + }, + { + "epoch": 0.2577280244243735, + "ewc_loss": 0.0047548068687319756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.754806650453247e-05, + "grad_norm": 3.0955135822296143, + "learning_rate": 8.584145824501907e-07, + "loss": 0.4687, + "mean_token_accuracy": 0.8430712223052979, + "num_tokens": 77485376.0, + "step": 2026 + }, + { + "epoch": 0.257855234702964, + "ewc_loss": 0.004792388528585434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7923884267220274e-05, + "grad_norm": 3.046795129776001, + "learning_rate": 8.588384908859686e-07, + "loss": 0.4643, + "mean_token_accuracy": 0.8505279421806335, + "num_tokens": 77524344.0, + "step": 2027 + }, + { + "epoch": 0.25798244498155454, + "ewc_loss": 0.004745997488498688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7459972847718745e-05, + "grad_norm": 3.0999176502227783, + "learning_rate": 8.592623993217465e-07, + "loss": 0.4571, + "mean_token_accuracy": 0.8515897989273071, + "num_tokens": 77562746.0, + "step": 2028 + }, + { + "epoch": 0.258109655260145, + "ewc_loss": 0.004788156133145094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.788156002177857e-05, + "grad_norm": 3.101336717605591, + "learning_rate": 8.596863077575244e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.8469577431678772, + "num_tokens": 77601261.0, + "step": 2029 + }, + { + "epoch": 0.25823686553873554, + "ewc_loss": 0.004782821983098984, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 4.782821997650899e-05, + "grad_norm": 3.1282236576080322, + "learning_rate": 8.601102161933023e-07, + "loss": 0.4598, + "mean_token_accuracy": 0.8488421440124512, + "num_tokens": 77640054.0, + "step": 2030 + }, + { + "epoch": 0.25836407581732607, + "ewc_loss": 0.0047790855169296265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.779085429618135e-05, + "grad_norm": 3.022737741470337, + "learning_rate": 8.6053412462908e-07, + "loss": 0.4336, + "mean_token_accuracy": 0.8571273684501648, + "num_tokens": 77680098.0, + "step": 2031 + }, + { + "epoch": 0.25849128609591654, + "ewc_loss": 0.004737738985568285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7377390728797764e-05, + "grad_norm": 3.1942756175994873, + "learning_rate": 8.60958033064858e-07, + "loss": 0.4296, + "mean_token_accuracy": 0.8601675033569336, + "num_tokens": 77712517.0, + "step": 2032 + }, + { + "epoch": 0.25861849637450707, + "ewc_loss": 0.0048580169677734375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8580168368062004e-05, + "grad_norm": 3.1374733448028564, + "learning_rate": 8.613819415006358e-07, + "loss": 0.4914, + "mean_token_accuracy": 0.8463304042816162, + "num_tokens": 77751335.0, + "step": 2033 + }, + { + "epoch": 0.2587457066530976, + "ewc_loss": 0.004780838266015053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.780838207807392e-05, + "grad_norm": 3.0756287574768066, + "learning_rate": 8.618058499364137e-07, + "loss": 0.4795, + "mean_token_accuracy": 0.8429586887359619, + "num_tokens": 77792300.0, + "step": 2034 + }, + { + "epoch": 0.25887291693168807, + "ewc_loss": 0.004761873744428158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7618737880839035e-05, + "grad_norm": 3.038578987121582, + "learning_rate": 8.622297583721916e-07, + "loss": 0.4379, + "mean_token_accuracy": 0.858491063117981, + "num_tokens": 77836391.0, + "step": 2035 + }, + { + "epoch": 0.2590001272102786, + "ewc_loss": 0.004768040031194687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
4.768040162161924e-05, + "grad_norm": 3.101573944091797, + "learning_rate": 8.626536668079695e-07, + "loss": 0.4486, + "mean_token_accuracy": 0.8501049876213074, + "num_tokens": 77871600.0, + "step": 2036 + }, + { + "epoch": 0.2591273374888691, + "ewc_loss": 0.004799954127520323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.799953967449255e-05, + "grad_norm": 3.1588094234466553, + "learning_rate": 8.630775752437474e-07, + "loss": 0.4344, + "mean_token_accuracy": 0.8563851118087769, + "num_tokens": 77904623.0, + "step": 2037 + }, + { + "epoch": 0.2592545477674596, + "ewc_loss": 0.00481718685477972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.817186709260568e-05, + "grad_norm": 3.127854108810425, + "learning_rate": 8.635014836795251e-07, + "loss": 0.4732, + "mean_token_accuracy": 0.846190333366394, + "num_tokens": 77941145.0, + "step": 2038 + }, + { + "epoch": 0.2593817580460501, + "ewc_loss": 0.0047903526574373245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.790352613781579e-05, + "grad_norm": 3.0928735733032227, + "learning_rate": 8.63925392115303e-07, + "loss": 0.4116, + "mean_token_accuracy": 0.8606756925582886, + "num_tokens": 77975087.0, + "step": 2039 + }, + { + "epoch": 0.25950896832464065, + "ewc_loss": 0.004794619046151638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.794618871528655e-05, + "grad_norm": 3.115778923034668, + "learning_rate": 8.643493005510809e-07, + "loss": 0.4189, + "mean_token_accuracy": 0.8610336780548096, + "num_tokens": 78009526.0, + "step": 2040 + }, + { + "epoch": 0.2596361786032311, + "ewc_loss": 0.004820534959435463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.820535104954615e-05, + "grad_norm": 3.041454553604126, + "learning_rate": 8.647732089868588e-07, + "loss": 0.4961, + "mean_token_accuracy": 0.8368593454360962, + "num_tokens": 78050918.0, + "step": 2041 + }, + { + "epoch": 0.25976338888182166, + "ewc_loss": 0.004783045966178179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.783046097145416e-05, + 
"grad_norm": 3.104189872741699, + "learning_rate": 8.651971174226366e-07, + "loss": 0.4144, + "mean_token_accuracy": 0.8625291585922241, + "num_tokens": 78085547.0, + "step": 2042 + }, + { + "epoch": 0.2598905991604122, + "ewc_loss": 0.004845126532018185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8451263864990324e-05, + "grad_norm": 3.1488027572631836, + "learning_rate": 8.656210258584146e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8410826921463013, + "num_tokens": 78122916.0, + "step": 2043 + }, + { + "epoch": 0.26001780943900266, + "ewc_loss": 0.0048513286747038364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.851328776567243e-05, + "grad_norm": 3.1232755184173584, + "learning_rate": 8.660449342941924e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.8421483039855957, + "num_tokens": 78161800.0, + "step": 2044 + }, + { + "epoch": 0.2601450197175932, + "ewc_loss": 0.004840414505451918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.840414476348087e-05, + "grad_norm": 3.0771467685699463, + "learning_rate": 8.664688427299704e-07, + "loss": 0.4541, + "mean_token_accuracy": 0.8506901264190674, + "num_tokens": 78199961.0, + "step": 2045 + }, + { + "epoch": 0.2602722299961837, + "ewc_loss": 0.004827125463634729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8271256673615426e-05, + "grad_norm": 3.122708320617676, + "learning_rate": 8.668927511657481e-07, + "loss": 0.4132, + "mean_token_accuracy": 0.8604961037635803, + "num_tokens": 78231332.0, + "step": 2046 + }, + { + "epoch": 0.2603994402747742, + "ewc_loss": 0.004862689413130283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.862689456786029e-05, + "grad_norm": 3.143232822418213, + "learning_rate": 8.67316659601526e-07, + "loss": 0.4681, + "mean_token_accuracy": 0.8446755409240723, + "num_tokens": 78265054.0, + "step": 2047 + }, + { + "epoch": 0.2605266505533647, + "ewc_loss": 0.0048750899732112885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.875089871347882e-05, + "grad_norm": 
3.0637998580932617, + "learning_rate": 8.677405680373039e-07, + "loss": 0.4249, + "mean_token_accuracy": 0.8615870475769043, + "num_tokens": 78305445.0, + "step": 2048 + }, + { + "epoch": 0.26065386083195524, + "ewc_loss": 0.004831015598028898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.831015758099966e-05, + "grad_norm": 3.187354803085327, + "learning_rate": 8.681644764730818e-07, + "loss": 0.4525, + "mean_token_accuracy": 0.8504990935325623, + "num_tokens": 78339174.0, + "step": 2049 + }, + { + "epoch": 0.2607810711105457, + "ewc_loss": 0.0049139573238790035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.913957309327088e-05, + "grad_norm": 3.035994529724121, + "learning_rate": 8.685883849088596e-07, + "loss": 0.3965, + "mean_token_accuracy": 0.8683620691299438, + "num_tokens": 78377630.0, + "step": 2050 + }, + { + "epoch": 0.26090828138913624, + "ewc_loss": 0.004825417418032885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.825417636311613e-05, + "grad_norm": 3.076193332672119, + "learning_rate": 8.690122933446376e-07, + "loss": 0.4547, + "mean_token_accuracy": 0.8525651693344116, + "num_tokens": 78419908.0, + "step": 2051 + }, + { + "epoch": 0.26103549166772677, + "ewc_loss": 0.004873097874224186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.873098077950999e-05, + "grad_norm": 3.032443046569824, + "learning_rate": 8.694362017804154e-07, + "loss": 0.4218, + "mean_token_accuracy": 0.8622917532920837, + "num_tokens": 78459561.0, + "step": 2052 + }, + { + "epoch": 0.26116270194631724, + "ewc_loss": 0.004840821027755737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.840820838580839e-05, + "grad_norm": 3.0665667057037354, + "learning_rate": 8.698601102161933e-07, + "loss": 0.4512, + "mean_token_accuracy": 0.8501104116439819, + "num_tokens": 78502500.0, + "step": 2053 + }, + { + "epoch": 0.26128991222490777, + "ewc_loss": 0.004866956267505884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.866956442128867e-05, + "grad_norm": 3.04398512840271, + 
"learning_rate": 8.702840186519711e-07, + "loss": 0.4108, + "mean_token_accuracy": 0.8639530539512634, + "num_tokens": 78545411.0, + "step": 2054 + }, + { + "epoch": 0.2614171225034983, + "ewc_loss": 0.004854024387896061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.854024518863298e-05, + "grad_norm": 3.0834732055664062, + "learning_rate": 8.70707927087749e-07, + "loss": 0.41, + "mean_token_accuracy": 0.864639937877655, + "num_tokens": 78585878.0, + "step": 2055 + }, + { + "epoch": 0.2615443327820888, + "ewc_loss": 0.0048665390349924564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8665391659596935e-05, + "grad_norm": 3.1905555725097656, + "learning_rate": 8.711318355235269e-07, + "loss": 0.4976, + "mean_token_accuracy": 0.8353350758552551, + "num_tokens": 78621233.0, + "step": 2056 + }, + { + "epoch": 0.2616715430606793, + "ewc_loss": 0.00491959135979414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.919591447105631e-05, + "grad_norm": 3.142237663269043, + "learning_rate": 8.715557439593047e-07, + "loss": 0.4189, + "mean_token_accuracy": 0.8591715693473816, + "num_tokens": 78654644.0, + "step": 2057 + }, + { + "epoch": 0.26179875333926983, + "ewc_loss": 0.004866136237978935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8661364417057484e-05, + "grad_norm": 3.0992164611816406, + "learning_rate": 8.719796523950826e-07, + "loss": 0.4939, + "mean_token_accuracy": 0.8373337984085083, + "num_tokens": 78692851.0, + "step": 2058 + }, + { + "epoch": 0.2619259636178603, + "ewc_loss": 0.004872000776231289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.87200086354278e-05, + "grad_norm": 3.10030198097229, + "learning_rate": 8.724035608308605e-07, + "loss": 0.388, + "mean_token_accuracy": 0.869900643825531, + "num_tokens": 78725329.0, + "step": 2059 + }, + { + "epoch": 0.26205317389645083, + "ewc_loss": 0.0048782252706587315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8782250814838335e-05, + "grad_norm": 3.0646488666534424, + "learning_rate": 
8.728274692666384e-07, + "loss": 0.4442, + "mean_token_accuracy": 0.8540060520172119, + "num_tokens": 78766645.0, + "step": 2060 + }, + { + "epoch": 0.26218038417504136, + "ewc_loss": 0.004856607876718044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.856607847614214e-05, + "grad_norm": 3.135369300842285, + "learning_rate": 8.732513777024162e-07, + "loss": 0.4406, + "mean_token_accuracy": 0.8547143340110779, + "num_tokens": 78801576.0, + "step": 2061 + }, + { + "epoch": 0.26230759445363183, + "ewc_loss": 0.004894217476248741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.894217636319809e-05, + "grad_norm": 3.0665693283081055, + "learning_rate": 8.736752861381941e-07, + "loss": 0.4825, + "mean_token_accuracy": 0.8415551781654358, + "num_tokens": 78845577.0, + "step": 2062 + }, + { + "epoch": 0.26243480473222236, + "ewc_loss": 0.004861178807914257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8611789679853246e-05, + "grad_norm": 3.1002237796783447, + "learning_rate": 8.740991945739719e-07, + "loss": 0.4474, + "mean_token_accuracy": 0.8537014722824097, + "num_tokens": 78884229.0, + "step": 2063 + }, + { + "epoch": 0.2625620150108129, + "ewc_loss": 0.004891103133559227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8911031626630574e-05, + "grad_norm": 3.0484519004821777, + "learning_rate": 8.745231030097499e-07, + "loss": 0.4348, + "mean_token_accuracy": 0.8562490940093994, + "num_tokens": 78926948.0, + "step": 2064 + }, + { + "epoch": 0.26268922528940336, + "ewc_loss": 0.004851000849157572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.851000994676724e-05, + "grad_norm": 3.106719970703125, + "learning_rate": 8.749470114455277e-07, + "loss": 0.412, + "mean_token_accuracy": 0.8620733022689819, + "num_tokens": 78963582.0, + "step": 2065 + }, + { + "epoch": 0.2628164355679939, + "ewc_loss": 0.004895035643130541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.895035453955643e-05, + "grad_norm": 3.0813276767730713, + "learning_rate": 8.753709198813056e-07, 
+ "loss": 0.5171, + "mean_token_accuracy": 0.8318130970001221, + "num_tokens": 79007345.0, + "step": 2066 + }, + { + "epoch": 0.2629436458465844, + "ewc_loss": 0.004867664072662592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.867664029006846e-05, + "grad_norm": 3.070805072784424, + "learning_rate": 8.757948283170835e-07, + "loss": 0.4166, + "mean_token_accuracy": 0.8639365434646606, + "num_tokens": 79049505.0, + "step": 2067 + }, + { + "epoch": 0.2630708561251749, + "ewc_loss": 0.004871802404522896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.871802229899913e-05, + "grad_norm": 3.0859062671661377, + "learning_rate": 8.762187367528613e-07, + "loss": 0.4114, + "mean_token_accuracy": 0.8616287708282471, + "num_tokens": 79087419.0, + "step": 2068 + }, + { + "epoch": 0.2631980664037654, + "ewc_loss": 0.004884690511971712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.884690497419797e-05, + "grad_norm": 3.0825445652008057, + "learning_rate": 8.766426451886392e-07, + "loss": 0.459, + "mean_token_accuracy": 0.8470772504806519, + "num_tokens": 79131444.0, + "step": 2069 + }, + { + "epoch": 0.26332527668235595, + "ewc_loss": 0.004871734417974949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.871734563494101e-05, + "grad_norm": 3.1116158962249756, + "learning_rate": 8.770665536244171e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.844926118850708, + "num_tokens": 79171976.0, + "step": 2070 + }, + { + "epoch": 0.2634524869609464, + "ewc_loss": 0.004895169287919998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.895169331575744e-05, + "grad_norm": 3.1282222270965576, + "learning_rate": 8.774904620601949e-07, + "loss": 0.4161, + "mean_token_accuracy": 0.8602712750434875, + "num_tokens": 79206273.0, + "step": 2071 + }, + { + "epoch": 0.26357969723953695, + "ewc_loss": 0.004891277756541967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.891277785645798e-05, + "grad_norm": 3.0882527828216553, + "learning_rate": 8.779143704959729e-07, + "loss": 0.4135, + 
"mean_token_accuracy": 0.8621149063110352, + "num_tokens": 79245564.0, + "step": 2072 + }, + { + "epoch": 0.2637069075181275, + "ewc_loss": 0.004865211434662342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.865211303695105e-05, + "grad_norm": 3.142953395843506, + "learning_rate": 8.783382789317507e-07, + "loss": 0.4496, + "mean_token_accuracy": 0.8536006808280945, + "num_tokens": 79281857.0, + "step": 2073 + }, + { + "epoch": 0.26383411779671795, + "ewc_loss": 0.004905758425593376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.905758396489546e-05, + "grad_norm": 3.1923673152923584, + "learning_rate": 8.787621873675286e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8379175662994385, + "num_tokens": 79312482.0, + "step": 2074 + }, + { + "epoch": 0.2639613280753085, + "ewc_loss": 0.004917662125080824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9176622269442305e-05, + "grad_norm": 3.095996141433716, + "learning_rate": 8.791860958033065e-07, + "loss": 0.4197, + "mean_token_accuracy": 0.8620234727859497, + "num_tokens": 79348157.0, + "step": 2075 + }, + { + "epoch": 0.264088538353899, + "ewc_loss": 0.00486418604850769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8641861212672666e-05, + "grad_norm": 3.0699126720428467, + "learning_rate": 8.796100042390842e-07, + "loss": 0.4068, + "mean_token_accuracy": 0.8655723929405212, + "num_tokens": 79387540.0, + "step": 2076 + }, + { + "epoch": 0.2642157486324895, + "ewc_loss": 0.0048848302103579044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.884830195805989e-05, + "grad_norm": 3.180418014526367, + "learning_rate": 8.800339126748622e-07, + "loss": 0.4211, + "mean_token_accuracy": 0.8598476648330688, + "num_tokens": 79421424.0, + "step": 2077 + }, + { + "epoch": 0.26434295891108, + "ewc_loss": 0.004943846259266138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9438462156103924e-05, + "grad_norm": 3.129913568496704, + "learning_rate": 8.8045782111064e-07, + "loss": 0.4915, + "mean_token_accuracy": 
0.8374730348587036, + "num_tokens": 79458733.0, + "step": 2078 + }, + { + "epoch": 0.26447016918967053, + "ewc_loss": 0.00489845173433423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.898451879853383e-05, + "grad_norm": 3.1137218475341797, + "learning_rate": 8.808817295464179e-07, + "loss": 0.4434, + "mean_token_accuracy": 0.8547278046607971, + "num_tokens": 79495394.0, + "step": 2079 + }, + { + "epoch": 0.26459737946826106, + "ewc_loss": 0.004902726039290428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.902726141153835e-05, + "grad_norm": 3.115766763687134, + "learning_rate": 8.813056379821958e-07, + "loss": 0.4161, + "mean_token_accuracy": 0.8630051612854004, + "num_tokens": 79529799.0, + "step": 2080 + }, + { + "epoch": 0.26472458974685154, + "ewc_loss": 0.004926565568894148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.926565452478826e-05, + "grad_norm": 3.1390984058380127, + "learning_rate": 8.817295464179737e-07, + "loss": 0.4413, + "mean_token_accuracy": 0.8534340858459473, + "num_tokens": 79564664.0, + "step": 2081 + }, + { + "epoch": 0.26485180002544206, + "ewc_loss": 0.004934491124004126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.934491153107956e-05, + "grad_norm": 3.141191005706787, + "learning_rate": 8.821534548537515e-07, + "loss": 0.5176, + "mean_token_accuracy": 0.8286030292510986, + "num_tokens": 79606104.0, + "step": 2082 + }, + { + "epoch": 0.2649790103040326, + "ewc_loss": 0.004952139221131802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9521393520990387e-05, + "grad_norm": 3.068969488143921, + "learning_rate": 8.825773632895295e-07, + "loss": 0.4165, + "mean_token_accuracy": 0.8623230457305908, + "num_tokens": 79647688.0, + "step": 2083 + }, + { + "epoch": 0.26510622058262306, + "ewc_loss": 0.004895524121820927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.895524034509435e-05, + "grad_norm": 3.1340436935424805, + "learning_rate": 8.830012717253072e-07, + "loss": 0.4328, + "mean_token_accuracy": 0.8589597940444946, + 
"num_tokens": 79685500.0, + "step": 2084 + }, + { + "epoch": 0.2652334308612136, + "ewc_loss": 0.004969038534909487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.969038491253741e-05, + "grad_norm": 3.0749874114990234, + "learning_rate": 8.834251801610852e-07, + "loss": 0.4605, + "mean_token_accuracy": 0.8473818302154541, + "num_tokens": 79729912.0, + "step": 2085 + }, + { + "epoch": 0.2653606411398041, + "ewc_loss": 0.0049184043891727924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.918404374620877e-05, + "grad_norm": 3.113168478012085, + "learning_rate": 8.83849088596863e-07, + "loss": 0.4159, + "mean_token_accuracy": 0.8627642393112183, + "num_tokens": 79767487.0, + "step": 2086 + }, + { + "epoch": 0.2654878514183946, + "ewc_loss": 0.0049493201076984406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.949320282321423e-05, + "grad_norm": 3.1134262084960938, + "learning_rate": 8.842729970326409e-07, + "loss": 0.4007, + "mean_token_accuracy": 0.868587851524353, + "num_tokens": 79804802.0, + "step": 2087 + }, + { + "epoch": 0.2656150616969851, + "ewc_loss": 0.004945898894220591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9458987632533535e-05, + "grad_norm": 3.139463424682617, + "learning_rate": 8.846969054684188e-07, + "loss": 0.5154, + "mean_token_accuracy": 0.8341283202171326, + "num_tokens": 79845685.0, + "step": 2088 + }, + { + "epoch": 0.26574227197557565, + "ewc_loss": 0.0049479054287076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.947905472363345e-05, + "grad_norm": 3.1126084327697754, + "learning_rate": 8.851208139041967e-07, + "loss": 0.4045, + "mean_token_accuracy": 0.8645859956741333, + "num_tokens": 79884692.0, + "step": 2089 + }, + { + "epoch": 0.2658694822541661, + "ewc_loss": 0.004934907890856266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.934908065479249e-05, + "grad_norm": 3.207470417022705, + "learning_rate": 8.855447223399745e-07, + "loss": 0.4636, + "mean_token_accuracy": 0.8445471525192261, + "num_tokens": 79919780.0, + 
"step": 2090 + }, + { + "epoch": 0.26599669253275665, + "ewc_loss": 0.004995001945644617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.995002018404193e-05, + "grad_norm": 3.136378526687622, + "learning_rate": 8.859686307757524e-07, + "loss": 0.4895, + "mean_token_accuracy": 0.8391284346580505, + "num_tokens": 79958317.0, + "step": 2091 + }, + { + "epoch": 0.2661239028113472, + "ewc_loss": 0.00494379922747612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.943799285683781e-05, + "grad_norm": 3.131629228591919, + "learning_rate": 8.863925392115302e-07, + "loss": 0.4232, + "mean_token_accuracy": 0.8573269844055176, + "num_tokens": 79994126.0, + "step": 2092 + }, + { + "epoch": 0.26625111308993765, + "ewc_loss": 0.004957588855177164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.957589044352062e-05, + "grad_norm": 3.0311450958251953, + "learning_rate": 8.868164476473082e-07, + "loss": 0.3917, + "mean_token_accuracy": 0.8686678409576416, + "num_tokens": 80033157.0, + "step": 2093 + }, + { + "epoch": 0.2663783233685282, + "ewc_loss": 0.00492686964571476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.926869587507099e-05, + "grad_norm": 3.092824697494507, + "learning_rate": 8.87240356083086e-07, + "loss": 0.4219, + "mean_token_accuracy": 0.8601235747337341, + "num_tokens": 80073354.0, + "step": 2094 + }, + { + "epoch": 0.2665055336471187, + "ewc_loss": 0.004972478374838829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.972478200215846e-05, + "grad_norm": 3.1150224208831787, + "learning_rate": 8.876642645188639e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.8407495021820068, + "num_tokens": 80111819.0, + "step": 2095 + }, + { + "epoch": 0.2666327439257092, + "ewc_loss": 0.0049667637795209885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9667636631056666e-05, + "grad_norm": 3.1312551498413086, + "learning_rate": 8.880881729546418e-07, + "loss": 0.3952, + "mean_token_accuracy": 0.8680433034896851, + "num_tokens": 80147951.0, + "step": 2096 + }, + { + 
"epoch": 0.2667599542042997, + "ewc_loss": 0.004969751928001642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.969751898897812e-05, + "grad_norm": 3.05489182472229, + "learning_rate": 8.885120813904197e-07, + "loss": 0.4117, + "mean_token_accuracy": 0.8633782267570496, + "num_tokens": 80191319.0, + "step": 2097 + }, + { + "epoch": 0.26688716448289024, + "ewc_loss": 0.004934027791023254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9340276746079326e-05, + "grad_norm": 3.178927183151245, + "learning_rate": 8.889359898261976e-07, + "loss": 0.4964, + "mean_token_accuracy": 0.8395348787307739, + "num_tokens": 80226300.0, + "step": 2098 + }, + { + "epoch": 0.2670143747614807, + "ewc_loss": 0.005007521715015173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.0075217586709186e-05, + "grad_norm": 3.109971284866333, + "learning_rate": 8.893598982619753e-07, + "loss": 0.4232, + "mean_token_accuracy": 0.8598994016647339, + "num_tokens": 80265465.0, + "step": 2099 + }, + { + "epoch": 0.26714158504007124, + "ewc_loss": 0.004949246998876333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.94924679514952e-05, + "grad_norm": 3.0455775260925293, + "learning_rate": 8.897838066977532e-07, + "loss": 0.427, + "mean_token_accuracy": 0.8587215542793274, + "num_tokens": 80312696.0, + "step": 2100 + }, + { + "epoch": 0.26726879531866177, + "ewc_loss": 0.004925604443997145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.925604662275873e-05, + "grad_norm": 3.1133992671966553, + "learning_rate": 8.902077151335311e-07, + "loss": 0.4445, + "mean_token_accuracy": 0.8514924049377441, + "num_tokens": 80353507.0, + "step": 2101 + }, + { + "epoch": 0.26739600559725224, + "ewc_loss": 0.004988980945199728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9889811634784564e-05, + "grad_norm": 3.1996519565582275, + "learning_rate": 8.90631623569309e-07, + "loss": 0.5281, + "mean_token_accuracy": 0.8250398635864258, + "num_tokens": 80390795.0, + "step": 2102 + }, + { + "epoch": 
0.26752321587584277, + "ewc_loss": 0.005000895354896784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.000895180273801e-05, + "grad_norm": 3.1194732189178467, + "learning_rate": 8.910555320050868e-07, + "loss": 0.4386, + "mean_token_accuracy": 0.8554610013961792, + "num_tokens": 80427599.0, + "step": 2103 + }, + { + "epoch": 0.2676504261544333, + "ewc_loss": 0.004950466100126505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.950466245645657e-05, + "grad_norm": 3.079796314239502, + "learning_rate": 8.914794404408648e-07, + "loss": 0.4406, + "mean_token_accuracy": 0.8559232950210571, + "num_tokens": 80468084.0, + "step": 2104 + }, + { + "epoch": 0.26777763643302377, + "ewc_loss": 0.004967516288161278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.967516360920854e-05, + "grad_norm": 3.041377544403076, + "learning_rate": 8.919033488766426e-07, + "loss": 0.3841, + "mean_token_accuracy": 0.8714897632598877, + "num_tokens": 80507218.0, + "step": 2105 + }, + { + "epoch": 0.2679048467116143, + "ewc_loss": 0.004962222650647163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.962222737958655e-05, + "grad_norm": 3.179168462753296, + "learning_rate": 8.923272573124204e-07, + "loss": 0.4692, + "mean_token_accuracy": 0.8465251922607422, + "num_tokens": 80541730.0, + "step": 2106 + }, + { + "epoch": 0.2680320569902048, + "ewc_loss": 0.005022098310291767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.022098412155174e-05, + "grad_norm": 3.078096389770508, + "learning_rate": 8.927511657481983e-07, + "loss": 0.4796, + "mean_token_accuracy": 0.8410966396331787, + "num_tokens": 80584677.0, + "step": 2107 + }, + { + "epoch": 0.2681592672687953, + "ewc_loss": 0.004936764016747475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.936763798468746e-05, + "grad_norm": 3.1463890075683594, + "learning_rate": 8.931750741839762e-07, + "loss": 0.4136, + "mean_token_accuracy": 0.8644872903823853, + "num_tokens": 80616332.0, + "step": 2108 + }, + { + "epoch": 0.2682864775473858, + 
"ewc_loss": 0.005011836066842079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.011836037738249e-05, + "grad_norm": 3.1503725051879883, + "learning_rate": 8.935989826197541e-07, + "loss": 0.4247, + "mean_token_accuracy": 0.8610947132110596, + "num_tokens": 80654435.0, + "step": 2109 + }, + { + "epoch": 0.26841368782597635, + "ewc_loss": 0.004989828914403915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.989828812540509e-05, + "grad_norm": 3.1048378944396973, + "learning_rate": 8.94022891055532e-07, + "loss": 0.4006, + "mean_token_accuracy": 0.8698071837425232, + "num_tokens": 80691246.0, + "step": 2110 + }, + { + "epoch": 0.2685408981045668, + "ewc_loss": 0.004968077410012484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9680773372529075e-05, + "grad_norm": 3.1073005199432373, + "learning_rate": 8.944467994913098e-07, + "loss": 0.4515, + "mean_token_accuracy": 0.8476307392120361, + "num_tokens": 80730510.0, + "step": 2111 + }, + { + "epoch": 0.26866810838315736, + "ewc_loss": 0.004987494554370642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.987494685337879e-05, + "grad_norm": 3.077362060546875, + "learning_rate": 8.948707079270878e-07, + "loss": 0.3707, + "mean_token_accuracy": 0.876025915145874, + "num_tokens": 80769369.0, + "step": 2112 + }, + { + "epoch": 0.2687953186617479, + "ewc_loss": 0.004968668334186077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.968668145011179e-05, + "grad_norm": 3.2301840782165527, + "learning_rate": 8.952946163628656e-07, + "loss": 0.425, + "mean_token_accuracy": 0.8598051071166992, + "num_tokens": 80802168.0, + "step": 2113 + }, + { + "epoch": 0.26892252894033836, + "ewc_loss": 0.0050395093858242035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.039509414928034e-05, + "grad_norm": 3.085484266281128, + "learning_rate": 8.957185247986434e-07, + "loss": 0.3762, + "mean_token_accuracy": 0.8741588592529297, + "num_tokens": 80839389.0, + "step": 2114 + }, + { + "epoch": 0.2690497392189289, + "ewc_loss": 
0.004943116568028927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.94311643706169e-05, + "grad_norm": 3.11987566947937, + "learning_rate": 8.961424332344213e-07, + "loss": 0.4311, + "mean_token_accuracy": 0.8578664064407349, + "num_tokens": 80881139.0, + "step": 2115 + }, + { + "epoch": 0.2691769494975194, + "ewc_loss": 0.005007881671190262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.00788155477494e-05, + "grad_norm": 3.1428561210632324, + "learning_rate": 8.965663416701992e-07, + "loss": 0.4316, + "mean_token_accuracy": 0.8555302619934082, + "num_tokens": 80919809.0, + "step": 2116 + }, + { + "epoch": 0.2693041597761099, + "ewc_loss": 0.004996176343411207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9961763579631224e-05, + "grad_norm": 3.1917967796325684, + "learning_rate": 8.969902501059771e-07, + "loss": 0.4082, + "mean_token_accuracy": 0.8666636943817139, + "num_tokens": 80956860.0, + "step": 2117 + }, + { + "epoch": 0.2694313700547004, + "ewc_loss": 0.0050182766281068325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.018276715418324e-05, + "grad_norm": 3.0928142070770264, + "learning_rate": 8.97414158541755e-07, + "loss": 0.3751, + "mean_token_accuracy": 0.8746954798698425, + "num_tokens": 80994322.0, + "step": 2118 + }, + { + "epoch": 0.26955858033329094, + "ewc_loss": 0.004944809712469578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.94480955239851e-05, + "grad_norm": 3.102864980697632, + "learning_rate": 8.978380669775328e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8488471508026123, + "num_tokens": 81032792.0, + "step": 2119 + }, + { + "epoch": 0.2696857906118814, + "ewc_loss": 0.004979995079338551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.979994992027059e-05, + "grad_norm": 3.158071517944336, + "learning_rate": 8.982619754133107e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.8498027324676514, + "num_tokens": 81070011.0, + "step": 2120 + }, + { + "epoch": 0.26981300089047194, + "ewc_loss": 0.0049971346743404865, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9971346015809104e-05, + "grad_norm": 3.110524892807007, + "learning_rate": 8.986858838490886e-07, + "loss": 0.46, + "mean_token_accuracy": 0.8491268157958984, + "num_tokens": 81113145.0, + "step": 2121 + }, + { + "epoch": 0.26994021116906247, + "ewc_loss": 0.004958468489348888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.958468343829736e-05, + "grad_norm": 3.0887818336486816, + "learning_rate": 8.991097922848663e-07, + "loss": 0.4278, + "mean_token_accuracy": 0.8595110177993774, + "num_tokens": 81154175.0, + "step": 2122 + }, + { + "epoch": 0.27006742144765294, + "ewc_loss": 0.004968209192156792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.968209395883605e-05, + "grad_norm": 3.159900426864624, + "learning_rate": 8.995337007206443e-07, + "loss": 0.4532, + "mean_token_accuracy": 0.8522442579269409, + "num_tokens": 81193319.0, + "step": 2123 + }, + { + "epoch": 0.2701946317262435, + "ewc_loss": 0.0049992892891168594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.999289376428351e-05, + "grad_norm": 3.107351303100586, + "learning_rate": 8.999576091564221e-07, + "loss": 0.4206, + "mean_token_accuracy": 0.8620847463607788, + "num_tokens": 81229057.0, + "step": 2124 + }, + { + "epoch": 0.270321842004834, + "ewc_loss": 0.004963517654687166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9635178584139794e-05, + "grad_norm": 3.1701714992523193, + "learning_rate": 9.003815175922001e-07, + "loss": 0.4282, + "mean_token_accuracy": 0.8582058548927307, + "num_tokens": 81262380.0, + "step": 2125 + }, + { + "epoch": 0.2704490522834245, + "ewc_loss": 0.005003539379686117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.0035392632707953e-05, + "grad_norm": 3.074575424194336, + "learning_rate": 9.008054260279779e-07, + "loss": 0.4339, + "mean_token_accuracy": 0.856713056564331, + "num_tokens": 81304080.0, + "step": 2126 + }, + { + "epoch": 0.270576262562015, + "ewc_loss": 0.004952296148985624, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 4.952296148985624e-05, + "grad_norm": 3.1905837059020996, + "learning_rate": 9.012293344637558e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.8474416136741638, + "num_tokens": 81340226.0, + "step": 2127 + }, + { + "epoch": 0.27070347284060553, + "ewc_loss": 0.005034571047872305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.0345712224952877e-05, + "grad_norm": 3.117964506149292, + "learning_rate": 9.016532428995337e-07, + "loss": 0.4099, + "mean_token_accuracy": 0.8631200790405273, + "num_tokens": 81378046.0, + "step": 2128 + }, + { + "epoch": 0.27083068311919606, + "ewc_loss": 0.004975083749741316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9750837206374854e-05, + "grad_norm": 3.0971784591674805, + "learning_rate": 9.020771513353115e-07, + "loss": 0.4551, + "mean_token_accuracy": 0.8501429557800293, + "num_tokens": 81419486.0, + "step": 2129 + }, + { + "epoch": 0.27095789339778653, + "ewc_loss": 0.004984901752322912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9849018978420645e-05, + "grad_norm": 3.152038097381592, + "learning_rate": 9.025010597710894e-07, + "loss": 0.452, + "mean_token_accuracy": 0.8504198789596558, + "num_tokens": 81457488.0, + "step": 2130 + }, + { + "epoch": 0.27108510367637706, + "ewc_loss": 0.005021744407713413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.021744436817244e-05, + "grad_norm": 3.1461665630340576, + "learning_rate": 9.029249682068673e-07, + "loss": 0.4204, + "mean_token_accuracy": 0.8603363037109375, + "num_tokens": 81494812.0, + "step": 2131 + }, + { + "epoch": 0.2712123139549676, + "ewc_loss": 0.004998327232897282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.998327131033875e-05, + "grad_norm": 3.1154139041900635, + "learning_rate": 9.033488766426451e-07, + "loss": 0.4365, + "mean_token_accuracy": 0.8545142412185669, + "num_tokens": 81531421.0, + "step": 2132 + }, + { + "epoch": 0.27133952423355806, + "ewc_loss": 0.0049870810471475124, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 4.9870810471475124e-05, + "grad_norm": 3.1302430629730225, + "learning_rate": 9.037727850784231e-07, + "loss": 0.4038, + "mean_token_accuracy": 0.8635791540145874, + "num_tokens": 81567221.0, + "step": 2133 + }, + { + "epoch": 0.2714667345121486, + "ewc_loss": 0.005016940645873547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.01694084960036e-05, + "grad_norm": 3.140537977218628, + "learning_rate": 9.041966935142009e-07, + "loss": 0.4405, + "mean_token_accuracy": 0.8550423979759216, + "num_tokens": 81605832.0, + "step": 2134 + }, + { + "epoch": 0.2715939447907391, + "ewc_loss": 0.005016538314521313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.016538125346415e-05, + "grad_norm": 3.084045648574829, + "learning_rate": 9.046206019499788e-07, + "loss": 0.3948, + "mean_token_accuracy": 0.8709938526153564, + "num_tokens": 81646919.0, + "step": 2135 + }, + { + "epoch": 0.2717211550693296, + "ewc_loss": 0.004981021396815777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.98102126584854e-05, + "grad_norm": 3.152071952819824, + "learning_rate": 9.050445103857567e-07, + "loss": 0.4708, + "mean_token_accuracy": 0.8483671545982361, + "num_tokens": 81687214.0, + "step": 2136 + }, + { + "epoch": 0.2718483653479201, + "ewc_loss": 0.005038692150264978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.0386919610900804e-05, + "grad_norm": 3.1429483890533447, + "learning_rate": 9.054684188215344e-07, + "loss": 0.3768, + "mean_token_accuracy": 0.8755552768707275, + "num_tokens": 81721437.0, + "step": 2137 + }, + { + "epoch": 0.27197557562651065, + "ewc_loss": 0.005012765526771545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.0127655413234606e-05, + "grad_norm": 3.1855595111846924, + "learning_rate": 9.058923272573124e-07, + "loss": 0.4522, + "mean_token_accuracy": 0.8511651158332825, + "num_tokens": 81756595.0, + "step": 2138 + }, + { + "epoch": 0.2721027859051011, + "ewc_loss": 0.0050440202467143536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.044020144850947e-05, + "grad_norm": 3.086439371109009, + "learning_rate": 9.063162356930902e-07, + "loss": 0.4147, + "mean_token_accuracy": 0.8634696006774902, + "num_tokens": 81796198.0, + "step": 2139 + }, + { + "epoch": 0.27222999618369165, + "ewc_loss": 0.004986776039004326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9867758207255974e-05, + "grad_norm": 3.155895471572876, + "learning_rate": 9.067401441288681e-07, + "loss": 0.4462, + "mean_token_accuracy": 0.8562113046646118, + "num_tokens": 81833002.0, + "step": 2140 + }, + { + "epoch": 0.2723572064622822, + "ewc_loss": 0.005055971443653107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.0559716328280047e-05, + "grad_norm": 3.1671700477600098, + "learning_rate": 9.07164052564646e-07, + "loss": 0.4187, + "mean_token_accuracy": 0.856307864189148, + "num_tokens": 81872089.0, + "step": 2141 + }, + { + "epoch": 0.27248441674087265, + "ewc_loss": 0.005038636736571789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.0386366638122126e-05, + "grad_norm": 3.146951198577881, + "learning_rate": 9.075879610004239e-07, + "loss": 0.4549, + "mean_token_accuracy": 0.8505066633224487, + "num_tokens": 81908356.0, + "step": 2142 + }, + { + "epoch": 0.2726116270194632, + "ewc_loss": 0.005039050243794918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.039050302002579e-05, + "grad_norm": 3.1385152339935303, + "learning_rate": 9.080118694362017e-07, + "loss": 0.4582, + "mean_token_accuracy": 0.8471235036849976, + "num_tokens": 81950251.0, + "step": 2143 + }, + { + "epoch": 0.2727388372980537, + "ewc_loss": 0.0050438824109733105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.043882265454158e-05, + "grad_norm": 3.209876298904419, + "learning_rate": 9.084357778719796e-07, + "loss": 0.4498, + "mean_token_accuracy": 0.8490997552871704, + "num_tokens": 81987157.0, + "step": 2144 + }, + { + "epoch": 0.2728660475766442, + "ewc_loss": 0.005078715272247791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.0787151849363e-05, + 
"grad_norm": 3.178905487060547, + "learning_rate": 9.088596863077574e-07, + "loss": 0.4349, + "mean_token_accuracy": 0.8591245412826538, + "num_tokens": 82023633.0, + "step": 2145 + }, + { + "epoch": 0.2729932578552347, + "ewc_loss": 0.005037197843194008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.037197843194008e-05, + "grad_norm": 3.1220476627349854, + "learning_rate": 9.092835947435354e-07, + "loss": 0.435, + "mean_token_accuracy": 0.8530451655387878, + "num_tokens": 82061311.0, + "step": 2146 + }, + { + "epoch": 0.27312046813382523, + "ewc_loss": 0.00503232516348362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.032325134379789e-05, + "grad_norm": 3.193652391433716, + "learning_rate": 9.097075031793132e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8455116152763367, + "num_tokens": 82098370.0, + "step": 2147 + }, + { + "epoch": 0.2732476784124157, + "ewc_loss": 0.005077796056866646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.0777962314896286e-05, + "grad_norm": 3.173952579498291, + "learning_rate": 9.101314116150911e-07, + "loss": 0.4006, + "mean_token_accuracy": 0.8715108633041382, + "num_tokens": 82134931.0, + "step": 2148 + }, + { + "epoch": 0.27337488869100623, + "ewc_loss": 0.005046016536653042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.046016667620279e-05, + "grad_norm": 3.151492118835449, + "learning_rate": 9.10555320050869e-07, + "loss": 0.4098, + "mean_token_accuracy": 0.8639281392097473, + "num_tokens": 82171391.0, + "step": 2149 + }, + { + "epoch": 0.27350209896959676, + "ewc_loss": 0.0050500743091106415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.050074469181709e-05, + "grad_norm": 3.060697555541992, + "learning_rate": 9.109792284866469e-07, + "loss": 0.4296, + "mean_token_accuracy": 0.8611158728599548, + "num_tokens": 82214064.0, + "step": 2150 + }, + { + "epoch": 0.27362930924818724, + "ewc_loss": 0.005025404039770365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.025404243497178e-05, + "grad_norm": 
3.153844118118286, + "learning_rate": 9.114031369224247e-07, + "loss": 0.4428, + "mean_token_accuracy": 0.8531337380409241, + "num_tokens": 82252774.0, + "step": 2151 + }, + { + "epoch": 0.27375651952677776, + "ewc_loss": 0.0050950120203197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.095012238598429e-05, + "grad_norm": 3.2154700756073, + "learning_rate": 9.118270453582026e-07, + "loss": 0.4986, + "mean_token_accuracy": 0.8406081199645996, + "num_tokens": 82291645.0, + "step": 2152 + }, + { + "epoch": 0.2738837298053683, + "ewc_loss": 0.005104533396661282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.1045331929344684e-05, + "grad_norm": 3.1481378078460693, + "learning_rate": 9.122509537939804e-07, + "loss": 0.4328, + "mean_token_accuracy": 0.8522589206695557, + "num_tokens": 82333643.0, + "step": 2153 + }, + { + "epoch": 0.27401094008395877, + "ewc_loss": 0.005058396141976118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.058396345702931e-05, + "grad_norm": 3.1824185848236084, + "learning_rate": 9.126748622297584e-07, + "loss": 0.4704, + "mean_token_accuracy": 0.8429148197174072, + "num_tokens": 82370792.0, + "step": 2154 + }, + { + "epoch": 0.2741381503625493, + "ewc_loss": 0.00510285934433341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.1028593588853255e-05, + "grad_norm": 3.151756525039673, + "learning_rate": 9.130987706655362e-07, + "loss": 0.4579, + "mean_token_accuracy": 0.8479089140892029, + "num_tokens": 82408626.0, + "step": 2155 + }, + { + "epoch": 0.2742653606411398, + "ewc_loss": 0.005076175555586815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.076175511931069e-05, + "grad_norm": 3.14961576461792, + "learning_rate": 9.135226791013141e-07, + "loss": 0.4787, + "mean_token_accuracy": 0.848457932472229, + "num_tokens": 82448401.0, + "step": 2156 + }, + { + "epoch": 0.2743925709197303, + "ewc_loss": 0.005089285783469677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.0892856961581856e-05, + "grad_norm": 3.263399362564087, + 
"learning_rate": 9.13946587537092e-07, + "loss": 0.436, + "mean_token_accuracy": 0.8518137335777283, + "num_tokens": 82476644.0, + "step": 2157 + }, + { + "epoch": 0.2745197811983208, + "ewc_loss": 0.005150616634637117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.150616561877541e-05, + "grad_norm": 3.1665894985198975, + "learning_rate": 9.143704959728699e-07, + "loss": 0.4501, + "mean_token_accuracy": 0.848108172416687, + "num_tokens": 82512722.0, + "step": 2158 + }, + { + "epoch": 0.27464699147691135, + "ewc_loss": 0.005078787449747324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.078787580714561e-05, + "grad_norm": 3.1933412551879883, + "learning_rate": 9.147944044086476e-07, + "loss": 0.4681, + "mean_token_accuracy": 0.8456819653511047, + "num_tokens": 82545393.0, + "step": 2159 + }, + { + "epoch": 0.2747742017555018, + "ewc_loss": 0.005133233033120632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.133232843945734e-05, + "grad_norm": 3.1141357421875, + "learning_rate": 9.152183128444255e-07, + "loss": 0.4673, + "mean_token_accuracy": 0.8510148525238037, + "num_tokens": 82586329.0, + "step": 2160 + }, + { + "epoch": 0.27490141203409235, + "ewc_loss": 0.005086391698569059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.086391684017144e-05, + "grad_norm": 3.141146421432495, + "learning_rate": 9.156422212802034e-07, + "loss": 0.4746, + "mean_token_accuracy": 0.840369701385498, + "num_tokens": 82625563.0, + "step": 2161 + }, + { + "epoch": 0.2750286223126829, + "ewc_loss": 0.005132389720529318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.1323895604582503e-05, + "grad_norm": 3.171734094619751, + "learning_rate": 9.160661297159813e-07, + "loss": 0.4133, + "mean_token_accuracy": 0.8626030683517456, + "num_tokens": 82660030.0, + "step": 2162 + }, + { + "epoch": 0.27515583259127335, + "ewc_loss": 0.0051442221738398075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.144222086528316e-05, + "grad_norm": 3.197535514831543, + "learning_rate": 
9.164900381517592e-07, + "loss": 0.4337, + "mean_token_accuracy": 0.8561712503433228, + "num_tokens": 82698794.0, + "step": 2163 + }, + { + "epoch": 0.2752830428698639, + "ewc_loss": 0.0051551503129303455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.155150211066939e-05, + "grad_norm": 3.1163084506988525, + "learning_rate": 9.16913946587537e-07, + "loss": 0.4347, + "mean_token_accuracy": 0.8536988496780396, + "num_tokens": 82740015.0, + "step": 2164 + }, + { + "epoch": 0.2754102531484544, + "ewc_loss": 0.005101316142827272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.101316128275357e-05, + "grad_norm": 3.215608835220337, + "learning_rate": 9.17337855023315e-07, + "loss": 0.4462, + "mean_token_accuracy": 0.8511569499969482, + "num_tokens": 82775765.0, + "step": 2165 + }, + { + "epoch": 0.2755374634270449, + "ewc_loss": 0.005178081337362528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.1780811190837994e-05, + "grad_norm": 3.134631872177124, + "learning_rate": 9.177617634590928e-07, + "loss": 0.3987, + "mean_token_accuracy": 0.8668484687805176, + "num_tokens": 82814324.0, + "step": 2166 + }, + { + "epoch": 0.2756646737056354, + "ewc_loss": 0.005117814522236586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.117814725963399e-05, + "grad_norm": 3.108666181564331, + "learning_rate": 9.181856718948706e-07, + "loss": 0.3856, + "mean_token_accuracy": 0.8692300319671631, + "num_tokens": 82851007.0, + "step": 2167 + }, + { + "epoch": 0.27579188398422594, + "ewc_loss": 0.005121766589581966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.1217666623415425e-05, + "grad_norm": 3.212115526199341, + "learning_rate": 9.186095803306485e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.8573839068412781, + "num_tokens": 82886846.0, + "step": 2168 + }, + { + "epoch": 0.2759190942628164, + "ewc_loss": 0.005179599393159151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.179599247639999e-05, + "grad_norm": 3.1547844409942627, + "learning_rate": 9.190334887664264e-07, + 
"loss": 0.4679, + "mean_token_accuracy": 0.8462632298469543, + "num_tokens": 82929607.0, + "step": 2169 + }, + { + "epoch": 0.27604630454140694, + "ewc_loss": 0.005123522598296404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.123522714711726e-05, + "grad_norm": 3.2801008224487305, + "learning_rate": 9.194573972022043e-07, + "loss": 0.5218, + "mean_token_accuracy": 0.8356494307518005, + "num_tokens": 82965089.0, + "step": 2170 + }, + { + "epoch": 0.27617351481999747, + "ewc_loss": 0.005204063840210438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.204063927521929e-05, + "grad_norm": 3.1756832599639893, + "learning_rate": 9.198813056379822e-07, + "loss": 0.4986, + "mean_token_accuracy": 0.8356180191040039, + "num_tokens": 83008345.0, + "step": 2171 + }, + { + "epoch": 0.27630072509858794, + "ewc_loss": 0.005116304848343134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.116304964758456e-05, + "grad_norm": 3.2320268154144287, + "learning_rate": 9.2030521407376e-07, + "loss": 0.454, + "mean_token_accuracy": 0.847409188747406, + "num_tokens": 83043763.0, + "step": 2172 + }, + { + "epoch": 0.27642793537717847, + "ewc_loss": 0.005170291289687157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.170291115064174e-05, + "grad_norm": 3.13476824760437, + "learning_rate": 9.20729122509538e-07, + "loss": 0.397, + "mean_token_accuracy": 0.8676724433898926, + "num_tokens": 83081389.0, + "step": 2173 + }, + { + "epoch": 0.276555145655769, + "ewc_loss": 0.005123829934746027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.123830123920925e-05, + "grad_norm": 3.140197515487671, + "learning_rate": 9.211530309453158e-07, + "loss": 0.4372, + "mean_token_accuracy": 0.8538201451301575, + "num_tokens": 83120547.0, + "step": 2174 + }, + { + "epoch": 0.27668235593435947, + "ewc_loss": 0.005156750790774822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.156750921742059e-05, + "grad_norm": 3.220858097076416, + "learning_rate": 9.215769393810936e-07, + "loss": 0.441, + 
"mean_token_accuracy": 0.8547701239585876, + "num_tokens": 83157229.0, + "step": 2175 + }, + { + "epoch": 0.27680956621295, + "ewc_loss": 0.00519148213788867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.191482341615483e-05, + "grad_norm": 3.1727752685546875, + "learning_rate": 9.220008478168715e-07, + "loss": 0.5223, + "mean_token_accuracy": 0.8293349742889404, + "num_tokens": 83198217.0, + "step": 2176 + }, + { + "epoch": 0.2769367764915405, + "ewc_loss": 0.005141261499375105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.141261499375105e-05, + "grad_norm": 3.161973714828491, + "learning_rate": 9.224247562526494e-07, + "loss": 0.4428, + "mean_token_accuracy": 0.8516650795936584, + "num_tokens": 83238750.0, + "step": 2177 + }, + { + "epoch": 0.277063986770131, + "ewc_loss": 0.005157440435141325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.157440318726003e-05, + "grad_norm": 3.1561357975006104, + "learning_rate": 9.228486646884273e-07, + "loss": 0.4325, + "mean_token_accuracy": 0.8546234369277954, + "num_tokens": 83280138.0, + "step": 2178 + }, + { + "epoch": 0.2771911970487215, + "ewc_loss": 0.005163510795682669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.163511013961397e-05, + "grad_norm": 3.1936964988708496, + "learning_rate": 9.232725731242052e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.8397605419158936, + "num_tokens": 83320633.0, + "step": 2179 + }, + { + "epoch": 0.27731840732731206, + "ewc_loss": 0.005170774646103382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.1707746024476364e-05, + "grad_norm": 3.2100648880004883, + "learning_rate": 9.23696481559983e-07, + "loss": 0.4676, + "mean_token_accuracy": 0.8453731536865234, + "num_tokens": 83358787.0, + "step": 2180 + }, + { + "epoch": 0.2774456176059026, + "ewc_loss": 0.005172371864318848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.17237167514395e-05, + "grad_norm": 3.0830726623535156, + "learning_rate": 9.24120389995761e-07, + "loss": 0.4102, + "mean_token_accuracy": 
0.8633557558059692, + "num_tokens": 83400609.0, + "step": 2181 + }, + { + "epoch": 0.27757282788449306, + "ewc_loss": 0.005113084334880114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.1130842621205375e-05, + "grad_norm": 3.1984972953796387, + "learning_rate": 9.245442984315387e-07, + "loss": 0.4276, + "mean_token_accuracy": 0.8570671081542969, + "num_tokens": 83436058.0, + "step": 2182 + }, + { + "epoch": 0.2777000381630836, + "ewc_loss": 0.005193980410695076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.193980541662313e-05, + "grad_norm": 3.1567678451538086, + "learning_rate": 9.249682068673165e-07, + "loss": 0.4596, + "mean_token_accuracy": 0.8465945720672607, + "num_tokens": 83474542.0, + "step": 2183 + }, + { + "epoch": 0.2778272484416741, + "ewc_loss": 0.005151743534952402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.151743607711978e-05, + "grad_norm": 3.1657676696777344, + "learning_rate": 9.253921153030945e-07, + "loss": 0.494, + "mean_token_accuracy": 0.8372278213500977, + "num_tokens": 83515551.0, + "step": 2184 + }, + { + "epoch": 0.2779544587202646, + "ewc_loss": 0.005165114067494869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.1651142712216824e-05, + "grad_norm": 3.198394775390625, + "learning_rate": 9.258160237388723e-07, + "loss": 0.4125, + "mean_token_accuracy": 0.8651337027549744, + "num_tokens": 83551826.0, + "step": 2185 + }, + { + "epoch": 0.2780816689988551, + "ewc_loss": 0.005192681215703487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.1926814194303006e-05, + "grad_norm": 3.197300910949707, + "learning_rate": 9.262399321746503e-07, + "loss": 0.428, + "mean_token_accuracy": 0.8583398461341858, + "num_tokens": 83586598.0, + "step": 2186 + }, + { + "epoch": 0.27820887927744564, + "ewc_loss": 0.005159401800483465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.159401916898787e-05, + "grad_norm": 3.1753554344177246, + "learning_rate": 9.266638406104281e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.8429684638977051, + 
"num_tokens": 83625996.0, + "step": 2187 + }, + { + "epoch": 0.2783360895560361, + "ewc_loss": 0.00515811936929822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.158119529369287e-05, + "grad_norm": 3.250699281692505, + "learning_rate": 9.27087749046206e-07, + "loss": 0.4472, + "mean_token_accuracy": 0.8540288805961609, + "num_tokens": 83657322.0, + "step": 2188 + }, + { + "epoch": 0.27846329983462664, + "ewc_loss": 0.005202778149396181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.202778265811503e-05, + "grad_norm": 3.2212908267974854, + "learning_rate": 9.275116574819839e-07, + "loss": 0.4435, + "mean_token_accuracy": 0.8526615500450134, + "num_tokens": 83693578.0, + "step": 2189 + }, + { + "epoch": 0.27859051011321717, + "ewc_loss": 0.005176171660423279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.176171544007957e-05, + "grad_norm": 3.2246756553649902, + "learning_rate": 9.279355659177617e-07, + "loss": 0.4486, + "mean_token_accuracy": 0.8534470796585083, + "num_tokens": 83727672.0, + "step": 2190 + }, + { + "epoch": 0.27871772039180764, + "ewc_loss": 0.00518682599067688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.1868260925402865e-05, + "grad_norm": 3.2218363285064697, + "learning_rate": 9.283594743535395e-07, + "loss": 0.4911, + "mean_token_accuracy": 0.840185821056366, + "num_tokens": 83761405.0, + "step": 2191 + }, + { + "epoch": 0.2788449306703982, + "ewc_loss": 0.005197007209062576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.1970069762319326e-05, + "grad_norm": 3.1364500522613525, + "learning_rate": 9.287833827893175e-07, + "loss": 0.4329, + "mean_token_accuracy": 0.8525606393814087, + "num_tokens": 83801564.0, + "step": 2192 + }, + { + "epoch": 0.2789721409489887, + "ewc_loss": 0.0051622819155454636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.162281740922481e-05, + "grad_norm": 3.233504295349121, + "learning_rate": 9.292072912250953e-07, + "loss": 0.4591, + "mean_token_accuracy": 0.847341001033783, + "num_tokens": 83834260.0, 
+ "step": 2193 + }, + { + "epoch": 0.2790993512275792, + "ewc_loss": 0.005239658057689667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.239658275968395e-05, + "grad_norm": 3.131782054901123, + "learning_rate": 9.296311996608733e-07, + "loss": 0.39, + "mean_token_accuracy": 0.8689215183258057, + "num_tokens": 83873518.0, + "step": 2194 + }, + { + "epoch": 0.2792265615061697, + "ewc_loss": 0.005174820311367512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.174820398679003e-05, + "grad_norm": 3.1859798431396484, + "learning_rate": 9.300551080966511e-07, + "loss": 0.5032, + "mean_token_accuracy": 0.83641517162323, + "num_tokens": 83913902.0, + "step": 2195 + }, + { + "epoch": 0.27935377178476023, + "ewc_loss": 0.005235766526311636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.2357663662405685e-05, + "grad_norm": 3.2254090309143066, + "learning_rate": 9.30479016532429e-07, + "loss": 0.4283, + "mean_token_accuracy": 0.8542728424072266, + "num_tokens": 83947952.0, + "step": 2196 + }, + { + "epoch": 0.2794809820633507, + "ewc_loss": 0.005260545760393143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.260545731289312e-05, + "grad_norm": 3.1664912700653076, + "learning_rate": 9.309029249682068e-07, + "loss": 0.4369, + "mean_token_accuracy": 0.8570683002471924, + "num_tokens": 83986057.0, + "step": 2197 + }, + { + "epoch": 0.27960819234194123, + "ewc_loss": 0.0052154529839754105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.2154529839754105e-05, + "grad_norm": 3.194329023361206, + "learning_rate": 9.313268334039847e-07, + "loss": 0.4482, + "mean_token_accuracy": 0.8509831428527832, + "num_tokens": 84023881.0, + "step": 2198 + }, + { + "epoch": 0.27973540262053176, + "ewc_loss": 0.005251346621662378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.2513467380777e-05, + "grad_norm": 3.1566505432128906, + "learning_rate": 9.317507418397625e-07, + "loss": 0.3835, + "mean_token_accuracy": 0.8729802966117859, + "num_tokens": 84061480.0, + "step": 2199 + }, + { + 
"epoch": 0.27986261289912223, + "ewc_loss": 0.00521905766800046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.219057493377477e-05, + "grad_norm": 3.1755361557006836, + "learning_rate": 9.321746502755404e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.843069314956665, + "num_tokens": 84101782.0, + "step": 2200 + }, + { + "epoch": 0.27998982317771276, + "ewc_loss": 0.005241351202130318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.241351027507335e-05, + "grad_norm": 3.2558724880218506, + "learning_rate": 9.325985587113183e-07, + "loss": 0.4434, + "mean_token_accuracy": 0.8529374003410339, + "num_tokens": 84136125.0, + "step": 2201 + }, + { + "epoch": 0.2801170334563033, + "ewc_loss": 0.00526675907894969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.266759035293944e-05, + "grad_norm": 3.246519088745117, + "learning_rate": 9.330224671470962e-07, + "loss": 0.4532, + "mean_token_accuracy": 0.8567401170730591, + "num_tokens": 84169748.0, + "step": 2202 + }, + { + "epoch": 0.28024424373489376, + "ewc_loss": 0.005254766438156366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.2547664381563663e-05, + "grad_norm": 3.2812092304229736, + "learning_rate": 9.334463755828741e-07, + "loss": 0.4683, + "mean_token_accuracy": 0.8410905599594116, + "num_tokens": 84201292.0, + "step": 2203 + }, + { + "epoch": 0.2803714540134843, + "ewc_loss": 0.005278364289551973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.278364187688567e-05, + "grad_norm": 3.1481571197509766, + "learning_rate": 9.338702840186519e-07, + "loss": 0.468, + "mean_token_accuracy": 0.8435064554214478, + "num_tokens": 84240652.0, + "step": 2204 + }, + { + "epoch": 0.2804986642920748, + "ewc_loss": 0.005211121402680874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.2111212426098064e-05, + "grad_norm": 3.201751232147217, + "learning_rate": 9.342941924544298e-07, + "loss": 0.4599, + "mean_token_accuracy": 0.8477222919464111, + "num_tokens": 84280252.0, + "step": 2205 + }, + { + "epoch": 
0.2806258745706653, + "ewc_loss": 0.005271273199468851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.271273403195664e-05, + "grad_norm": 3.215630054473877, + "learning_rate": 9.347181008902076e-07, + "loss": 0.4005, + "mean_token_accuracy": 0.8694552779197693, + "num_tokens": 84316505.0, + "step": 2206 + }, + { + "epoch": 0.2807530848492558, + "ewc_loss": 0.005278678610920906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.2786785090574995e-05, + "grad_norm": 3.2276968955993652, + "learning_rate": 9.351420093259855e-07, + "loss": 0.403, + "mean_token_accuracy": 0.8667445778846741, + "num_tokens": 84349990.0, + "step": 2207 + }, + { + "epoch": 0.28088029512784635, + "ewc_loss": 0.005274306982755661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.2743067499250174e-05, + "grad_norm": 3.1886370182037354, + "learning_rate": 9.355659177617634e-07, + "loss": 0.3939, + "mean_token_accuracy": 0.8666788339614868, + "num_tokens": 84381066.0, + "step": 2208 + }, + { + "epoch": 0.2810075054064368, + "ewc_loss": 0.005262062419205904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.262062404653989e-05, + "grad_norm": 3.1357204914093018, + "learning_rate": 9.359898261975413e-07, + "loss": 0.3866, + "mean_token_accuracy": 0.87139493227005, + "num_tokens": 84422540.0, + "step": 2209 + }, + { + "epoch": 0.28113471568502735, + "ewc_loss": 0.005242266226559877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.242266342975199e-05, + "grad_norm": 3.261077404022217, + "learning_rate": 9.364137346333192e-07, + "loss": 0.5128, + "mean_token_accuracy": 0.8327536582946777, + "num_tokens": 84462158.0, + "step": 2210 + }, + { + "epoch": 0.2812619259636179, + "ewc_loss": 0.005329743959009647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3297440899768844e-05, + "grad_norm": 3.2356669902801514, + "learning_rate": 9.368376430690971e-07, + "loss": 0.4027, + "mean_token_accuracy": 0.864349365234375, + "num_tokens": 84495887.0, + "step": 2211 + }, + { + "epoch": 0.28138913624220835, + 
"ewc_loss": 0.005267498083412647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.2674979087896645e-05, + "grad_norm": 3.265716075897217, + "learning_rate": 9.372615515048749e-07, + "loss": 0.474, + "mean_token_accuracy": 0.8454293012619019, + "num_tokens": 84530176.0, + "step": 2212 + }, + { + "epoch": 0.2815163465207989, + "ewc_loss": 0.005301843862980604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3018437029095367e-05, + "grad_norm": 3.189419984817505, + "learning_rate": 9.376854599406528e-07, + "loss": 0.4768, + "mean_token_accuracy": 0.8450988531112671, + "num_tokens": 84574450.0, + "step": 2213 + }, + { + "epoch": 0.2816435567993894, + "ewc_loss": 0.0052510518580675125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.251051697996445e-05, + "grad_norm": 3.202068567276001, + "learning_rate": 9.381093683764306e-07, + "loss": 0.4451, + "mean_token_accuracy": 0.8530875444412231, + "num_tokens": 84610661.0, + "step": 2214 + }, + { + "epoch": 0.2817707670779799, + "ewc_loss": 0.0052812653593719006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.2812654757872224e-05, + "grad_norm": 3.1421713829040527, + "learning_rate": 9.385332768122085e-07, + "loss": 0.4261, + "mean_token_accuracy": 0.8577894568443298, + "num_tokens": 84651259.0, + "step": 2215 + }, + { + "epoch": 0.2818979773565704, + "ewc_loss": 0.005245262756943703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.24526258232072e-05, + "grad_norm": 3.1680819988250732, + "learning_rate": 9.389571852479864e-07, + "loss": 0.4295, + "mean_token_accuracy": 0.858875036239624, + "num_tokens": 84692389.0, + "step": 2216 + }, + { + "epoch": 0.28202518763516093, + "ewc_loss": 0.005269088316708803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.269088433124125e-05, + "grad_norm": 3.2192962169647217, + "learning_rate": 9.393810936837643e-07, + "loss": 0.4338, + "mean_token_accuracy": 0.8561882972717285, + "num_tokens": 84731905.0, + "step": 2217 + }, + { + "epoch": 0.2821523979137514, + "ewc_loss": 
0.005287255626171827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.287255407893099e-05, + "grad_norm": 3.1709017753601074, + "learning_rate": 9.398050021195422e-07, + "loss": 0.4609, + "mean_token_accuracy": 0.8511631488800049, + "num_tokens": 84773727.0, + "step": 2218 + }, + { + "epoch": 0.28227960819234194, + "ewc_loss": 0.005247293971478939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.2472940296866e-05, + "grad_norm": 3.1447243690490723, + "learning_rate": 9.402289105553201e-07, + "loss": 0.4087, + "mean_token_accuracy": 0.8632808923721313, + "num_tokens": 84816771.0, + "step": 2219 + }, + { + "epoch": 0.28240681847093246, + "ewc_loss": 0.0052465214394032955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.246521322987974e-05, + "grad_norm": 3.2293553352355957, + "learning_rate": 9.406528189910978e-07, + "loss": 0.4325, + "mean_token_accuracy": 0.85706627368927, + "num_tokens": 84853172.0, + "step": 2220 + }, + { + "epoch": 0.28253402874952294, + "ewc_loss": 0.005289122927933931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.289122782414779e-05, + "grad_norm": 3.1605803966522217, + "learning_rate": 9.410767274268757e-07, + "loss": 0.4119, + "mean_token_accuracy": 0.860231339931488, + "num_tokens": 84891508.0, + "step": 2221 + }, + { + "epoch": 0.28266123902811346, + "ewc_loss": 0.005228680092841387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.2286799473222345e-05, + "grad_norm": 3.2288477420806885, + "learning_rate": 9.415006358626536e-07, + "loss": 0.5087, + "mean_token_accuracy": 0.831352174282074, + "num_tokens": 84931204.0, + "step": 2222 + }, + { + "epoch": 0.282788449306704, + "ewc_loss": 0.005284283775836229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.2842839068034664e-05, + "grad_norm": 3.255272150039673, + "learning_rate": 9.419245442984314e-07, + "loss": 0.4896, + "mean_token_accuracy": 0.8394772410392761, + "num_tokens": 84966782.0, + "step": 2223 + }, + { + "epoch": 0.28291565958529447, + "ewc_loss": 0.005282434169203043, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.28243399458006e-05, + "grad_norm": 3.229919672012329, + "learning_rate": 9.423484527342094e-07, + "loss": 0.4366, + "mean_token_accuracy": 0.852374792098999, + "num_tokens": 85001512.0, + "step": 2224 + }, + { + "epoch": 0.283042869863885, + "ewc_loss": 0.005268294829875231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.2682946261484176e-05, + "grad_norm": 3.3019604682922363, + "learning_rate": 9.427723611699872e-07, + "loss": 0.4457, + "mean_token_accuracy": 0.8535366654396057, + "num_tokens": 85036082.0, + "step": 2225 + }, + { + "epoch": 0.2831700801424755, + "ewc_loss": 0.005316159222275019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3161591495154426e-05, + "grad_norm": 3.1723644733428955, + "learning_rate": 9.431962696057652e-07, + "loss": 0.4531, + "mean_token_accuracy": 0.8514744639396667, + "num_tokens": 85079433.0, + "step": 2226 + }, + { + "epoch": 0.283297290421066, + "ewc_loss": 0.005243364721536636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.24336464877706e-05, + "grad_norm": 3.186997175216675, + "learning_rate": 9.43620178041543e-07, + "loss": 0.4045, + "mean_token_accuracy": 0.8665479421615601, + "num_tokens": 85116940.0, + "step": 2227 + }, + { + "epoch": 0.2834245006996565, + "ewc_loss": 0.005280980374664068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.2809802582487464e-05, + "grad_norm": 3.1661276817321777, + "learning_rate": 9.440440864773208e-07, + "loss": 0.4572, + "mean_token_accuracy": 0.8511545658111572, + "num_tokens": 85153328.0, + "step": 2228 + }, + { + "epoch": 0.28355171097824705, + "ewc_loss": 0.005275833886116743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.2758339734282345e-05, + "grad_norm": 3.2054200172424316, + "learning_rate": 9.444679949130987e-07, + "loss": 0.4586, + "mean_token_accuracy": 0.8465100526809692, + "num_tokens": 85194059.0, + "step": 2229 + }, + { + "epoch": 0.2836789212568376, + "ewc_loss": 0.005292369052767754, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 5.292368950904347e-05, + "grad_norm": 3.298358201980591, + "learning_rate": 9.448919033488766e-07, + "loss": 0.4762, + "mean_token_accuracy": 0.8453236818313599, + "num_tokens": 85230820.0, + "step": 2230 + }, + { + "epoch": 0.28380613153542805, + "ewc_loss": 0.005337124224752188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.337124093784951e-05, + "grad_norm": 3.161853075027466, + "learning_rate": 9.453158117846544e-07, + "loss": 0.4606, + "mean_token_accuracy": 0.8472578525543213, + "num_tokens": 85268864.0, + "step": 2231 + }, + { + "epoch": 0.2839333418140186, + "ewc_loss": 0.005241630133241415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.241630060481839e-05, + "grad_norm": 3.1988513469696045, + "learning_rate": 9.457397202204324e-07, + "loss": 0.4847, + "mean_token_accuracy": 0.8415039777755737, + "num_tokens": 85311377.0, + "step": 2232 + }, + { + "epoch": 0.2840605520926091, + "ewc_loss": 0.005306505132466555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.306505045155063e-05, + "grad_norm": 3.2081241607666016, + "learning_rate": 9.461636286562102e-07, + "loss": 0.4012, + "mean_token_accuracy": 0.8697158694267273, + "num_tokens": 85347082.0, + "step": 2233 + }, + { + "epoch": 0.2841877623711996, + "ewc_loss": 0.00530401011928916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.30401011928916e-05, + "grad_norm": 3.243534803390503, + "learning_rate": 9.465875370919882e-07, + "loss": 0.4877, + "mean_token_accuracy": 0.8436502814292908, + "num_tokens": 85384696.0, + "step": 2234 + }, + { + "epoch": 0.2843149726497901, + "ewc_loss": 0.005308045540004969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3080457291798666e-05, + "grad_norm": 3.2677831649780273, + "learning_rate": 9.470114455277659e-07, + "loss": 0.4746, + "mean_token_accuracy": 0.8447836637496948, + "num_tokens": 85420192.0, + "step": 2235 + }, + { + "epoch": 0.28444218292838064, + "ewc_loss": 0.005317030008882284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.31703008164186e-05, + "grad_norm": 3.1617913246154785, + "learning_rate": 9.474353539635438e-07, + "loss": 0.4251, + "mean_token_accuracy": 0.8564164638519287, + "num_tokens": 85462281.0, + "step": 2236 + }, + { + "epoch": 0.2845693932069711, + "ewc_loss": 0.00525630684569478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.256306758383289e-05, + "grad_norm": 3.2418909072875977, + "learning_rate": 9.478592623993217e-07, + "loss": 0.4308, + "mean_token_accuracy": 0.8564400672912598, + "num_tokens": 85495191.0, + "step": 2237 + }, + { + "epoch": 0.28469660348556164, + "ewc_loss": 0.0053351023234426975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.33510246896185e-05, + "grad_norm": 3.2157158851623535, + "learning_rate": 9.482831708350996e-07, + "loss": 0.4283, + "mean_token_accuracy": 0.8579258918762207, + "num_tokens": 85533803.0, + "step": 2238 + }, + { + "epoch": 0.28482381376415217, + "ewc_loss": 0.005304293241351843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3042931540403515e-05, + "grad_norm": 3.1932621002197266, + "learning_rate": 9.487070792708775e-07, + "loss": 0.4746, + "mean_token_accuracy": 0.8463948965072632, + "num_tokens": 85573424.0, + "step": 2239 + }, + { + "epoch": 0.28495102404274264, + "ewc_loss": 0.005298975855112076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.298975884215906e-05, + "grad_norm": 3.205638885498047, + "learning_rate": 9.491309877066554e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8484852313995361, + "num_tokens": 85616423.0, + "step": 2240 + }, + { + "epoch": 0.28507823432133317, + "ewc_loss": 0.005314910784363747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.314910595188849e-05, + "grad_norm": 3.204425573348999, + "learning_rate": 9.495548961424332e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8418169617652893, + "num_tokens": 85658410.0, + "step": 2241 + }, + { + "epoch": 0.2852054445999237, + "ewc_loss": 0.00530631048604846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.306310413288884e-05, + 
"grad_norm": 3.218214511871338, + "learning_rate": 9.499788045782111e-07, + "loss": 0.442, + "mean_token_accuracy": 0.8531947135925293, + "num_tokens": 85695767.0, + "step": 2242 + }, + { + "epoch": 0.28533265487851417, + "ewc_loss": 0.005313478875905275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.313479050528258e-05, + "grad_norm": 3.168464183807373, + "learning_rate": 9.504027130139889e-07, + "loss": 0.4575, + "mean_token_accuracy": 0.849818229675293, + "num_tokens": 85737771.0, + "step": 2243 + }, + { + "epoch": 0.2854598651571047, + "ewc_loss": 0.00529650179669261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.296501694829203e-05, + "grad_norm": 3.187131881713867, + "learning_rate": 9.508266214497667e-07, + "loss": 0.4058, + "mean_token_accuracy": 0.8687660694122314, + "num_tokens": 85774789.0, + "step": 2244 + }, + { + "epoch": 0.2855870754356952, + "ewc_loss": 0.005313861183822155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.313861038303003e-05, + "grad_norm": 3.197643995285034, + "learning_rate": 9.512505298855447e-07, + "loss": 0.4443, + "mean_token_accuracy": 0.8560742735862732, + "num_tokens": 85817833.0, + "step": 2245 + }, + { + "epoch": 0.2857142857142857, + "ewc_loss": 0.005314544774591923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.314544614520855e-05, + "grad_norm": 3.271627426147461, + "learning_rate": 9.516744383213225e-07, + "loss": 0.3889, + "mean_token_accuracy": 0.8675522208213806, + "num_tokens": 85854488.0, + "step": 2246 + }, + { + "epoch": 0.2858414959928762, + "ewc_loss": 0.005343212280422449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3432122513186187e-05, + "grad_norm": 3.2499876022338867, + "learning_rate": 9.520983467571005e-07, + "loss": 0.467, + "mean_token_accuracy": 0.8471688032150269, + "num_tokens": 85893746.0, + "step": 2247 + }, + { + "epoch": 0.28596870627146675, + "ewc_loss": 0.005305035971105099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3050360293127596e-05, + "grad_norm": 
3.244950532913208, + "learning_rate": 9.525222551928783e-07, + "loss": 0.503, + "mean_token_accuracy": 0.8399125337600708, + "num_tokens": 85932798.0, + "step": 2248 + }, + { + "epoch": 0.2860959165500572, + "ewc_loss": 0.0053220512345433235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.322051219991408e-05, + "grad_norm": 3.300447702407837, + "learning_rate": 9.529461636286562e-07, + "loss": 0.4185, + "mean_token_accuracy": 0.8622017502784729, + "num_tokens": 85964446.0, + "step": 2249 + }, + { + "epoch": 0.28622312682864776, + "ewc_loss": 0.005339312832802534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.339312701835297e-05, + "grad_norm": 3.2076687812805176, + "learning_rate": 9.533700720644341e-07, + "loss": 0.3973, + "mean_token_accuracy": 0.8685221672058105, + "num_tokens": 86000587.0, + "step": 2250 + }, + { + "epoch": 0.2863503371072383, + "ewc_loss": 0.005292518995702267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.292518835631199e-05, + "grad_norm": 3.2638728618621826, + "learning_rate": 9.537939805002118e-07, + "loss": 0.4499, + "mean_token_accuracy": 0.8515439033508301, + "num_tokens": 86042891.0, + "step": 2251 + }, + { + "epoch": 0.28647754738582876, + "ewc_loss": 0.0053441585041582584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.344158489606343e-05, + "grad_norm": 3.2227208614349365, + "learning_rate": 9.542178889359898e-07, + "loss": 0.4278, + "mean_token_accuracy": 0.8581849336624146, + "num_tokens": 86079973.0, + "step": 2252 + }, + { + "epoch": 0.2866047576644193, + "ewc_loss": 0.005313541274517775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3135412599658594e-05, + "grad_norm": 3.189697742462158, + "learning_rate": 9.546417973717677e-07, + "loss": 0.4504, + "mean_token_accuracy": 0.8533887267112732, + "num_tokens": 86119767.0, + "step": 2253 + }, + { + "epoch": 0.2867319679430098, + "ewc_loss": 0.00532238744199276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3223873692331836e-05, + "grad_norm": 3.1543385982513428, + 
"learning_rate": 9.550657058075455e-07, + "loss": 0.4693, + "mean_token_accuracy": 0.8465828895568848, + "num_tokens": 86163176.0, + "step": 2254 + }, + { + "epoch": 0.2868591782216003, + "ewc_loss": 0.005311469081789255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.31146906723734e-05, + "grad_norm": 3.227410078048706, + "learning_rate": 9.554896142433234e-07, + "loss": 0.4644, + "mean_token_accuracy": 0.8460170030593872, + "num_tokens": 86201523.0, + "step": 2255 + }, + { + "epoch": 0.2869863885001908, + "ewc_loss": 0.005359940230846405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.359940041671507e-05, + "grad_norm": 3.222571849822998, + "learning_rate": 9.559135226791012e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.8377704620361328, + "num_tokens": 86240277.0, + "step": 2256 + }, + { + "epoch": 0.28711359877878134, + "ewc_loss": 0.005339064169675112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.339064227882773e-05, + "grad_norm": 3.129202127456665, + "learning_rate": 9.563374311148793e-07, + "loss": 0.4741, + "mean_token_accuracy": 0.8441200256347656, + "num_tokens": 86289714.0, + "step": 2257 + }, + { + "epoch": 0.2872408090573718, + "ewc_loss": 0.005295851267874241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.295851224218495e-05, + "grad_norm": 3.281705617904663, + "learning_rate": 9.56761339550657e-07, + "loss": 0.4263, + "mean_token_accuracy": 0.8602988719940186, + "num_tokens": 86320786.0, + "step": 2258 + }, + { + "epoch": 0.28736801933596234, + "ewc_loss": 0.005412811879068613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.412811879068613e-05, + "grad_norm": 3.178028106689453, + "learning_rate": 9.57185247986435e-07, + "loss": 0.4143, + "mean_token_accuracy": 0.862922191619873, + "num_tokens": 86358002.0, + "step": 2259 + }, + { + "epoch": 0.28749522961455287, + "ewc_loss": 0.005311696324497461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.311696440912783e-05, + "grad_norm": 3.1884984970092773, + "learning_rate": 
9.576091564222128e-07, + "loss": 0.3837, + "mean_token_accuracy": 0.8694461584091187, + "num_tokens": 86395334.0, + "step": 2260 + }, + { + "epoch": 0.28762243989314334, + "ewc_loss": 0.005340091418474913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.340091229300015e-05, + "grad_norm": 3.2144057750701904, + "learning_rate": 9.580330648579906e-07, + "loss": 0.46, + "mean_token_accuracy": 0.846301794052124, + "num_tokens": 86432616.0, + "step": 2261 + }, + { + "epoch": 0.2877496501717339, + "ewc_loss": 0.005354336928576231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3543368267128244e-05, + "grad_norm": 3.24092173576355, + "learning_rate": 9.584569732937685e-07, + "loss": 0.4567, + "mean_token_accuracy": 0.8495029807090759, + "num_tokens": 86472293.0, + "step": 2262 + }, + { + "epoch": 0.2878768604503244, + "ewc_loss": 0.00535378884524107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3537889471044764e-05, + "grad_norm": 3.239177942276001, + "learning_rate": 9.588808817295463e-07, + "loss": 0.443, + "mean_token_accuracy": 0.8500390648841858, + "num_tokens": 86506304.0, + "step": 2263 + }, + { + "epoch": 0.2880040707289149, + "ewc_loss": 0.0053585730493068695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.358572889235802e-05, + "grad_norm": 3.2550723552703857, + "learning_rate": 9.593047901653242e-07, + "loss": 0.4436, + "mean_token_accuracy": 0.8513250350952148, + "num_tokens": 86540520.0, + "step": 2264 + }, + { + "epoch": 0.2881312810075054, + "ewc_loss": 0.005360775627195835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.360775685403496e-05, + "grad_norm": 3.209500312805176, + "learning_rate": 9.597286986011022e-07, + "loss": 0.4035, + "mean_token_accuracy": 0.865175724029541, + "num_tokens": 86576956.0, + "step": 2265 + }, + { + "epoch": 0.28825849128609593, + "ewc_loss": 0.0053491597063839436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.349159619072452e-05, + "grad_norm": 3.288994550704956, + "learning_rate": 9.601526070368799e-07, + 
"loss": 0.4873, + "mean_token_accuracy": 0.8410066366195679, + "num_tokens": 86613799.0, + "step": 2266 + }, + { + "epoch": 0.2883857015646864, + "ewc_loss": 0.0053925346583127975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.392534512793645e-05, + "grad_norm": 3.245379686355591, + "learning_rate": 9.60576515472658e-07, + "loss": 0.4309, + "mean_token_accuracy": 0.8569900989532471, + "num_tokens": 86650338.0, + "step": 2267 + }, + { + "epoch": 0.28851291184327693, + "ewc_loss": 0.005356478504836559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.356478504836559e-05, + "grad_norm": 3.2028470039367676, + "learning_rate": 9.610004239084358e-07, + "loss": 0.4407, + "mean_token_accuracy": 0.8528547286987305, + "num_tokens": 86691605.0, + "step": 2268 + }, + { + "epoch": 0.28864012212186746, + "ewc_loss": 0.005346959922462702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3469600970856845e-05, + "grad_norm": 3.2605185508728027, + "learning_rate": 9.614243323442136e-07, + "loss": 0.5097, + "mean_token_accuracy": 0.8314856290817261, + "num_tokens": 86728767.0, + "step": 2269 + }, + { + "epoch": 0.28876733240045793, + "ewc_loss": 0.005398797802627087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3987976571079344e-05, + "grad_norm": 3.294917345046997, + "learning_rate": 9.618482407799915e-07, + "loss": 0.4196, + "mean_token_accuracy": 0.8589689135551453, + "num_tokens": 86765240.0, + "step": 2270 + }, + { + "epoch": 0.28889454267904846, + "ewc_loss": 0.005404578987509012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4045791330281645e-05, + "grad_norm": 3.1947436332702637, + "learning_rate": 9.622721492157693e-07, + "loss": 0.3888, + "mean_token_accuracy": 0.8704904913902283, + "num_tokens": 86804042.0, + "step": 2271 + }, + { + "epoch": 0.289021752957639, + "ewc_loss": 0.00534262927249074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.342629083315842e-05, + "grad_norm": 3.3128175735473633, + "learning_rate": 9.626960576515472e-07, + "loss": 0.4373, + 
"mean_token_accuracy": 0.855792760848999, + "num_tokens": 86835220.0, + "step": 2272 + }, + { + "epoch": 0.28914896323622946, + "ewc_loss": 0.0054453290067613125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.44532886124216e-05, + "grad_norm": 3.2268481254577637, + "learning_rate": 9.63119966087325e-07, + "loss": 0.4249, + "mean_token_accuracy": 0.8560086488723755, + "num_tokens": 86873414.0, + "step": 2273 + }, + { + "epoch": 0.28927617351482, + "ewc_loss": 0.0053711929358541965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.371193037717603e-05, + "grad_norm": 3.18697452545166, + "learning_rate": 9.635438745231029e-07, + "loss": 0.4165, + "mean_token_accuracy": 0.8614515662193298, + "num_tokens": 86914272.0, + "step": 2274 + }, + { + "epoch": 0.2894033837934105, + "ewc_loss": 0.005367989186197519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3679890697821975e-05, + "grad_norm": 3.2041425704956055, + "learning_rate": 9.63967782958881e-07, + "loss": 0.4549, + "mean_token_accuracy": 0.8471081256866455, + "num_tokens": 86952425.0, + "step": 2275 + }, + { + "epoch": 0.289530594072001, + "ewc_loss": 0.005401703529059887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4017033107811585e-05, + "grad_norm": 3.223999500274658, + "learning_rate": 9.643916913946588e-07, + "loss": 0.4161, + "mean_token_accuracy": 0.8609890341758728, + "num_tokens": 86987842.0, + "step": 2276 + }, + { + "epoch": 0.2896578043505915, + "ewc_loss": 0.005410652607679367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.410652738646604e-05, + "grad_norm": 3.16837477684021, + "learning_rate": 9.648155998304366e-07, + "loss": 0.3905, + "mean_token_accuracy": 0.8694045543670654, + "num_tokens": 87026750.0, + "step": 2277 + }, + { + "epoch": 0.28978501462918205, + "ewc_loss": 0.005384353920817375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.384353789850138e-05, + "grad_norm": 3.19711971282959, + "learning_rate": 9.652395082662145e-07, + "loss": 0.4932, + "mean_token_accuracy": 
0.8357863426208496, + "num_tokens": 87069685.0, + "step": 2278 + }, + { + "epoch": 0.2899122249077726, + "ewc_loss": 0.005412345286458731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.412345126387663e-05, + "grad_norm": 3.283419609069824, + "learning_rate": 9.656634167019923e-07, + "loss": 0.4477, + "mean_token_accuracy": 0.8481913805007935, + "num_tokens": 87107054.0, + "step": 2279 + }, + { + "epoch": 0.29003943518636305, + "ewc_loss": 0.005453723017126322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4537231335416436e-05, + "grad_norm": 3.247178554534912, + "learning_rate": 9.660873251377701e-07, + "loss": 0.4553, + "mean_token_accuracy": 0.850368082523346, + "num_tokens": 87146254.0, + "step": 2280 + }, + { + "epoch": 0.2901666454649536, + "ewc_loss": 0.005404689349234104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4046893637860194e-05, + "grad_norm": 3.269359588623047, + "learning_rate": 9.66511233573548e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.8456002473831177, + "num_tokens": 87182404.0, + "step": 2281 + }, + { + "epoch": 0.2902938557435441, + "ewc_loss": 0.005429968237876892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.429968223324977e-05, + "grad_norm": 3.2200863361358643, + "learning_rate": 9.669351420093258e-07, + "loss": 0.4122, + "mean_token_accuracy": 0.8632489442825317, + "num_tokens": 87216551.0, + "step": 2282 + }, + { + "epoch": 0.2904210660221346, + "ewc_loss": 0.005404929164797068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.404929106589407e-05, + "grad_norm": 3.331651449203491, + "learning_rate": 9.67359050445104e-07, + "loss": 0.433, + "mean_token_accuracy": 0.8543288707733154, + "num_tokens": 87249402.0, + "step": 2283 + }, + { + "epoch": 0.2905482763007251, + "ewc_loss": 0.005476931110024452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.476930891745724e-05, + "grad_norm": 3.268352746963501, + "learning_rate": 9.677829588808817e-07, + "loss": 0.4226, + "mean_token_accuracy": 0.8574247360229492, + 
"num_tokens": 87281846.0, + "step": 2284 + }, + { + "epoch": 0.29067548657931563, + "ewc_loss": 0.005412861239165068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4128613555803895e-05, + "grad_norm": 3.2983782291412354, + "learning_rate": 9.682068673166596e-07, + "loss": 0.4903, + "mean_token_accuracy": 0.8444808721542358, + "num_tokens": 87315765.0, + "step": 2285 + }, + { + "epoch": 0.2908026968579061, + "ewc_loss": 0.005459778942167759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4597789130639285e-05, + "grad_norm": 3.267773151397705, + "learning_rate": 9.686307757524374e-07, + "loss": 0.4357, + "mean_token_accuracy": 0.8534836769104004, + "num_tokens": 87353668.0, + "step": 2286 + }, + { + "epoch": 0.29092990713649663, + "ewc_loss": 0.005446255672723055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4462554544443265e-05, + "grad_norm": 3.207257032394409, + "learning_rate": 9.690546841882153e-07, + "loss": 0.3693, + "mean_token_accuracy": 0.8761337995529175, + "num_tokens": 87390747.0, + "step": 2287 + }, + { + "epoch": 0.29105711741508716, + "ewc_loss": 0.005425540264695883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4255404393188655e-05, + "grad_norm": 3.2430875301361084, + "learning_rate": 9.694785926239931e-07, + "loss": 0.4474, + "mean_token_accuracy": 0.8546620607376099, + "num_tokens": 87430686.0, + "step": 2288 + }, + { + "epoch": 0.29118432769367764, + "ewc_loss": 0.0054641468450427055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4641470342176035e-05, + "grad_norm": 3.2957377433776855, + "learning_rate": 9.69902501059771e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.8403642177581787, + "num_tokens": 87465290.0, + "step": 2289 + }, + { + "epoch": 0.29131153797226816, + "ewc_loss": 0.005485055036842823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.48505486221984e-05, + "grad_norm": 3.210536479949951, + "learning_rate": 9.703264094955488e-07, + "loss": 0.4557, + "mean_token_accuracy": 0.8479688763618469, + "num_tokens": 
87504553.0, + "step": 2290 + }, + { + "epoch": 0.2914387482508587, + "ewc_loss": 0.005439179018139839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4391788580687717e-05, + "grad_norm": 3.21222186088562, + "learning_rate": 9.707503179313269e-07, + "loss": 0.4282, + "mean_token_accuracy": 0.858973503112793, + "num_tokens": 87544896.0, + "step": 2291 + }, + { + "epoch": 0.29156595852944917, + "ewc_loss": 0.005464187823235989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.464187779580243e-05, + "grad_norm": 3.2547199726104736, + "learning_rate": 9.711742263671047e-07, + "loss": 0.4271, + "mean_token_accuracy": 0.8586256504058838, + "num_tokens": 87582075.0, + "step": 2292 + }, + { + "epoch": 0.2916931688080397, + "ewc_loss": 0.005491447169333696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.491447154781781e-05, + "grad_norm": 3.1913299560546875, + "learning_rate": 9.715981348028826e-07, + "loss": 0.4208, + "mean_token_accuracy": 0.8601869344711304, + "num_tokens": 87623117.0, + "step": 2293 + }, + { + "epoch": 0.2918203790866302, + "ewc_loss": 0.005444606766104698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4446067224489525e-05, + "grad_norm": 3.331054449081421, + "learning_rate": 9.720220432386604e-07, + "loss": 0.4333, + "mean_token_accuracy": 0.8567664623260498, + "num_tokens": 87656263.0, + "step": 2294 + }, + { + "epoch": 0.2919475893652207, + "ewc_loss": 0.005536834709346294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.536834942176938e-05, + "grad_norm": 3.201511859893799, + "learning_rate": 9.724459516744383e-07, + "loss": 0.4093, + "mean_token_accuracy": 0.8634652495384216, + "num_tokens": 87695852.0, + "step": 2295 + }, + { + "epoch": 0.2920747996438112, + "ewc_loss": 0.005431550554931164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.431550380308181e-05, + "grad_norm": 3.2423365116119385, + "learning_rate": 9.728698601102161e-07, + "loss": 0.4058, + "mean_token_accuracy": 0.8655495047569275, + "num_tokens": 87734331.0, + "step": 2296 
+ }, + { + "epoch": 0.29220200992240175, + "ewc_loss": 0.005481106229126453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.481106200022623e-05, + "grad_norm": 3.2902121543884277, + "learning_rate": 9.73293768545994e-07, + "loss": 0.4394, + "mean_token_accuracy": 0.8508599996566772, + "num_tokens": 87768447.0, + "step": 2297 + }, + { + "epoch": 0.2923292202009922, + "ewc_loss": 0.005497752223163843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.497752135852352e-05, + "grad_norm": 3.236783266067505, + "learning_rate": 9.737176769817718e-07, + "loss": 0.4673, + "mean_token_accuracy": 0.8496913909912109, + "num_tokens": 87813213.0, + "step": 2298 + }, + { + "epoch": 0.29245643047958275, + "ewc_loss": 0.005447740666568279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.447740477393381e-05, + "grad_norm": 3.251962900161743, + "learning_rate": 9.741415854175499e-07, + "loss": 0.5013, + "mean_token_accuracy": 0.8397855162620544, + "num_tokens": 87851336.0, + "step": 2299 + }, + { + "epoch": 0.2925836407581733, + "ewc_loss": 0.0054875826463103294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4875825298950076e-05, + "grad_norm": 3.312506675720215, + "learning_rate": 9.745654938533277e-07, + "loss": 0.4548, + "mean_token_accuracy": 0.8493835926055908, + "num_tokens": 87888474.0, + "step": 2300 + }, + { + "epoch": 0.29271085103676375, + "ewc_loss": 0.005503325257450342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5033251555869356e-05, + "grad_norm": 3.261169672012329, + "learning_rate": 9.749894022891056e-07, + "loss": 0.481, + "mean_token_accuracy": 0.8441210985183716, + "num_tokens": 87928232.0, + "step": 2301 + }, + { + "epoch": 0.2928380613153543, + "ewc_loss": 0.005455439444631338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4554395319428295e-05, + "grad_norm": 3.1962196826934814, + "learning_rate": 9.754133107248834e-07, + "loss": 0.3991, + "mean_token_accuracy": 0.8661238551139832, + "num_tokens": 87966982.0, + "step": 2302 + }, + { + "epoch": 
0.2929652715939448, + "ewc_loss": 0.005453299731016159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.453299672808498e-05, + "grad_norm": 3.292586088180542, + "learning_rate": 9.758372191606612e-07, + "loss": 0.4844, + "mean_token_accuracy": 0.8386785984039307, + "num_tokens": 88003550.0, + "step": 2303 + }, + { + "epoch": 0.2930924818725353, + "ewc_loss": 0.00552013935521245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.520139166037552e-05, + "grad_norm": 3.221066951751709, + "learning_rate": 9.76261127596439e-07, + "loss": 0.3972, + "mean_token_accuracy": 0.8680177927017212, + "num_tokens": 88038038.0, + "step": 2304 + }, + { + "epoch": 0.2932196921511258, + "ewc_loss": 0.005447030533105135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.447030343930237e-05, + "grad_norm": 3.246034622192383, + "learning_rate": 9.76685036032217e-07, + "loss": 0.4655, + "mean_token_accuracy": 0.8434264063835144, + "num_tokens": 88074627.0, + "step": 2305 + }, + { + "epoch": 0.29334690242971634, + "ewc_loss": 0.005488532595336437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.488532406161539e-05, + "grad_norm": 3.1894469261169434, + "learning_rate": 9.771089444679948e-07, + "loss": 0.4121, + "mean_token_accuracy": 0.8627703189849854, + "num_tokens": 88116772.0, + "step": 2306 + }, + { + "epoch": 0.2934741127083068, + "ewc_loss": 0.00546632893383503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.466329093906097e-05, + "grad_norm": 3.325028419494629, + "learning_rate": 9.775328529037728e-07, + "loss": 0.4562, + "mean_token_accuracy": 0.8492940664291382, + "num_tokens": 88152888.0, + "step": 2307 + }, + { + "epoch": 0.29360132298689734, + "ewc_loss": 0.005529600661247969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5296004575211555e-05, + "grad_norm": 3.3287293910980225, + "learning_rate": 9.779567613395507e-07, + "loss": 0.4845, + "mean_token_accuracy": 0.8434639573097229, + "num_tokens": 88184544.0, + "step": 2308 + }, + { + "epoch": 0.29372853326548787, + 
"ewc_loss": 0.005507813300937414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5078133300412446e-05, + "grad_norm": 3.2106549739837646, + "learning_rate": 9.783806697753285e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.8586412668228149, + "num_tokens": 88221603.0, + "step": 2309 + }, + { + "epoch": 0.29385574354407834, + "ewc_loss": 0.005462614353746176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.462614353746176e-05, + "grad_norm": 3.1670114994049072, + "learning_rate": 9.788045782111064e-07, + "loss": 0.403, + "mean_token_accuracy": 0.8652461767196655, + "num_tokens": 88265109.0, + "step": 2310 + }, + { + "epoch": 0.29398295382266887, + "ewc_loss": 0.005474777426570654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.474777572089806e-05, + "grad_norm": 3.215790033340454, + "learning_rate": 9.792284866468842e-07, + "loss": 0.3946, + "mean_token_accuracy": 0.8647918701171875, + "num_tokens": 88302756.0, + "step": 2311 + }, + { + "epoch": 0.2941101641012594, + "ewc_loss": 0.005502553656697273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5025535402819514e-05, + "grad_norm": 3.2923049926757812, + "learning_rate": 9.79652395082662e-07, + "loss": 0.3935, + "mean_token_accuracy": 0.8643752336502075, + "num_tokens": 88335605.0, + "step": 2312 + }, + { + "epoch": 0.29423737437984987, + "ewc_loss": 0.005531833972781897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.53183417650871e-05, + "grad_norm": 3.3528568744659424, + "learning_rate": 9.8007630351844e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8477874994277954, + "num_tokens": 88369447.0, + "step": 2313 + }, + { + "epoch": 0.2943645846584404, + "ewc_loss": 0.005537873134016991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.537873221328482e-05, + "grad_norm": 3.364957332611084, + "learning_rate": 9.805002119542178e-07, + "loss": 0.4935, + "mean_token_accuracy": 0.8378316164016724, + "num_tokens": 88404442.0, + "step": 2314 + }, + { + "epoch": 0.2944917949370309, + "ewc_loss": 
0.005544800311326981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5448002967750654e-05, + "grad_norm": 3.2112932205200195, + "learning_rate": 9.809241203899958e-07, + "loss": 0.4402, + "mean_token_accuracy": 0.85671067237854, + "num_tokens": 88445870.0, + "step": 2315 + }, + { + "epoch": 0.2946190052156214, + "ewc_loss": 0.005472212564200163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.472212433232926e-05, + "grad_norm": 3.272940158843994, + "learning_rate": 9.813480288257737e-07, + "loss": 0.4523, + "mean_token_accuracy": 0.8462531566619873, + "num_tokens": 88480994.0, + "step": 2316 + }, + { + "epoch": 0.2947462154942119, + "ewc_loss": 0.005553851369768381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.553851224249229e-05, + "grad_norm": 3.190629243850708, + "learning_rate": 9.817719372615515e-07, + "loss": 0.4117, + "mean_token_accuracy": 0.8626706004142761, + "num_tokens": 88519666.0, + "step": 2317 + }, + { + "epoch": 0.29487342577280246, + "ewc_loss": 0.00549804512411356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4980453569442034e-05, + "grad_norm": 3.3118858337402344, + "learning_rate": 9.821958456973294e-07, + "loss": 0.3841, + "mean_token_accuracy": 0.8707633018493652, + "num_tokens": 88553855.0, + "step": 2318 + }, + { + "epoch": 0.29500063605139293, + "ewc_loss": 0.005572004243731499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.572004010900855e-05, + "grad_norm": 3.2696261405944824, + "learning_rate": 9.826197541331072e-07, + "loss": 0.3994, + "mean_token_accuracy": 0.8675953149795532, + "num_tokens": 88589053.0, + "step": 2319 + }, + { + "epoch": 0.29512784632998346, + "ewc_loss": 0.005516674369573593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.516674355021678e-05, + "grad_norm": 3.283822774887085, + "learning_rate": 9.83043662568885e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8499824404716492, + "num_tokens": 88627067.0, + "step": 2320 + }, + { + "epoch": 0.295255056608574, + "ewc_loss": 0.00555082643404603, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.550826244871132e-05, + "grad_norm": 3.179333209991455, + "learning_rate": 9.83467571004663e-07, + "loss": 0.4029, + "mean_token_accuracy": 0.8647415637969971, + "num_tokens": 88668030.0, + "step": 2321 + }, + { + "epoch": 0.29538226688716446, + "ewc_loss": 0.005483704619109631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4837048082845286e-05, + "grad_norm": 3.2470216751098633, + "learning_rate": 9.838914794404407e-07, + "loss": 0.4004, + "mean_token_accuracy": 0.8667639493942261, + "num_tokens": 88708889.0, + "step": 2322 + }, + { + "epoch": 0.295509477165755, + "ewc_loss": 0.0055551850236952305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5551849072799087e-05, + "grad_norm": 3.350954055786133, + "learning_rate": 9.843153878762188e-07, + "loss": 0.4688, + "mean_token_accuracy": 0.8478304147720337, + "num_tokens": 88745689.0, + "step": 2323 + }, + { + "epoch": 0.2956366874443455, + "ewc_loss": 0.005581042729318142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5810425692470744e-05, + "grad_norm": 3.264021158218384, + "learning_rate": 9.847392963119966e-07, + "loss": 0.4201, + "mean_token_accuracy": 0.8611904382705688, + "num_tokens": 88784667.0, + "step": 2324 + }, + { + "epoch": 0.295763897722936, + "ewc_loss": 0.0054947612807154655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.494761353475042e-05, + "grad_norm": 3.214796543121338, + "learning_rate": 9.851632047477745e-07, + "loss": 0.4323, + "mean_token_accuracy": 0.8553590774536133, + "num_tokens": 88827334.0, + "step": 2325 + }, + { + "epoch": 0.2958911080015265, + "ewc_loss": 0.00551581522449851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.515815064427443e-05, + "grad_norm": 3.199791431427002, + "learning_rate": 9.855871131835523e-07, + "loss": 0.3656, + "mean_token_accuracy": 0.8758775591850281, + "num_tokens": 88863744.0, + "step": 2326 + }, + { + "epoch": 0.29601831828011704, + "ewc_loss": 0.005509992595762014, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 5.5099924793466926e-05, + "grad_norm": 3.2803702354431152, + "learning_rate": 9.860110216193302e-07, + "loss": 0.3982, + "mean_token_accuracy": 0.8682398796081543, + "num_tokens": 88903693.0, + "step": 2327 + }, + { + "epoch": 0.2961455285587075, + "ewc_loss": 0.005551167763769627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.551167851081118e-05, + "grad_norm": 3.2825145721435547, + "learning_rate": 9.86434930055108e-07, + "loss": 0.3934, + "mean_token_accuracy": 0.8683429956436157, + "num_tokens": 88936668.0, + "step": 2328 + }, + { + "epoch": 0.29627273883729804, + "ewc_loss": 0.005529009737074375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.529009649762884e-05, + "grad_norm": 3.342139482498169, + "learning_rate": 9.868588384908859e-07, + "loss": 0.4031, + "mean_token_accuracy": 0.8621132373809814, + "num_tokens": 88970920.0, + "step": 2329 + }, + { + "epoch": 0.2963999491158886, + "ewc_loss": 0.005561677739024162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.561677971854806e-05, + "grad_norm": 3.2463178634643555, + "learning_rate": 9.872827469266637e-07, + "loss": 0.4067, + "mean_token_accuracy": 0.8615843057632446, + "num_tokens": 89010856.0, + "step": 2330 + }, + { + "epoch": 0.2965271593944791, + "ewc_loss": 0.005480736494064331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4807365813758224e-05, + "grad_norm": 3.2639286518096924, + "learning_rate": 9.877066553624418e-07, + "loss": 0.4138, + "mean_token_accuracy": 0.8617295026779175, + "num_tokens": 89047109.0, + "step": 2331 + }, + { + "epoch": 0.2966543696730696, + "ewc_loss": 0.005515233613550663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5152337154140696e-05, + "grad_norm": 3.208864212036133, + "learning_rate": 9.881305637982196e-07, + "loss": 0.4228, + "mean_token_accuracy": 0.8622084856033325, + "num_tokens": 89087859.0, + "step": 2332 + }, + { + "epoch": 0.2967815799516601, + "ewc_loss": 0.0054802605882287025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.480260369949974e-05, + "grad_norm": 3.27970814704895, + "learning_rate": 9.885544722339975e-07, + "loss": 0.4039, + "mean_token_accuracy": 0.8642889261245728, + "num_tokens": 89123601.0, + "step": 2333 + }, + { + "epoch": 0.29690879023025063, + "ewc_loss": 0.005534825846552849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.534826050279662e-05, + "grad_norm": 3.2520599365234375, + "learning_rate": 9.889783806697753e-07, + "loss": 0.3879, + "mean_token_accuracy": 0.8712567090988159, + "num_tokens": 89160133.0, + "step": 2334 + }, + { + "epoch": 0.2970360005088411, + "ewc_loss": 0.005492835305631161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.492835407494567e-05, + "grad_norm": 3.288323402404785, + "learning_rate": 9.894022891055532e-07, + "loss": 0.3828, + "mean_token_accuracy": 0.870658814907074, + "num_tokens": 89196731.0, + "step": 2335 + }, + { + "epoch": 0.29716321078743163, + "ewc_loss": 0.005525975953787565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.52597593923565e-05, + "grad_norm": 3.239522695541382, + "learning_rate": 9.89826197541331e-07, + "loss": 0.3717, + "mean_token_accuracy": 0.8758172392845154, + "num_tokens": 89230159.0, + "step": 2336 + }, + { + "epoch": 0.29729042106602216, + "ewc_loss": 0.005510114133358002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.51011435163673e-05, + "grad_norm": 3.324653387069702, + "learning_rate": 9.902501059771089e-07, + "loss": 0.4347, + "mean_token_accuracy": 0.8544222116470337, + "num_tokens": 89264648.0, + "step": 2337 + }, + { + "epoch": 0.29741763134461263, + "ewc_loss": 0.005541077349334955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5410775530617684e-05, + "grad_norm": 3.2403664588928223, + "learning_rate": 9.906740144128867e-07, + "loss": 0.432, + "mean_token_accuracy": 0.8549025058746338, + "num_tokens": 89302505.0, + "step": 2338 + }, + { + "epoch": 0.29754484162320316, + "ewc_loss": 0.005490829236805439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.490829062182456e-05, + 
"grad_norm": 3.3144989013671875, + "learning_rate": 9.910979228486648e-07, + "loss": 0.3918, + "mean_token_accuracy": 0.8677263259887695, + "num_tokens": 89338079.0, + "step": 2339 + }, + { + "epoch": 0.2976720519017937, + "ewc_loss": 0.005558909848332405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.55890983378049e-05, + "grad_norm": 3.254749298095703, + "learning_rate": 9.915218312844426e-07, + "loss": 0.4563, + "mean_token_accuracy": 0.852836012840271, + "num_tokens": 89376717.0, + "step": 2340 + }, + { + "epoch": 0.29779926218038416, + "ewc_loss": 0.005502395797520876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5023960157996044e-05, + "grad_norm": 3.2148451805114746, + "learning_rate": 9.919457397202205e-07, + "loss": 0.4479, + "mean_token_accuracy": 0.8511582612991333, + "num_tokens": 89416164.0, + "step": 2341 + }, + { + "epoch": 0.2979264724589747, + "ewc_loss": 0.005524795968085527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5247957789106295e-05, + "grad_norm": 3.3530423641204834, + "learning_rate": 9.923696481559983e-07, + "loss": 0.4042, + "mean_token_accuracy": 0.8653469085693359, + "num_tokens": 89446491.0, + "step": 2342 + }, + { + "epoch": 0.2980536827375652, + "ewc_loss": 0.005611380096524954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.611380038317293e-05, + "grad_norm": 3.325599431991577, + "learning_rate": 9.927935565917761e-07, + "loss": 0.4504, + "mean_token_accuracy": 0.848400354385376, + "num_tokens": 89483637.0, + "step": 2343 + }, + { + "epoch": 0.2981808930161557, + "ewc_loss": 0.0055575440637767315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5575441365363076e-05, + "grad_norm": 3.214033603668213, + "learning_rate": 9.93217465027554e-07, + "loss": 0.4436, + "mean_token_accuracy": 0.8564590215682983, + "num_tokens": 89526532.0, + "step": 2344 + }, + { + "epoch": 0.2983081032947462, + "ewc_loss": 0.005523218773305416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.523218715097755e-05, + "grad_norm": 
3.425029754638672, + "learning_rate": 9.936413734633318e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8400294780731201, + "num_tokens": 89558353.0, + "step": 2345 + }, + { + "epoch": 0.29843531357333675, + "ewc_loss": 0.005676670931279659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.676670843968168e-05, + "grad_norm": 3.2189674377441406, + "learning_rate": 9.940652818991097e-07, + "loss": 0.3953, + "mean_token_accuracy": 0.8679221868515015, + "num_tokens": 89598727.0, + "step": 2346 + }, + { + "epoch": 0.2985625238519272, + "ewc_loss": 0.005518010817468166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5180109484354034e-05, + "grad_norm": 3.254148244857788, + "learning_rate": 9.944891903348877e-07, + "loss": 0.4614, + "mean_token_accuracy": 0.8477592468261719, + "num_tokens": 89640093.0, + "step": 2347 + }, + { + "epoch": 0.29868973413051775, + "ewc_loss": 0.005592389963567257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.592389788944274e-05, + "grad_norm": 3.1995553970336914, + "learning_rate": 9.949130987706656e-07, + "loss": 0.4087, + "mean_token_accuracy": 0.8620764017105103, + "num_tokens": 89681401.0, + "step": 2348 + }, + { + "epoch": 0.2988169444091083, + "ewc_loss": 0.005567799787968397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.567799598793499e-05, + "grad_norm": 3.250537872314453, + "learning_rate": 9.953370072064432e-07, + "loss": 0.4014, + "mean_token_accuracy": 0.8648903369903564, + "num_tokens": 89720366.0, + "step": 2349 + }, + { + "epoch": 0.29894415468769875, + "ewc_loss": 0.005592403933405876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.592403977061622e-05, + "grad_norm": 3.223975896835327, + "learning_rate": 9.957609156422213e-07, + "loss": 0.4784, + "mean_token_accuracy": 0.8423142433166504, + "num_tokens": 89762234.0, + "step": 2350 + }, + { + "epoch": 0.2990713649662893, + "ewc_loss": 0.005565573927015066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5655738833593205e-05, + "grad_norm": 3.4240965843200684, + 
"learning_rate": 9.961848240779991e-07, + "loss": 0.4244, + "mean_token_accuracy": 0.8590734601020813, + "num_tokens": 89792208.0, + "step": 2351 + }, + { + "epoch": 0.2991985752448798, + "ewc_loss": 0.005678927060216665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.678927118424326e-05, + "grad_norm": 3.237443447113037, + "learning_rate": 9.96608732513777e-07, + "loss": 0.4044, + "mean_token_accuracy": 0.8686829805374146, + "num_tokens": 89833246.0, + "step": 2352 + }, + { + "epoch": 0.2993257855234703, + "ewc_loss": 0.0055205002427101135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.520500417333096e-05, + "grad_norm": 3.222029209136963, + "learning_rate": 9.970326409495548e-07, + "loss": 0.4599, + "mean_token_accuracy": 0.8467603921890259, + "num_tokens": 89873271.0, + "step": 2353 + }, + { + "epoch": 0.2994529958020608, + "ewc_loss": 0.005581743083894253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5817428801674396e-05, + "grad_norm": 3.2668721675872803, + "learning_rate": 9.974565493853327e-07, + "loss": 0.4745, + "mean_token_accuracy": 0.8424758315086365, + "num_tokens": 89913069.0, + "step": 2354 + }, + { + "epoch": 0.29958020608065133, + "ewc_loss": 0.005614182446151972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.614182373392396e-05, + "grad_norm": 3.360454797744751, + "learning_rate": 9.978804578211107e-07, + "loss": 0.4582, + "mean_token_accuracy": 0.8482780456542969, + "num_tokens": 89948915.0, + "step": 2355 + }, + { + "epoch": 0.2997074163592418, + "ewc_loss": 0.005633464548736811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.633464388665743e-05, + "grad_norm": 3.373880386352539, + "learning_rate": 9.983043662568886e-07, + "loss": 0.4551, + "mean_token_accuracy": 0.8513765335083008, + "num_tokens": 89982123.0, + "step": 2356 + }, + { + "epoch": 0.29983462663783234, + "ewc_loss": 0.005632517393678427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.632517422782257e-05, + "grad_norm": 3.257075786590576, + "learning_rate": 
9.987282746926662e-07, + "loss": 0.4086, + "mean_token_accuracy": 0.8641411662101746, + "num_tokens": 90018259.0, + "step": 2357 + }, + { + "epoch": 0.29996183691642286, + "ewc_loss": 0.00557503430172801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.575034083449282e-05, + "grad_norm": 3.246023178100586, + "learning_rate": 9.991521831284443e-07, + "loss": 0.4254, + "mean_token_accuracy": 0.8590424060821533, + "num_tokens": 90059041.0, + "step": 2358 + }, + { + "epoch": 0.30008904719501334, + "ewc_loss": 0.005598565563559532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.598565621767193e-05, + "grad_norm": 3.281029462814331, + "learning_rate": 9.995760915642221e-07, + "loss": 0.3759, + "mean_token_accuracy": 0.8743548393249512, + "num_tokens": 90089463.0, + "step": 2359 + }, + { + "epoch": 0.30021625747360386, + "ewc_loss": 0.0056212907657027245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.621290983981453e-05, + "grad_norm": 3.2751379013061523, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8644177913665771, + "num_tokens": 90125691.0, + "step": 2360 + }, + { + "epoch": 0.3003434677521944, + "ewc_loss": 0.0056030941195786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.603094177786261e-05, + "grad_norm": 3.275521755218506, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8553894758224487, + "num_tokens": 90161778.0, + "step": 2361 + }, + { + "epoch": 0.30047067803078487, + "ewc_loss": 0.00561496801674366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6149681768147275e-05, + "grad_norm": 3.262909173965454, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8432692289352417, + "num_tokens": 90202243.0, + "step": 2362 + }, + { + "epoch": 0.3005978883093754, + "ewc_loss": 0.0056068142876029015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.606814374914393e-05, + "grad_norm": 3.2827444076538086, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8482522368431091, + 
"num_tokens": 90248924.0, + "step": 2363 + }, + { + "epoch": 0.3007250985879659, + "ewc_loss": 0.005626667756587267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6266679166583344e-05, + "grad_norm": 3.4112303256988525, + "learning_rate": 1e-06, + "loss": 0.5422, + "mean_token_accuracy": 0.8287047147750854, + "num_tokens": 90281988.0, + "step": 2364 + }, + { + "epoch": 0.3008523088665564, + "ewc_loss": 0.005683234427124262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.683234485331923e-05, + "grad_norm": 3.318730115890503, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8530362844467163, + "num_tokens": 90317050.0, + "step": 2365 + }, + { + "epoch": 0.3009795191451469, + "ewc_loss": 0.00560698052868247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.606980630545877e-05, + "grad_norm": 3.1943397521972656, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8642319440841675, + "num_tokens": 90358064.0, + "step": 2366 + }, + { + "epoch": 0.30110672942373745, + "ewc_loss": 0.005586204119026661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.586204133578576e-05, + "grad_norm": 3.258330821990967, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8453841209411621, + "num_tokens": 90400692.0, + "step": 2367 + }, + { + "epoch": 0.3012339397023279, + "ewc_loss": 0.005651313345879316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.651313404086977e-05, + "grad_norm": 3.3901712894439697, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8488163948059082, + "num_tokens": 90432207.0, + "step": 2368 + }, + { + "epoch": 0.30136114998091845, + "ewc_loss": 0.005690970923751593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.690971011063084e-05, + "grad_norm": 3.3626275062561035, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8534431457519531, + "num_tokens": 90466081.0, + "step": 2369 + }, + { + "epoch": 0.301488360259509, + "ewc_loss": 0.00565072987228632, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.65072987228632e-05, + "grad_norm": 3.3021631240844727, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8635096549987793, + "num_tokens": 90501963.0, + "step": 2370 + }, + { + "epoch": 0.30161557053809945, + "ewc_loss": 0.005645412486046553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6454126024618745e-05, + "grad_norm": 3.294761896133423, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8676998615264893, + "num_tokens": 90540651.0, + "step": 2371 + }, + { + "epoch": 0.30174278081669, + "ewc_loss": 0.005645043216645718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.645042983815074e-05, + "grad_norm": 3.3018436431884766, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.847461462020874, + "num_tokens": 90585105.0, + "step": 2372 + }, + { + "epoch": 0.3018699910952805, + "ewc_loss": 0.0056538050994277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6538050557719544e-05, + "grad_norm": 3.2661595344543457, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.851925253868103, + "num_tokens": 90626773.0, + "step": 2373 + }, + { + "epoch": 0.301997201373871, + "ewc_loss": 0.005623882170766592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.623882316285744e-05, + "grad_norm": 3.2931089401245117, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8711495399475098, + "num_tokens": 90665974.0, + "step": 2374 + }, + { + "epoch": 0.3021244116524615, + "ewc_loss": 0.005642902106046677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6429020332871005e-05, + "grad_norm": 3.283458948135376, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8514101505279541, + "num_tokens": 90706801.0, + "step": 2375 + }, + { + "epoch": 0.30225162193105204, + "ewc_loss": 0.00562875485047698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.628754661302082e-05, + "grad_norm": 3.3034348487854004, + "learning_rate": 1e-06, 
+ "loss": 0.4507, + "mean_token_accuracy": 0.8518772125244141, + "num_tokens": 90743635.0, + "step": 2376 + }, + { + "epoch": 0.3023788322096425, + "ewc_loss": 0.005639790557324886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.639790470013395e-05, + "grad_norm": 3.315664768218994, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8571146726608276, + "num_tokens": 90782797.0, + "step": 2377 + }, + { + "epoch": 0.30250604248823304, + "ewc_loss": 0.005623877048492432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6238772231154144e-05, + "grad_norm": 3.281376361846924, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8493092060089111, + "num_tokens": 90822661.0, + "step": 2378 + }, + { + "epoch": 0.30263325276682357, + "ewc_loss": 0.005613221786916256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.613221583189443e-05, + "grad_norm": 3.360988140106201, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8379192352294922, + "num_tokens": 90857804.0, + "step": 2379 + }, + { + "epoch": 0.3027604630454141, + "ewc_loss": 0.005658386275172234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.658386362483725e-05, + "grad_norm": 3.247372627258301, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8551731109619141, + "num_tokens": 90895778.0, + "step": 2380 + }, + { + "epoch": 0.30288767332400457, + "ewc_loss": 0.005581045988947153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5810462072258815e-05, + "grad_norm": 3.263885021209717, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8636220097541809, + "num_tokens": 90934463.0, + "step": 2381 + }, + { + "epoch": 0.3030148836025951, + "ewc_loss": 0.0056381537579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6381537433480844e-05, + "grad_norm": 3.2830119132995605, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8596174716949463, + "num_tokens": 90967888.0, + "step": 2382 + }, + { + "epoch": 
0.3031420938811856, + "ewc_loss": 0.005624021403491497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.624021287076175e-05, + "grad_norm": 3.2871837615966797, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8638540506362915, + "num_tokens": 91005200.0, + "step": 2383 + }, + { + "epoch": 0.3032693041597761, + "ewc_loss": 0.005621940828859806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6219407269963995e-05, + "grad_norm": 3.3285605907440186, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8395971059799194, + "num_tokens": 91037840.0, + "step": 2384 + }, + { + "epoch": 0.3033965144383666, + "ewc_loss": 0.005660995841026306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.660995884682052e-05, + "grad_norm": 3.320382833480835, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8433273434638977, + "num_tokens": 91073336.0, + "step": 2385 + }, + { + "epoch": 0.30352372471695716, + "ewc_loss": 0.005650085862725973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6500859500374645e-05, + "grad_norm": 3.327773332595825, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8566906452178955, + "num_tokens": 91107001.0, + "step": 2386 + }, + { + "epoch": 0.30365093499554763, + "ewc_loss": 0.005660084076225758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.660083843395114e-05, + "grad_norm": 3.244804859161377, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8634342551231384, + "num_tokens": 91143937.0, + "step": 2387 + }, + { + "epoch": 0.30377814527413816, + "ewc_loss": 0.005626818630844355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.626818528980948e-05, + "grad_norm": 3.269449472427368, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8472968339920044, + "num_tokens": 91185391.0, + "step": 2388 + }, + { + "epoch": 0.3039053555527287, + "ewc_loss": 0.005655169952660799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.655170025420375e-05, + "grad_norm": 3.210531234741211, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8555774688720703, + "num_tokens": 91227653.0, + "step": 2389 + }, + { + "epoch": 0.30403256583131916, + "ewc_loss": 0.005621067713946104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6210676120826975e-05, + "grad_norm": 3.315967321395874, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8757932782173157, + "num_tokens": 91263610.0, + "step": 2390 + }, + { + "epoch": 0.3041597761099097, + "ewc_loss": 0.005698664113879204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.69866388104856e-05, + "grad_norm": 3.359593629837036, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8509519696235657, + "num_tokens": 91296880.0, + "step": 2391 + }, + { + "epoch": 0.3042869863885002, + "ewc_loss": 0.005698624532669783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6986245908774436e-05, + "grad_norm": 3.304720163345337, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8448373079299927, + "num_tokens": 91336242.0, + "step": 2392 + }, + { + "epoch": 0.3044141966670907, + "ewc_loss": 0.0056492420844733715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6492423027521e-05, + "grad_norm": 3.2267022132873535, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8641155958175659, + "num_tokens": 91376527.0, + "step": 2393 + }, + { + "epoch": 0.3045414069456812, + "ewc_loss": 0.005626516416668892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6265165767399594e-05, + "grad_norm": 3.375025749206543, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8477879762649536, + "num_tokens": 91408415.0, + "step": 2394 + }, + { + "epoch": 0.30466861722427174, + "ewc_loss": 0.0057363687083125114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.73636862100102e-05, + "grad_norm": 3.2217299938201904, + "learning_rate": 1e-06, + "loss": 0.4287, + 
"mean_token_accuracy": 0.8577285408973694, + "num_tokens": 91449189.0, + "step": 2395 + }, + { + "epoch": 0.3047958275028622, + "ewc_loss": 0.005606275983154774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.606275954050943e-05, + "grad_norm": 3.2545270919799805, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.85331130027771, + "num_tokens": 91488599.0, + "step": 2396 + }, + { + "epoch": 0.30492303778145274, + "ewc_loss": 0.00567345367744565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.673453779309057e-05, + "grad_norm": 3.2501540184020996, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8599871397018433, + "num_tokens": 91525138.0, + "step": 2397 + }, + { + "epoch": 0.30505024806004327, + "ewc_loss": 0.0056524453684687614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.652445543091744e-05, + "grad_norm": 3.255500555038452, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8577371835708618, + "num_tokens": 91567444.0, + "step": 2398 + }, + { + "epoch": 0.30517745833863374, + "ewc_loss": 0.005659007932990789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.659007729263976e-05, + "grad_norm": 3.371992826461792, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8537670969963074, + "num_tokens": 91602926.0, + "step": 2399 + }, + { + "epoch": 0.3053046686172243, + "ewc_loss": 0.005708486773073673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7084867876255885e-05, + "grad_norm": 3.3317148685455322, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8681079149246216, + "num_tokens": 91635901.0, + "step": 2400 + }, + { + "epoch": 0.3054318788958148, + "ewc_loss": 0.0056695169769227505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.669517122441903e-05, + "grad_norm": 3.2414307594299316, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8587371110916138, + "num_tokens": 91679542.0, + "step": 2401 + }, + { + "epoch": 
0.3055590891744053, + "ewc_loss": 0.005619802046567202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.61980195925571e-05, + "grad_norm": 3.252606153488159, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.850553035736084, + "num_tokens": 91723022.0, + "step": 2402 + }, + { + "epoch": 0.3056862994529958, + "ewc_loss": 0.005646074190735817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.646073987009004e-05, + "grad_norm": 3.313481569290161, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8521443605422974, + "num_tokens": 91760664.0, + "step": 2403 + }, + { + "epoch": 0.30581350973158633, + "ewc_loss": 0.005667475517839193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.667475488735363e-05, + "grad_norm": 3.272549867630005, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8512112498283386, + "num_tokens": 91805570.0, + "step": 2404 + }, + { + "epoch": 0.3059407200101768, + "ewc_loss": 0.005621051415801048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6210512411780655e-05, + "grad_norm": 3.2959961891174316, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8483580946922302, + "num_tokens": 91843342.0, + "step": 2405 + }, + { + "epoch": 0.30606793028876733, + "ewc_loss": 0.005644417833536863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.644417615258135e-05, + "grad_norm": 3.2815723419189453, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8466914892196655, + "num_tokens": 91883872.0, + "step": 2406 + }, + { + "epoch": 0.30619514056735786, + "ewc_loss": 0.005619707517325878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6197077356046066e-05, + "grad_norm": 3.247528076171875, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8546078205108643, + "num_tokens": 91923174.0, + "step": 2407 + }, + { + "epoch": 0.30632235084594833, + "ewc_loss": 0.005610611755400896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.610611697193235e-05, + "grad_norm": 3.2989156246185303, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8546451926231384, + "num_tokens": 91963462.0, + "step": 2408 + }, + { + "epoch": 0.30644956112453886, + "ewc_loss": 0.005645085591822863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.645085548167117e-05, + "grad_norm": 3.310177803039551, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.859302818775177, + "num_tokens": 92000907.0, + "step": 2409 + }, + { + "epoch": 0.3065767714031294, + "ewc_loss": 0.005630497820675373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.63049761694856e-05, + "grad_norm": 3.3231630325317383, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8586440086364746, + "num_tokens": 92036110.0, + "step": 2410 + }, + { + "epoch": 0.30670398168171986, + "ewc_loss": 0.005632202140986919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.632202373817563e-05, + "grad_norm": 3.2576310634613037, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8519536256790161, + "num_tokens": 92080647.0, + "step": 2411 + }, + { + "epoch": 0.3068311919603104, + "ewc_loss": 0.00559683283790946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.596832852461375e-05, + "grad_norm": 3.313420057296753, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8606313467025757, + "num_tokens": 92115407.0, + "step": 2412 + }, + { + "epoch": 0.3069584022389009, + "ewc_loss": 0.005641613155603409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.641613097395748e-05, + "grad_norm": 3.316781759262085, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8559764623641968, + "num_tokens": 92149660.0, + "step": 2413 + }, + { + "epoch": 0.3070856125174914, + "ewc_loss": 0.005641154479235411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6411543482681736e-05, + "grad_norm": 3.2669167518615723, + "learning_rate": 1e-06, + "loss": 0.422, + 
"mean_token_accuracy": 0.858983039855957, + "num_tokens": 92191581.0, + "step": 2414 + }, + { + "epoch": 0.3072128227960819, + "ewc_loss": 0.0056187729351222515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.618773138849065e-05, + "grad_norm": 3.266306161880493, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8509963750839233, + "num_tokens": 92230435.0, + "step": 2415 + }, + { + "epoch": 0.30734003307467245, + "ewc_loss": 0.005632264539599419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.632264583255164e-05, + "grad_norm": 3.2883288860321045, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8435931205749512, + "num_tokens": 92269545.0, + "step": 2416 + }, + { + "epoch": 0.3074672433532629, + "ewc_loss": 0.005648669321089983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.648669321089983e-05, + "grad_norm": 3.2514383792877197, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8638468980789185, + "num_tokens": 92307447.0, + "step": 2417 + }, + { + "epoch": 0.30759445363185345, + "ewc_loss": 0.0056312065571546555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.631206659018062e-05, + "grad_norm": 3.268786668777466, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8537929058074951, + "num_tokens": 92349441.0, + "step": 2418 + }, + { + "epoch": 0.307721663910444, + "ewc_loss": 0.005642680451273918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6426804803777486e-05, + "grad_norm": 3.2997612953186035, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8607590198516846, + "num_tokens": 92380405.0, + "step": 2419 + }, + { + "epoch": 0.30784887418903445, + "ewc_loss": 0.005668234545737505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.668234371114522e-05, + "grad_norm": 3.363403797149658, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8543831706047058, + "num_tokens": 92419596.0, + "step": 2420 + }, + { + "epoch": 
0.307976084467625, + "ewc_loss": 0.0056744529865682125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.674452768289484e-05, + "grad_norm": 3.282837390899658, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8489577770233154, + "num_tokens": 92454590.0, + "step": 2421 + }, + { + "epoch": 0.3081032947462155, + "ewc_loss": 0.005628956947475672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6289569329237565e-05, + "grad_norm": 3.3062589168548584, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8525803089141846, + "num_tokens": 92491640.0, + "step": 2422 + }, + { + "epoch": 0.308230505024806, + "ewc_loss": 0.0056649865582585335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6649867474334314e-05, + "grad_norm": 3.306076765060425, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8481704592704773, + "num_tokens": 92528682.0, + "step": 2423 + }, + { + "epoch": 0.3083577153033965, + "ewc_loss": 0.005658084060996771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6580840464448556e-05, + "grad_norm": 3.293186902999878, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8626534938812256, + "num_tokens": 92566368.0, + "step": 2424 + }, + { + "epoch": 0.30848492558198704, + "ewc_loss": 0.005649300757795572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.649300874210894e-05, + "grad_norm": 3.3788843154907227, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8356893062591553, + "num_tokens": 92601380.0, + "step": 2425 + }, + { + "epoch": 0.3086121358605775, + "ewc_loss": 0.005697397980839014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.697397864423692e-05, + "grad_norm": 3.2491652965545654, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8721402287483215, + "num_tokens": 92639724.0, + "step": 2426 + }, + { + "epoch": 0.30873934613916804, + "ewc_loss": 0.005616567563265562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.616567432298325e-05, + "grad_norm": 3.2546589374542236, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8607677817344666, + "num_tokens": 92677450.0, + "step": 2427 + }, + { + "epoch": 0.30886655641775856, + "ewc_loss": 0.005650050472468138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.650050661643036e-05, + "grad_norm": 3.2458581924438477, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8654888272285461, + "num_tokens": 92714681.0, + "step": 2428 + }, + { + "epoch": 0.3089937666963491, + "ewc_loss": 0.005655490327626467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.655490167555399e-05, + "grad_norm": 3.330275058746338, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8527697324752808, + "num_tokens": 92749925.0, + "step": 2429 + }, + { + "epoch": 0.30912097697493957, + "ewc_loss": 0.005700478795915842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.700478868675418e-05, + "grad_norm": 3.264997959136963, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8601089715957642, + "num_tokens": 92791720.0, + "step": 2430 + }, + { + "epoch": 0.3092481872535301, + "ewc_loss": 0.005642237141728401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.642237374559045e-05, + "grad_norm": 3.3662779331207275, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8486920595169067, + "num_tokens": 92826179.0, + "step": 2431 + }, + { + "epoch": 0.3093753975321206, + "ewc_loss": 0.005726232659071684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.726232484448701e-05, + "grad_norm": 3.211303234100342, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8550878763198853, + "num_tokens": 92872751.0, + "step": 2432 + }, + { + "epoch": 0.3095026078107111, + "ewc_loss": 0.005621026270091534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.621026502922177e-05, + "grad_norm": 3.5747122764587402, + "learning_rate": 1e-06, + "loss": 0.4467, + 
"mean_token_accuracy": 0.8518901467323303, + "num_tokens": 92907470.0, + "step": 2433 + }, + { + "epoch": 0.3096298180893016, + "ewc_loss": 0.005860176868736744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.860177043359727e-05, + "grad_norm": 3.3365490436553955, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8613872528076172, + "num_tokens": 92943321.0, + "step": 2434 + }, + { + "epoch": 0.30975702836789215, + "ewc_loss": 0.0056270393542945385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6270393542945385e-05, + "grad_norm": 3.297353506088257, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8578664064407349, + "num_tokens": 92982136.0, + "step": 2435 + }, + { + "epoch": 0.3098842386464826, + "ewc_loss": 0.00567691121250391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.676911314367317e-05, + "grad_norm": 3.441957950592041, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.846221923828125, + "num_tokens": 93015974.0, + "step": 2436 + }, + { + "epoch": 0.31001144892507315, + "ewc_loss": 0.0057744234800338745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.774423698312603e-05, + "grad_norm": 3.258073568344116, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.859439492225647, + "num_tokens": 93054010.0, + "step": 2437 + }, + { + "epoch": 0.3101386592036637, + "ewc_loss": 0.005609461572021246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.609461368294433e-05, + "grad_norm": 3.2664711475372314, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8641376495361328, + "num_tokens": 93090226.0, + "step": 2438 + }, + { + "epoch": 0.31026586948225415, + "ewc_loss": 0.005683251656591892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.683251583832316e-05, + "grad_norm": 3.2809457778930664, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8537032604217529, + "num_tokens": 93130242.0, + "step": 2439 + }, + { + "epoch": 
0.3103930797608447, + "ewc_loss": 0.0056822942569851875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.68229443160817e-05, + "grad_norm": 3.302940607070923, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8516618609428406, + "num_tokens": 93170990.0, + "step": 2440 + }, + { + "epoch": 0.3105202900394352, + "ewc_loss": 0.005685266572982073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.685266660293564e-05, + "grad_norm": 3.3307201862335205, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8480716347694397, + "num_tokens": 93205478.0, + "step": 2441 + }, + { + "epoch": 0.3106475003180257, + "ewc_loss": 0.005712196696549654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7121967984130606e-05, + "grad_norm": 3.3241732120513916, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8472548723220825, + "num_tokens": 93240952.0, + "step": 2442 + }, + { + "epoch": 0.3107747105966162, + "ewc_loss": 0.005705579649657011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7055796787608415e-05, + "grad_norm": 3.3057045936584473, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8457311987876892, + "num_tokens": 93278882.0, + "step": 2443 + }, + { + "epoch": 0.31090192087520674, + "ewc_loss": 0.005693140439689159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.693140337825753e-05, + "grad_norm": 3.235058069229126, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8481190204620361, + "num_tokens": 93324573.0, + "step": 2444 + }, + { + "epoch": 0.3110291311537972, + "ewc_loss": 0.005675598047673702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6755980040179566e-05, + "grad_norm": 3.2721383571624756, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8691537976264954, + "num_tokens": 93361049.0, + "step": 2445 + }, + { + "epoch": 0.31115634143238774, + "ewc_loss": 0.005708749406039715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.7087494496954605e-05, + "grad_norm": 3.263235092163086, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8528969287872314, + "num_tokens": 93401748.0, + "step": 2446 + }, + { + "epoch": 0.31128355171097827, + "ewc_loss": 0.005704361479729414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.704361683456227e-05, + "grad_norm": 3.356995105743408, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8666922450065613, + "num_tokens": 93434857.0, + "step": 2447 + }, + { + "epoch": 0.31141076198956874, + "ewc_loss": 0.005741938482969999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7419383665546775e-05, + "grad_norm": 3.254417896270752, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8552985787391663, + "num_tokens": 93474681.0, + "step": 2448 + }, + { + "epoch": 0.31153797226815927, + "ewc_loss": 0.0056643616408109665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.664361742674373e-05, + "grad_norm": 3.3113527297973633, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.855880081653595, + "num_tokens": 93510627.0, + "step": 2449 + }, + { + "epoch": 0.3116651825467498, + "ewc_loss": 0.005724495742470026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7244957133661956e-05, + "grad_norm": 3.26161527633667, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8466893434524536, + "num_tokens": 93551598.0, + "step": 2450 + }, + { + "epoch": 0.31179239282534027, + "ewc_loss": 0.00568786496296525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.687864904757589e-05, + "grad_norm": 3.354346513748169, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8479949235916138, + "num_tokens": 93586622.0, + "step": 2451 + }, + { + "epoch": 0.3119196031039308, + "ewc_loss": 0.005748243536800146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.748243347625248e-05, + "grad_norm": 3.2687580585479736, + "learning_rate": 1e-06, + "loss": 0.4851, + 
"mean_token_accuracy": 0.845215916633606, + "num_tokens": 93626496.0, + "step": 2452 + }, + { + "epoch": 0.3120468133825213, + "ewc_loss": 0.005691853817552328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.691853948519565e-05, + "grad_norm": 3.3028292655944824, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8559187650680542, + "num_tokens": 93665641.0, + "step": 2453 + }, + { + "epoch": 0.3121740236611118, + "ewc_loss": 0.005722224712371826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.722224886994809e-05, + "grad_norm": 3.2994225025177, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8604337573051453, + "num_tokens": 93702135.0, + "step": 2454 + }, + { + "epoch": 0.3123012339397023, + "ewc_loss": 0.005703904200345278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.703904389520176e-05, + "grad_norm": 3.334592819213867, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.861510157585144, + "num_tokens": 93738226.0, + "step": 2455 + }, + { + "epoch": 0.31242844421829286, + "ewc_loss": 0.005720372777432203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7203727919841185e-05, + "grad_norm": 3.2603261470794678, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8577821254730225, + "num_tokens": 93780273.0, + "step": 2456 + }, + { + "epoch": 0.31255565449688333, + "ewc_loss": 0.0056607360020279884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6607361329952255e-05, + "grad_norm": 3.2868053913116455, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8434073328971863, + "num_tokens": 93821977.0, + "step": 2457 + }, + { + "epoch": 0.31268286477547386, + "ewc_loss": 0.005695668514817953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.695668369298801e-05, + "grad_norm": 3.3176865577697754, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8643792867660522, + "num_tokens": 93856084.0, + "step": 2458 + }, + { + "epoch": 
0.3128100750540644, + "ewc_loss": 0.005692776292562485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.692776176147163e-05, + "grad_norm": 3.3002214431762695, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8585218191146851, + "num_tokens": 93893491.0, + "step": 2459 + }, + { + "epoch": 0.31293728533265486, + "ewc_loss": 0.005673177074640989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.673176929121837e-05, + "grad_norm": 3.2198288440704346, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8434721231460571, + "num_tokens": 93940240.0, + "step": 2460 + }, + { + "epoch": 0.3130644956112454, + "ewc_loss": 0.005653003230690956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6530032452428713e-05, + "grad_norm": 3.3713693618774414, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8488320112228394, + "num_tokens": 93974504.0, + "step": 2461 + }, + { + "epoch": 0.3131917058898359, + "ewc_loss": 0.00574604282155633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7460427342448384e-05, + "grad_norm": 3.3202080726623535, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8579083681106567, + "num_tokens": 94009208.0, + "step": 2462 + }, + { + "epoch": 0.3133189161684264, + "ewc_loss": 0.005679414141923189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.679414243786596e-05, + "grad_norm": 3.2678921222686768, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8560885190963745, + "num_tokens": 94049249.0, + "step": 2463 + }, + { + "epoch": 0.3134461264470169, + "ewc_loss": 0.0056596845388412476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.659684757119976e-05, + "grad_norm": 3.3404009342193604, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8571261167526245, + "num_tokens": 94085553.0, + "step": 2464 + }, + { + "epoch": 0.31357333672560744, + "ewc_loss": 0.005723914131522179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.7239140005549416e-05, + "grad_norm": 3.2985870838165283, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8350011706352234, + "num_tokens": 94127531.0, + "step": 2465 + }, + { + "epoch": 0.3137005470041979, + "ewc_loss": 0.005663691554218531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.663691626978107e-05, + "grad_norm": 3.224494457244873, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8683292269706726, + "num_tokens": 94171191.0, + "step": 2466 + }, + { + "epoch": 0.31382775728278844, + "ewc_loss": 0.005654279142618179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6542790844105184e-05, + "grad_norm": 3.2639071941375732, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8604642152786255, + "num_tokens": 94212090.0, + "step": 2467 + }, + { + "epoch": 0.313954967561379, + "ewc_loss": 0.0056825620122253895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.682562186848372e-05, + "grad_norm": 3.282454252243042, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8734861016273499, + "num_tokens": 94246835.0, + "step": 2468 + }, + { + "epoch": 0.31408217783996945, + "ewc_loss": 0.005666013807058334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6660137488506734e-05, + "grad_norm": 3.338425397872925, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8543081283569336, + "num_tokens": 94284581.0, + "step": 2469 + }, + { + "epoch": 0.31420938811856, + "ewc_loss": 0.005698689725250006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6986897106980905e-05, + "grad_norm": 3.285214900970459, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.865693211555481, + "num_tokens": 94323050.0, + "step": 2470 + }, + { + "epoch": 0.3143365983971505, + "ewc_loss": 0.0056515904143452644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.651590254274197e-05, + "grad_norm": 3.2673375606536865, + "learning_rate": 1e-06, + "loss": 0.4544, + 
"mean_token_accuracy": 0.8504902124404907, + "num_tokens": 94364702.0, + "step": 2471 + }, + { + "epoch": 0.314463808675741, + "ewc_loss": 0.005663969088345766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.663969204761088e-05, + "grad_norm": 3.263765811920166, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8622589111328125, + "num_tokens": 94404735.0, + "step": 2472 + }, + { + "epoch": 0.3145910189543315, + "ewc_loss": 0.005659885238856077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.659885209752247e-05, + "grad_norm": 3.264708995819092, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8423311114311218, + "num_tokens": 94444683.0, + "step": 2473 + }, + { + "epoch": 0.31471822923292203, + "ewc_loss": 0.005654502660036087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.654502456309274e-05, + "grad_norm": 3.285979986190796, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8581833839416504, + "num_tokens": 94484392.0, + "step": 2474 + }, + { + "epoch": 0.3148454395115125, + "ewc_loss": 0.005670533049851656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.670532846124843e-05, + "grad_norm": 3.2735817432403564, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8624567985534668, + "num_tokens": 94524860.0, + "step": 2475 + }, + { + "epoch": 0.31497264979010303, + "ewc_loss": 0.005650075618177652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.650075399898924e-05, + "grad_norm": 3.2853939533233643, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8527543544769287, + "num_tokens": 94565076.0, + "step": 2476 + }, + { + "epoch": 0.31509986006869356, + "ewc_loss": 0.005653908010572195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.653908010572195e-05, + "grad_norm": 3.2862281799316406, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8331917524337769, + "num_tokens": 94606191.0, + "step": 2477 + }, + { + "epoch": 
0.31522707034728403, + "ewc_loss": 0.005665334407240152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6653345382073894e-05, + "grad_norm": 3.2801244258880615, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8502177596092224, + "num_tokens": 94648021.0, + "step": 2478 + }, + { + "epoch": 0.31535428062587456, + "ewc_loss": 0.005658237263560295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.658237205352634e-05, + "grad_norm": 3.2810614109039307, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.866020917892456, + "num_tokens": 94686305.0, + "step": 2479 + }, + { + "epoch": 0.3154814909044651, + "ewc_loss": 0.005649408791214228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.649408922181465e-05, + "grad_norm": 3.256709098815918, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8535134792327881, + "num_tokens": 94728606.0, + "step": 2480 + }, + { + "epoch": 0.3156087011830556, + "ewc_loss": 0.0056563373655080795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.656337452819571e-05, + "grad_norm": 3.249898672103882, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.858223021030426, + "num_tokens": 94770162.0, + "step": 2481 + }, + { + "epoch": 0.3157359114616461, + "ewc_loss": 0.005652319639921188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.652319669025019e-05, + "grad_norm": 3.256833791732788, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8617446422576904, + "num_tokens": 94807735.0, + "step": 2482 + }, + { + "epoch": 0.3158631217402366, + "ewc_loss": 0.005666874814778566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.666874858434312e-05, + "grad_norm": 3.339184045791626, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8498272895812988, + "num_tokens": 94840366.0, + "step": 2483 + }, + { + "epoch": 0.31599033201882715, + "ewc_loss": 0.005712163168936968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.7121629652101547e-05, + "grad_norm": 3.3420662879943848, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8532719612121582, + "num_tokens": 94874293.0, + "step": 2484 + }, + { + "epoch": 0.3161175422974176, + "ewc_loss": 0.005680537782609463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.680537651642226e-05, + "grad_norm": 3.3189778327941895, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8550409078598022, + "num_tokens": 94909347.0, + "step": 2485 + }, + { + "epoch": 0.31624475257600815, + "ewc_loss": 0.005681041162461042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.681041147909127e-05, + "grad_norm": 3.2277801036834717, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8653448820114136, + "num_tokens": 94955120.0, + "step": 2486 + }, + { + "epoch": 0.3163719628545987, + "ewc_loss": 0.005646481644362211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.646481440635398e-05, + "grad_norm": 3.267932176589966, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8704063892364502, + "num_tokens": 94992319.0, + "step": 2487 + }, + { + "epoch": 0.31649917313318915, + "ewc_loss": 0.005694584455341101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6945842516142875e-05, + "grad_norm": 3.338519811630249, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8682916164398193, + "num_tokens": 95023464.0, + "step": 2488 + }, + { + "epoch": 0.3166263834117797, + "ewc_loss": 0.005711694713681936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.711694757337682e-05, + "grad_norm": 3.307433843612671, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8701653480529785, + "num_tokens": 95055963.0, + "step": 2489 + }, + { + "epoch": 0.3167535936903702, + "ewc_loss": 0.005687526427209377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.687526208930649e-05, + "grad_norm": 3.4415690898895264, + "learning_rate": 1e-06, + "loss": 0.4898, + 
"mean_token_accuracy": 0.8376876711845398, + "num_tokens": 95089628.0, + "step": 2490 + }, + { + "epoch": 0.3168808039689607, + "ewc_loss": 0.005779010243713856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.779010461992584e-05, + "grad_norm": 3.2306532859802246, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8679724931716919, + "num_tokens": 95128557.0, + "step": 2491 + }, + { + "epoch": 0.3170080142475512, + "ewc_loss": 0.005638869479298592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.63886969757732e-05, + "grad_norm": 3.2992453575134277, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8448566198348999, + "num_tokens": 95168324.0, + "step": 2492 + }, + { + "epoch": 0.31713522452614173, + "ewc_loss": 0.005744850262999535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.744850204791874e-05, + "grad_norm": 3.2998626232147217, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8368259072303772, + "num_tokens": 95208139.0, + "step": 2493 + }, + { + "epoch": 0.3172624348047322, + "ewc_loss": 0.005730315577238798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7303153880639e-05, + "grad_norm": 3.2776753902435303, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8501555919647217, + "num_tokens": 95247337.0, + "step": 2494 + }, + { + "epoch": 0.31738964508332274, + "ewc_loss": 0.005720534361898899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7205343182431534e-05, + "grad_norm": 3.2585411071777344, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8485519886016846, + "num_tokens": 95289208.0, + "step": 2495 + }, + { + "epoch": 0.31751685536191326, + "ewc_loss": 0.005730047821998596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.730047996621579e-05, + "grad_norm": 3.262697458267212, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8550102710723877, + "num_tokens": 95329717.0, + "step": 2496 + }, + { + "epoch": 
0.31764406564050374, + "ewc_loss": 0.005733967758715153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7339675549883395e-05, + "grad_norm": 3.268707036972046, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8359842300415039, + "num_tokens": 95371052.0, + "step": 2497 + }, + { + "epoch": 0.31777127591909426, + "ewc_loss": 0.005733154248446226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.733154102927074e-05, + "grad_norm": 3.2178337574005127, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8749237060546875, + "num_tokens": 95412486.0, + "step": 2498 + }, + { + "epoch": 0.3178984861976848, + "ewc_loss": 0.005704532377421856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.70453230466228e-05, + "grad_norm": 3.3101487159729004, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8707839846611023, + "num_tokens": 95446167.0, + "step": 2499 + }, + { + "epoch": 0.31802569647627527, + "ewc_loss": 0.005767395719885826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7673954870551825e-05, + "grad_norm": 3.2562339305877686, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8501344919204712, + "num_tokens": 95485763.0, + "step": 2500 + }, + { + "epoch": 0.3181529067548658, + "ewc_loss": 0.0057026357389986515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.702635826310143e-05, + "grad_norm": 3.311274528503418, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8446637392044067, + "num_tokens": 95526407.0, + "step": 2501 + }, + { + "epoch": 0.3182801170334563, + "ewc_loss": 0.005752556025981903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7525558077031747e-05, + "grad_norm": 3.354888677597046, + "learning_rate": 1e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.8235493898391724, + "num_tokens": 95567479.0, + "step": 2502 + }, + { + "epoch": 0.3184073273120468, + "ewc_loss": 0.005761281121522188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.7612811360741034e-05, + "grad_norm": 3.2488951683044434, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8623147010803223, + "num_tokens": 95606588.0, + "step": 2503 + }, + { + "epoch": 0.3185345375906373, + "ewc_loss": 0.005688440054655075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.68844006920699e-05, + "grad_norm": 3.2076950073242188, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8597608804702759, + "num_tokens": 95653293.0, + "step": 2504 + }, + { + "epoch": 0.31866174786922785, + "ewc_loss": 0.005709298886358738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7092987844953313e-05, + "grad_norm": 3.3366830348968506, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8472695350646973, + "num_tokens": 95692424.0, + "step": 2505 + }, + { + "epoch": 0.3187889581478183, + "ewc_loss": 0.005766873247921467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.766873073298484e-05, + "grad_norm": 3.230177402496338, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8656694889068604, + "num_tokens": 95733679.0, + "step": 2506 + }, + { + "epoch": 0.31891616842640885, + "ewc_loss": 0.005677029490470886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.677029548678547e-05, + "grad_norm": 3.3517048358917236, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8518530130386353, + "num_tokens": 95768211.0, + "step": 2507 + }, + { + "epoch": 0.3190433787049994, + "ewc_loss": 0.005769023206084967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.769023118773475e-05, + "grad_norm": 3.330535650253296, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.856207013130188, + "num_tokens": 95805974.0, + "step": 2508 + }, + { + "epoch": 0.31917058898358985, + "ewc_loss": 0.005730537697672844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7305376685690135e-05, + "grad_norm": 3.284350633621216, + "learning_rate": 1e-06, + "loss": 0.4365, + 
"mean_token_accuracy": 0.8543290495872498, + "num_tokens": 95845463.0, + "step": 2509 + }, + { + "epoch": 0.3192977992621804, + "ewc_loss": 0.005705514457076788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7055145589401945e-05, + "grad_norm": 3.388367176055908, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8504769802093506, + "num_tokens": 95888148.0, + "step": 2510 + }, + { + "epoch": 0.3194250095407709, + "ewc_loss": 0.005778767634183168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.778767808806151e-05, + "grad_norm": 3.2739439010620117, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8785544633865356, + "num_tokens": 95925432.0, + "step": 2511 + }, + { + "epoch": 0.3195522198193614, + "ewc_loss": 0.005678587127476931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.678586967405863e-05, + "grad_norm": 3.230060577392578, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8620904684066772, + "num_tokens": 95970695.0, + "step": 2512 + }, + { + "epoch": 0.3196794300979519, + "ewc_loss": 0.005698017776012421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6980177760124207e-05, + "grad_norm": 3.3144757747650146, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.856008768081665, + "num_tokens": 96006093.0, + "step": 2513 + }, + { + "epoch": 0.31980664037654244, + "ewc_loss": 0.005756211467087269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.756211612606421e-05, + "grad_norm": 3.323636531829834, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8565834760665894, + "num_tokens": 96045945.0, + "step": 2514 + }, + { + "epoch": 0.3199338506551329, + "ewc_loss": 0.005721083842217922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.721083653043024e-05, + "grad_norm": 3.3005549907684326, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.873315155506134, + "num_tokens": 96076748.0, + "step": 2515 + }, + { + "epoch": 
0.32006106093372344, + "ewc_loss": 0.005729655269533396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.729655094910413e-05, + "grad_norm": 3.2940192222595215, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8584604859352112, + "num_tokens": 96116964.0, + "step": 2516 + }, + { + "epoch": 0.32018827121231397, + "ewc_loss": 0.005746102891862392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7461027608951554e-05, + "grad_norm": 3.34314227104187, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8583245873451233, + "num_tokens": 96152073.0, + "step": 2517 + }, + { + "epoch": 0.32031548149090444, + "ewc_loss": 0.005774192977696657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.774193050456233e-05, + "grad_norm": 3.2395823001861572, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8537651300430298, + "num_tokens": 96191608.0, + "step": 2518 + }, + { + "epoch": 0.32044269176949497, + "ewc_loss": 0.005715464241802692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7154644309775904e-05, + "grad_norm": 3.275668144226074, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.873431921005249, + "num_tokens": 96228124.0, + "step": 2519 + }, + { + "epoch": 0.3205699020480855, + "ewc_loss": 0.005763489753007889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.763489753007889e-05, + "grad_norm": 3.304919719696045, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8675496578216553, + "num_tokens": 96265870.0, + "step": 2520 + }, + { + "epoch": 0.32069711232667597, + "ewc_loss": 0.005761230364441872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.761230568168685e-05, + "grad_norm": 3.2003750801086426, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8663870096206665, + "num_tokens": 96309623.0, + "step": 2521 + }, + { + "epoch": 0.3208243226052665, + "ewc_loss": 0.005710938945412636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.710939149139449e-05, + "grad_norm": 3.354130506515503, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8509091138839722, + "num_tokens": 96344036.0, + "step": 2522 + }, + { + "epoch": 0.320951532883857, + "ewc_loss": 0.005825675092637539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.82567517994903e-05, + "grad_norm": 3.3349602222442627, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8674999475479126, + "num_tokens": 96377627.0, + "step": 2523 + }, + { + "epoch": 0.3210787431624475, + "ewc_loss": 0.0057608336210250854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.760833664680831e-05, + "grad_norm": 3.2775254249572754, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8580435514450073, + "num_tokens": 96413568.0, + "step": 2524 + }, + { + "epoch": 0.32120595344103803, + "ewc_loss": 0.005749713163822889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.749713091063313e-05, + "grad_norm": 3.2703611850738525, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8539780378341675, + "num_tokens": 96454682.0, + "step": 2525 + }, + { + "epoch": 0.32133316371962856, + "ewc_loss": 0.005765036214143038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7650362577987835e-05, + "grad_norm": 3.401704788208008, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8458249568939209, + "num_tokens": 96490456.0, + "step": 2526 + }, + { + "epoch": 0.32146037399821903, + "ewc_loss": 0.005831229966133833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8312300097895786e-05, + "grad_norm": 3.3359954357147217, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.849391520023346, + "num_tokens": 96526891.0, + "step": 2527 + }, + { + "epoch": 0.32158758427680956, + "ewc_loss": 0.005776157137006521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.776157195214182e-05, + "grad_norm": 3.212989091873169, + "learning_rate": 1e-06, + "loss": 0.4544, + 
"mean_token_accuracy": 0.8508001565933228, + "num_tokens": 96575226.0, + "step": 2528 + }, + { + "epoch": 0.3217147945554001, + "ewc_loss": 0.005735337734222412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.73533761780709e-05, + "grad_norm": 3.3635613918304443, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8472020626068115, + "num_tokens": 96606383.0, + "step": 2529 + }, + { + "epoch": 0.3218420048339906, + "ewc_loss": 0.005860853008925915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.860852979822084e-05, + "grad_norm": 3.3227810859680176, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8598724603652954, + "num_tokens": 96639826.0, + "step": 2530 + }, + { + "epoch": 0.3219692151125811, + "ewc_loss": 0.00579005666077137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.790056457044557e-05, + "grad_norm": 3.3155503273010254, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8601915836334229, + "num_tokens": 96679151.0, + "step": 2531 + }, + { + "epoch": 0.3220964253911716, + "ewc_loss": 0.005799576640129089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.799576683784835e-05, + "grad_norm": 3.2421789169311523, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8630481958389282, + "num_tokens": 96719365.0, + "step": 2532 + }, + { + "epoch": 0.32222363566976214, + "ewc_loss": 0.005782857537269592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7828576245810837e-05, + "grad_norm": 3.3131909370422363, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8539849519729614, + "num_tokens": 96758318.0, + "step": 2533 + }, + { + "epoch": 0.3223508459483526, + "ewc_loss": 0.00584490317851305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.844902989338152e-05, + "grad_norm": 3.329989433288574, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8529807925224304, + "num_tokens": 96791159.0, + "step": 2534 + }, + { + "epoch": 
0.32247805622694314, + "ewc_loss": 0.005828575696796179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8285757404519245e-05, + "grad_norm": 3.3074028491973877, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8600685000419617, + "num_tokens": 96825352.0, + "step": 2535 + }, + { + "epoch": 0.32260526650553367, + "ewc_loss": 0.0058163488283753395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.81634885747917e-05, + "grad_norm": 3.353933095932007, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8542073965072632, + "num_tokens": 96859700.0, + "step": 2536 + }, + { + "epoch": 0.32273247678412414, + "ewc_loss": 0.0058509232476353645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8509231166681275e-05, + "grad_norm": 3.286761522293091, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8551726341247559, + "num_tokens": 96897077.0, + "step": 2537 + }, + { + "epoch": 0.3228596870627147, + "ewc_loss": 0.005820385180413723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.820385194965638e-05, + "grad_norm": 3.2551448345184326, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8515108227729797, + "num_tokens": 96936936.0, + "step": 2538 + }, + { + "epoch": 0.3229868973413052, + "ewc_loss": 0.005824994295835495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8249945141142234e-05, + "grad_norm": 3.303544759750366, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8619012832641602, + "num_tokens": 96975702.0, + "step": 2539 + }, + { + "epoch": 0.3231141076198957, + "ewc_loss": 0.0058590201660990715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8590201660990715e-05, + "grad_norm": 3.3291568756103516, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8567806482315063, + "num_tokens": 97010585.0, + "step": 2540 + }, + { + "epoch": 0.3232413178984862, + "ewc_loss": 0.005857464391738176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.857464566361159e-05, + "grad_norm": 3.2267982959747314, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8800225257873535, + "num_tokens": 97047210.0, + "step": 2541 + }, + { + "epoch": 0.32336852817707673, + "ewc_loss": 0.005814952775835991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.814952965010889e-05, + "grad_norm": 3.3297626972198486, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.851813793182373, + "num_tokens": 97084835.0, + "step": 2542 + }, + { + "epoch": 0.3234957384556672, + "ewc_loss": 0.005881284363567829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.881284232600592e-05, + "grad_norm": 3.248957633972168, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8536840677261353, + "num_tokens": 97125829.0, + "step": 2543 + }, + { + "epoch": 0.32362294873425773, + "ewc_loss": 0.005816243588924408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.816243356093764e-05, + "grad_norm": 3.3279569149017334, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8504716157913208, + "num_tokens": 97162874.0, + "step": 2544 + }, + { + "epoch": 0.32375015901284826, + "ewc_loss": 0.005877225194126368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.877224975847639e-05, + "grad_norm": 3.2950828075408936, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8606035709381104, + "num_tokens": 97197559.0, + "step": 2545 + }, + { + "epoch": 0.32387736929143873, + "ewc_loss": 0.005835357122123241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.835357296746224e-05, + "grad_norm": 3.4597110748291016, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8446274995803833, + "num_tokens": 97227028.0, + "step": 2546 + }, + { + "epoch": 0.32400457957002926, + "ewc_loss": 0.005937762558460236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.937762398389168e-05, + "grad_norm": 3.2381367683410645, + "learning_rate": 1e-06, + "loss": 0.4446, + 
"mean_token_accuracy": 0.8519085645675659, + "num_tokens": 97270209.0, + "step": 2547 + }, + { + "epoch": 0.3241317898486198, + "ewc_loss": 0.005786041263490915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7860412198351696e-05, + "grad_norm": 3.3059492111206055, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8599384427070618, + "num_tokens": 97309176.0, + "step": 2548 + }, + { + "epoch": 0.32425900012721026, + "ewc_loss": 0.005890904925763607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.890904867555946e-05, + "grad_norm": 3.4492387771606445, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8383128046989441, + "num_tokens": 97340464.0, + "step": 2549 + }, + { + "epoch": 0.3243862104058008, + "ewc_loss": 0.005927089136093855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9270892961649224e-05, + "grad_norm": 3.2487146854400635, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8604812622070312, + "num_tokens": 97378435.0, + "step": 2550 + }, + { + "epoch": 0.3245134206843913, + "ewc_loss": 0.005774845834821463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7748457038542256e-05, + "grad_norm": 3.2802398204803467, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8529751300811768, + "num_tokens": 97417886.0, + "step": 2551 + }, + { + "epoch": 0.3246406309629818, + "ewc_loss": 0.005857228301465511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.857228097738698e-05, + "grad_norm": 3.3046908378601074, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8593918085098267, + "num_tokens": 97450552.0, + "step": 2552 + }, + { + "epoch": 0.3247678412415723, + "ewc_loss": 0.005863877013325691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.863876867806539e-05, + "grad_norm": 3.2817587852478027, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8621358275413513, + "num_tokens": 97488586.0, + "step": 2553 + }, + { + "epoch": 
0.32489505152016285, + "ewc_loss": 0.005836639553308487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.836639684275724e-05, + "grad_norm": 3.2779996395111084, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8513315916061401, + "num_tokens": 97528893.0, + "step": 2554 + }, + { + "epoch": 0.3250222617987533, + "ewc_loss": 0.005850478541105986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.850478555657901e-05, + "grad_norm": 3.3086113929748535, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8325179815292358, + "num_tokens": 97568654.0, + "step": 2555 + }, + { + "epoch": 0.32514947207734385, + "ewc_loss": 0.005872142501175404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.872142355656251e-05, + "grad_norm": 3.305631399154663, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8595198392868042, + "num_tokens": 97607068.0, + "step": 2556 + }, + { + "epoch": 0.3252766823559344, + "ewc_loss": 0.005845196545124054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8451965742278844e-05, + "grad_norm": 3.212362766265869, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8765535950660706, + "num_tokens": 97647505.0, + "step": 2557 + }, + { + "epoch": 0.32540389263452485, + "ewc_loss": 0.005803274922072887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8032750530401245e-05, + "grad_norm": 3.2695963382720947, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8499140739440918, + "num_tokens": 97690505.0, + "step": 2558 + }, + { + "epoch": 0.3255311029131154, + "ewc_loss": 0.005848082713782787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.848082946613431e-05, + "grad_norm": 3.372751235961914, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8443496227264404, + "num_tokens": 97723955.0, + "step": 2559 + }, + { + "epoch": 0.3256583131917059, + "ewc_loss": 0.005879065487533808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.879065429326147e-05, + "grad_norm": 3.3127198219299316, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8501082062721252, + "num_tokens": 97763822.0, + "step": 2560 + }, + { + "epoch": 0.3257855234702964, + "ewc_loss": 0.005822163075208664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.822163075208664e-05, + "grad_norm": 3.2507762908935547, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8514392375946045, + "num_tokens": 97803827.0, + "step": 2561 + }, + { + "epoch": 0.3259127337488869, + "ewc_loss": 0.005813944153487682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8139441534876823e-05, + "grad_norm": 3.2896296977996826, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8550654649734497, + "num_tokens": 97841056.0, + "step": 2562 + }, + { + "epoch": 0.32603994402747744, + "ewc_loss": 0.005843768827617168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8437690313439816e-05, + "grad_norm": 3.24343204498291, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8725062012672424, + "num_tokens": 97881714.0, + "step": 2563 + }, + { + "epoch": 0.3261671543060679, + "ewc_loss": 0.005798292346298695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.798292477265932e-05, + "grad_norm": 3.2791213989257812, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8535622358322144, + "num_tokens": 97925797.0, + "step": 2564 + }, + { + "epoch": 0.32629436458465844, + "ewc_loss": 0.0058242860250175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.824286199640483e-05, + "grad_norm": 3.253295660018921, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8614087104797363, + "num_tokens": 97963638.0, + "step": 2565 + }, + { + "epoch": 0.32642157486324896, + "ewc_loss": 0.005800163373351097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.800163489766419e-05, + "grad_norm": 3.308645725250244, + "learning_rate": 1e-06, + "loss": 0.4499, + 
"mean_token_accuracy": 0.8506516814231873, + "num_tokens": 98002043.0, + "step": 2566 + }, + { + "epoch": 0.32654878514183944, + "ewc_loss": 0.005819808226078749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8198082115268335e-05, + "grad_norm": 3.2707862854003906, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8589065670967102, + "num_tokens": 98041031.0, + "step": 2567 + }, + { + "epoch": 0.32667599542042997, + "ewc_loss": 0.005784354638308287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.784354652860202e-05, + "grad_norm": 3.311823606491089, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8634606003761292, + "num_tokens": 98075510.0, + "step": 2568 + }, + { + "epoch": 0.3268032056990205, + "ewc_loss": 0.005808465648442507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.808465721202083e-05, + "grad_norm": 3.2595508098602295, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8624323606491089, + "num_tokens": 98114162.0, + "step": 2569 + }, + { + "epoch": 0.32693041597761097, + "ewc_loss": 0.005767973139882088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7679731980897486e-05, + "grad_norm": 3.388526439666748, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8530635833740234, + "num_tokens": 98148722.0, + "step": 2570 + }, + { + "epoch": 0.3270576262562015, + "ewc_loss": 0.005857118405401707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.857118594576605e-05, + "grad_norm": 3.326742649078369, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8358453512191772, + "num_tokens": 98188255.0, + "step": 2571 + }, + { + "epoch": 0.327184836534792, + "ewc_loss": 0.005777894519269466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7778943300945684e-05, + "grad_norm": 3.2823266983032227, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8667306900024414, + "num_tokens": 98228597.0, + "step": 2572 + }, + { + "epoch": 
0.3273120468133825, + "ewc_loss": 0.005770720075815916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.770719872089103e-05, + "grad_norm": 3.2628214359283447, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.846874475479126, + "num_tokens": 98273405.0, + "step": 2573 + }, + { + "epoch": 0.327439257091973, + "ewc_loss": 0.005762365646660328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7623656175564975e-05, + "grad_norm": 3.300855875015259, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8421313762664795, + "num_tokens": 98315091.0, + "step": 2574 + }, + { + "epoch": 0.32756646737056355, + "ewc_loss": 0.005790126509964466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.790126306237653e-05, + "grad_norm": 3.3168299198150635, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8569201231002808, + "num_tokens": 98356967.0, + "step": 2575 + }, + { + "epoch": 0.327693677649154, + "ewc_loss": 0.005788763985037804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.788763883174397e-05, + "grad_norm": 3.358703374862671, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8468436598777771, + "num_tokens": 98394455.0, + "step": 2576 + }, + { + "epoch": 0.32782088792774455, + "ewc_loss": 0.005802699364721775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8026995247928426e-05, + "grad_norm": 3.3178470134735107, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8648260235786438, + "num_tokens": 98430455.0, + "step": 2577 + }, + { + "epoch": 0.3279480982063351, + "ewc_loss": 0.005769494455307722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7694946008268744e-05, + "grad_norm": 3.288374185562134, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.841943621635437, + "num_tokens": 98472218.0, + "step": 2578 + }, + { + "epoch": 0.3280753084849256, + "ewc_loss": 0.005759299732744694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.759299892815761e-05, + "grad_norm": 3.3395659923553467, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8637100458145142, + "num_tokens": 98509540.0, + "step": 2579 + }, + { + "epoch": 0.3282025187635161, + "ewc_loss": 0.005791972391307354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.791972216684371e-05, + "grad_norm": 3.3249080181121826, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8548296689987183, + "num_tokens": 98544761.0, + "step": 2580 + }, + { + "epoch": 0.3283297290421066, + "ewc_loss": 0.005781596526503563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.781596701126546e-05, + "grad_norm": 3.2990567684173584, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8685294985771179, + "num_tokens": 98580522.0, + "step": 2581 + }, + { + "epoch": 0.32845693932069714, + "ewc_loss": 0.005778151098638773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.778151171398349e-05, + "grad_norm": 3.3484578132629395, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8653050661087036, + "num_tokens": 98612988.0, + "step": 2582 + }, + { + "epoch": 0.3285841495992876, + "ewc_loss": 0.00581606850028038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.816068733111024e-05, + "grad_norm": 3.2602744102478027, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8449852466583252, + "num_tokens": 98654213.0, + "step": 2583 + }, + { + "epoch": 0.32871135987787814, + "ewc_loss": 0.005756373517215252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.756373502663337e-05, + "grad_norm": 3.246349811553955, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8591710329055786, + "num_tokens": 98697168.0, + "step": 2584 + }, + { + "epoch": 0.32883857015646867, + "ewc_loss": 0.00578915374353528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.789153874502517e-05, + "grad_norm": 3.3064188957214355, + "learning_rate": 1e-06, + "loss": 0.4218, + 
"mean_token_accuracy": 0.8614781498908997, + "num_tokens": 98735664.0, + "step": 2585 + }, + { + "epoch": 0.32896578043505914, + "ewc_loss": 0.005814939271658659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.814939140691422e-05, + "grad_norm": 3.264127254486084, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8525838851928711, + "num_tokens": 98777257.0, + "step": 2586 + }, + { + "epoch": 0.32909299071364967, + "ewc_loss": 0.005773745942860842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.773745942860842e-05, + "grad_norm": 3.3640153408050537, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8608670234680176, + "num_tokens": 98812443.0, + "step": 2587 + }, + { + "epoch": 0.3292202009922402, + "ewc_loss": 0.005832819268107414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.832819078932516e-05, + "grad_norm": 3.3206982612609863, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8633878231048584, + "num_tokens": 98848231.0, + "step": 2588 + }, + { + "epoch": 0.32934741127083067, + "ewc_loss": 0.005781697109341621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7816971093416214e-05, + "grad_norm": 3.2878339290618896, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8597184419631958, + "num_tokens": 98886487.0, + "step": 2589 + }, + { + "epoch": 0.3294746215494212, + "ewc_loss": 0.005778464954346418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.778465128969401e-05, + "grad_norm": 3.254641532897949, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.845393180847168, + "num_tokens": 98924750.0, + "step": 2590 + }, + { + "epoch": 0.3296018318280117, + "ewc_loss": 0.005770277231931686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.77027713006828e-05, + "grad_norm": 3.348907470703125, + "learning_rate": 1e-06, + "loss": 0.5151, + "mean_token_accuracy": 0.835172712802887, + "num_tokens": 98964151.0, + "step": 2591 + }, + { + "epoch": 
0.3297290421066022, + "ewc_loss": 0.005841537844389677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.841537858941592e-05, + "grad_norm": 3.268549680709839, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8679060339927673, + "num_tokens": 99000277.0, + "step": 2592 + }, + { + "epoch": 0.3298562523851927, + "ewc_loss": 0.005775409750640392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.775409590569325e-05, + "grad_norm": 3.243166446685791, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8477295637130737, + "num_tokens": 99042335.0, + "step": 2593 + }, + { + "epoch": 0.32998346266378326, + "ewc_loss": 0.005779522005468607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.779521961812861e-05, + "grad_norm": 3.2508723735809326, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8556022047996521, + "num_tokens": 99082964.0, + "step": 2594 + }, + { + "epoch": 0.33011067294237373, + "ewc_loss": 0.005811566021293402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8115660067414865e-05, + "grad_norm": 3.2431511878967285, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8791803121566772, + "num_tokens": 99123440.0, + "step": 2595 + }, + { + "epoch": 0.33023788322096426, + "ewc_loss": 0.005780581384897232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7805813412414864e-05, + "grad_norm": 3.2858481407165527, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8543775677680969, + "num_tokens": 99167047.0, + "step": 2596 + }, + { + "epoch": 0.3303650934995548, + "ewc_loss": 0.00582075584679842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.820755905006081e-05, + "grad_norm": 3.3216779232025146, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8538232445716858, + "num_tokens": 99202594.0, + "step": 2597 + }, + { + "epoch": 0.33049230377814526, + "ewc_loss": 0.005817423574626446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.817423516418785e-05, + "grad_norm": 3.2346904277801514, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8621893525123596, + "num_tokens": 99244732.0, + "step": 2598 + }, + { + "epoch": 0.3306195140567358, + "ewc_loss": 0.005759022664278746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.759022678830661e-05, + "grad_norm": 3.2470831871032715, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.866274893283844, + "num_tokens": 99287161.0, + "step": 2599 + }, + { + "epoch": 0.3307467243353263, + "ewc_loss": 0.005791777279227972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.791777221020311e-05, + "grad_norm": 3.341287851333618, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8491311073303223, + "num_tokens": 99322650.0, + "step": 2600 + }, + { + "epoch": 0.3308739346139168, + "ewc_loss": 0.005828247871249914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8282479585614055e-05, + "grad_norm": 3.3574423789978027, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8569798469543457, + "num_tokens": 99358529.0, + "step": 2601 + }, + { + "epoch": 0.3310011448925073, + "ewc_loss": 0.005807329900562763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.807329944218509e-05, + "grad_norm": 3.306185245513916, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8657581806182861, + "num_tokens": 99394010.0, + "step": 2602 + }, + { + "epoch": 0.33112835517109784, + "ewc_loss": 0.005780271720141172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.780271749245003e-05, + "grad_norm": 3.2464826107025146, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8599826693534851, + "num_tokens": 99434764.0, + "step": 2603 + }, + { + "epoch": 0.3312555654496883, + "ewc_loss": 0.005777999758720398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.777999831479974e-05, + "grad_norm": 3.2391793727874756, + "learning_rate": 1e-06, + "loss": 0.4186, + 
"mean_token_accuracy": 0.8589370846748352, + "num_tokens": 99481322.0, + "step": 2604 + }, + { + "epoch": 0.33138277572827884, + "ewc_loss": 0.005779915954917669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.779915954917669e-05, + "grad_norm": 3.3470816612243652, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8471053242683411, + "num_tokens": 99517719.0, + "step": 2605 + }, + { + "epoch": 0.3315099860068694, + "ewc_loss": 0.005842216778546572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8422167057869956e-05, + "grad_norm": 3.3241472244262695, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8598078489303589, + "num_tokens": 99553662.0, + "step": 2606 + }, + { + "epoch": 0.33163719628545985, + "ewc_loss": 0.005791936535388231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.791936564492062e-05, + "grad_norm": 3.3636603355407715, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8570077419281006, + "num_tokens": 99589195.0, + "step": 2607 + }, + { + "epoch": 0.3317644065640504, + "ewc_loss": 0.00582831259816885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.828312714584172e-05, + "grad_norm": 3.331942319869995, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8583046793937683, + "num_tokens": 99626092.0, + "step": 2608 + }, + { + "epoch": 0.3318916168426409, + "ewc_loss": 0.005804970394819975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.804970351164229e-05, + "grad_norm": 3.3329479694366455, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8331105709075928, + "num_tokens": 99667087.0, + "step": 2609 + }, + { + "epoch": 0.3320188271212314, + "ewc_loss": 0.005811555311083794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.811555092805065e-05, + "grad_norm": 3.288872241973877, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8514718413352966, + "num_tokens": 99705929.0, + "step": 2610 + }, + { + "epoch": 
0.3321460373998219, + "ewc_loss": 0.005785806570202112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.785806570202112e-05, + "grad_norm": 3.3140764236450195, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8538570404052734, + "num_tokens": 99742044.0, + "step": 2611 + }, + { + "epoch": 0.33227324767841243, + "ewc_loss": 0.0058102733455598354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.810273432871327e-05, + "grad_norm": 3.38688325881958, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8606218099594116, + "num_tokens": 99772621.0, + "step": 2612 + }, + { + "epoch": 0.3324004579570029, + "ewc_loss": 0.005843530874699354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.843530743732117e-05, + "grad_norm": 3.287108898162842, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8628382086753845, + "num_tokens": 99814405.0, + "step": 2613 + }, + { + "epoch": 0.33252766823559343, + "ewc_loss": 0.005761734209954739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.761734064435586e-05, + "grad_norm": 3.2409369945526123, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8674355745315552, + "num_tokens": 99852197.0, + "step": 2614 + }, + { + "epoch": 0.33265487851418396, + "ewc_loss": 0.005790241062641144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.790240902570076e-05, + "grad_norm": 3.3088490962982178, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.851841926574707, + "num_tokens": 99894254.0, + "step": 2615 + }, + { + "epoch": 0.33278208879277443, + "ewc_loss": 0.005842737387865782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8427373005542904e-05, + "grad_norm": 3.2850944995880127, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.852051854133606, + "num_tokens": 99936230.0, + "step": 2616 + }, + { + "epoch": 0.33290929907136496, + "ewc_loss": 0.005800309125334024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.8003090089187026e-05, + "grad_norm": 3.316999673843384, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8578826785087585, + "num_tokens": 99974964.0, + "step": 2617 + }, + { + "epoch": 0.3330365093499555, + "ewc_loss": 0.005836018826812506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8360186812933534e-05, + "grad_norm": 3.2634479999542236, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8481454849243164, + "num_tokens": 100018750.0, + "step": 2618 + }, + { + "epoch": 0.33316371962854596, + "ewc_loss": 0.005804742686450481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.804742613690905e-05, + "grad_norm": 3.394254684448242, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8509470224380493, + "num_tokens": 100054064.0, + "step": 2619 + }, + { + "epoch": 0.3332909299071365, + "ewc_loss": 0.005886409431695938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.886409417144023e-05, + "grad_norm": 3.31036376953125, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8620684742927551, + "num_tokens": 100092549.0, + "step": 2620 + }, + { + "epoch": 0.333418140185727, + "ewc_loss": 0.005797690246254206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.797690027975477e-05, + "grad_norm": 3.43807053565979, + "learning_rate": 1e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8310754895210266, + "num_tokens": 100123973.0, + "step": 2621 + }, + { + "epoch": 0.3335453504643175, + "ewc_loss": 0.005901140160858631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9011403209296986e-05, + "grad_norm": 3.2993741035461426, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.854843020439148, + "num_tokens": 100162142.0, + "step": 2622 + }, + { + "epoch": 0.333672560742908, + "ewc_loss": 0.005798790138214827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.798790152766742e-05, + "grad_norm": 3.303879976272583, + "learning_rate": 1e-06, + "loss": 0.4513, + 
"mean_token_accuracy": 0.8519877195358276, + "num_tokens": 100199352.0, + "step": 2623 + }, + { + "epoch": 0.33379977102149855, + "ewc_loss": 0.005847297143191099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.847297143191099e-05, + "grad_norm": 3.348449230194092, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8420780301094055, + "num_tokens": 100237446.0, + "step": 2624 + }, + { + "epoch": 0.333926981300089, + "ewc_loss": 0.005864068865776062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8640689530875534e-05, + "grad_norm": 3.3041038513183594, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8623273372650146, + "num_tokens": 100272986.0, + "step": 2625 + }, + { + "epoch": 0.33405419157867955, + "ewc_loss": 0.00583230284973979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.83230284973979e-05, + "grad_norm": 3.354877233505249, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8470624089241028, + "num_tokens": 100304998.0, + "step": 2626 + }, + { + "epoch": 0.3341814018572701, + "ewc_loss": 0.005894079338759184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8940793678630143e-05, + "grad_norm": 3.334883451461792, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8522862195968628, + "num_tokens": 100338473.0, + "step": 2627 + }, + { + "epoch": 0.33430861213586055, + "ewc_loss": 0.005882334895431995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.88233488088008e-05, + "grad_norm": 3.3995018005371094, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8408024907112122, + "num_tokens": 100372644.0, + "step": 2628 + }, + { + "epoch": 0.3344358224144511, + "ewc_loss": 0.00591711699962616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.917117232456803e-05, + "grad_norm": 3.2788846492767334, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8370712995529175, + "num_tokens": 100413248.0, + "step": 2629 + }, + { + "epoch": 
0.3345630326930416, + "ewc_loss": 0.005846840795129538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.84684094064869e-05, + "grad_norm": 3.3936619758605957, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8511571884155273, + "num_tokens": 100448022.0, + "step": 2630 + }, + { + "epoch": 0.33469024297163213, + "ewc_loss": 0.0059517319314181805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9517318732105196e-05, + "grad_norm": 3.2186293601989746, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8445993661880493, + "num_tokens": 100493222.0, + "step": 2631 + }, + { + "epoch": 0.3348174532502226, + "ewc_loss": 0.005839567631483078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8395675296196714e-05, + "grad_norm": 3.2671873569488525, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.861304521560669, + "num_tokens": 100534736.0, + "step": 2632 + }, + { + "epoch": 0.33494466352881314, + "ewc_loss": 0.0059143854305148125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.914385474170558e-05, + "grad_norm": 3.303250551223755, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8630455732345581, + "num_tokens": 100576754.0, + "step": 2633 + }, + { + "epoch": 0.33507187380740366, + "ewc_loss": 0.005914590321481228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9145902923773974e-05, + "grad_norm": 3.3752012252807617, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8494083881378174, + "num_tokens": 100611407.0, + "step": 2634 + }, + { + "epoch": 0.33519908408599414, + "ewc_loss": 0.005937186069786549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.937186142546125e-05, + "grad_norm": 3.327003240585327, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8630766868591309, + "num_tokens": 100643531.0, + "step": 2635 + }, + { + "epoch": 0.33532629436458466, + "ewc_loss": 0.005903853103518486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.903853161726147e-05, + "grad_norm": 3.291038990020752, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8575429916381836, + "num_tokens": 100678612.0, + "step": 2636 + }, + { + "epoch": 0.3354535046431752, + "ewc_loss": 0.0059081814251840115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.908181265112944e-05, + "grad_norm": 3.2739126682281494, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8605687618255615, + "num_tokens": 100718873.0, + "step": 2637 + }, + { + "epoch": 0.33558071492176567, + "ewc_loss": 0.005912113469094038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9121135564055294e-05, + "grad_norm": 3.281264066696167, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8560734987258911, + "num_tokens": 100759883.0, + "step": 2638 + }, + { + "epoch": 0.3357079252003562, + "ewc_loss": 0.005922495387494564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.922495256527327e-05, + "grad_norm": 3.3325881958007812, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8368703126907349, + "num_tokens": 100800022.0, + "step": 2639 + }, + { + "epoch": 0.3358351354789467, + "ewc_loss": 0.005942618008702993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9426180087029934e-05, + "grad_norm": 3.334470510482788, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8417925834655762, + "num_tokens": 100837299.0, + "step": 2640 + }, + { + "epoch": 0.3359623457575372, + "ewc_loss": 0.005928356666117907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.928356767981313e-05, + "grad_norm": 3.3802437782287598, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8528151512145996, + "num_tokens": 100868433.0, + "step": 2641 + }, + { + "epoch": 0.3360895560361277, + "ewc_loss": 0.005958698224276304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9586982388282195e-05, + "grad_norm": 3.33233642578125, + "learning_rate": 1e-06, + "loss": 0.4758, + 
"mean_token_accuracy": 0.8405271172523499, + "num_tokens": 100907768.0, + "step": 2642 + }, + { + "epoch": 0.33621676631471825, + "ewc_loss": 0.005915163084864616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9151629102416337e-05, + "grad_norm": 3.3555843830108643, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8346747756004333, + "num_tokens": 100942723.0, + "step": 2643 + }, + { + "epoch": 0.3363439765933087, + "ewc_loss": 0.0059425076469779015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.942507414147258e-05, + "grad_norm": 3.2489705085754395, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8614187240600586, + "num_tokens": 100984997.0, + "step": 2644 + }, + { + "epoch": 0.33647118687189925, + "ewc_loss": 0.005880037322640419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.880037497263402e-05, + "grad_norm": 3.2661497592926025, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8701447248458862, + "num_tokens": 101026290.0, + "step": 2645 + }, + { + "epoch": 0.3365983971504898, + "ewc_loss": 0.005915351212024689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.91535099374596e-05, + "grad_norm": 3.3701553344726562, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8483721017837524, + "num_tokens": 101066766.0, + "step": 2646 + }, + { + "epoch": 0.33672560742908025, + "ewc_loss": 0.005949966609477997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9499667258933187e-05, + "grad_norm": 3.3578035831451416, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8598241806030273, + "num_tokens": 101100833.0, + "step": 2647 + }, + { + "epoch": 0.3368528177076708, + "ewc_loss": 0.005902392789721489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.90239287703298e-05, + "grad_norm": 3.2325687408447266, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8671764135360718, + "num_tokens": 101145552.0, + "step": 2648 + }, + { + "epoch": 
0.3369800279862613, + "ewc_loss": 0.005837671924382448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.837671778863296e-05, + "grad_norm": 3.283646583557129, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.858302116394043, + "num_tokens": 101186644.0, + "step": 2649 + }, + { + "epoch": 0.3371072382648518, + "ewc_loss": 0.0058873500674963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.887349834665656e-05, + "grad_norm": 3.3283965587615967, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8537750244140625, + "num_tokens": 101223356.0, + "step": 2650 + }, + { + "epoch": 0.3372344485434423, + "ewc_loss": 0.005888741929084063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.88874208915513e-05, + "grad_norm": 3.2815816402435303, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8690646886825562, + "num_tokens": 101260527.0, + "step": 2651 + }, + { + "epoch": 0.33736165882203284, + "ewc_loss": 0.005849436856806278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8494370023254305e-05, + "grad_norm": 3.2505369186401367, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.855039119720459, + "num_tokens": 101305403.0, + "step": 2652 + }, + { + "epoch": 0.3374888691006233, + "ewc_loss": 0.005840722005814314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.840722224093042e-05, + "grad_norm": 3.2868306636810303, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8412477374076843, + "num_tokens": 101345858.0, + "step": 2653 + }, + { + "epoch": 0.33761607937921384, + "ewc_loss": 0.005870855879038572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.870855966350064e-05, + "grad_norm": 3.379812240600586, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8331940174102783, + "num_tokens": 101380321.0, + "step": 2654 + }, + { + "epoch": 0.33774328965780437, + "ewc_loss": 0.005902765318751335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.9027654060628265e-05, + "grad_norm": 3.2599143981933594, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8644462823867798, + "num_tokens": 101419120.0, + "step": 2655 + }, + { + "epoch": 0.33787049993639484, + "ewc_loss": 0.0058207325637340546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8207326219417155e-05, + "grad_norm": 3.3660287857055664, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.858290433883667, + "num_tokens": 101453272.0, + "step": 2656 + }, + { + "epoch": 0.33799771021498537, + "ewc_loss": 0.005923802964389324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9238031099084765e-05, + "grad_norm": 3.4123663902282715, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8449104428291321, + "num_tokens": 101486879.0, + "step": 2657 + }, + { + "epoch": 0.3381249204935759, + "ewc_loss": 0.005904001649469137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.904001591261476e-05, + "grad_norm": 3.369837522506714, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8484355807304382, + "num_tokens": 101523691.0, + "step": 2658 + }, + { + "epoch": 0.33825213077216637, + "ewc_loss": 0.005869442597031593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8694426115835086e-05, + "grad_norm": 3.3470845222473145, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.856590747833252, + "num_tokens": 101555279.0, + "step": 2659 + }, + { + "epoch": 0.3383793410507569, + "ewc_loss": 0.005883427336812019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.88342736591585e-05, + "grad_norm": 3.272136926651001, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.872487485408783, + "num_tokens": 101591780.0, + "step": 2660 + }, + { + "epoch": 0.3385065513293474, + "ewc_loss": 0.005849563051015139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8495632401900366e-05, + "grad_norm": 3.213819742202759, + "learning_rate": 1e-06, + "loss": 0.371, + 
"mean_token_accuracy": 0.8717429637908936, + "num_tokens": 101631041.0, + "step": 2661 + }, + { + "epoch": 0.3386337616079379, + "ewc_loss": 0.005839659832417965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8396599342813715e-05, + "grad_norm": 3.3428406715393066, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8456590175628662, + "num_tokens": 101666315.0, + "step": 2662 + }, + { + "epoch": 0.33876097188652843, + "ewc_loss": 0.005930923856794834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.930923725827597e-05, + "grad_norm": 3.3385660648345947, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8681399822235107, + "num_tokens": 101704046.0, + "step": 2663 + }, + { + "epoch": 0.33888818216511896, + "ewc_loss": 0.005886401981115341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.886401777388528e-05, + "grad_norm": 3.274573564529419, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8670021295547485, + "num_tokens": 101743076.0, + "step": 2664 + }, + { + "epoch": 0.33901539244370943, + "ewc_loss": 0.0058754426427185535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.875442730030045e-05, + "grad_norm": 3.2128915786743164, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8690894246101379, + "num_tokens": 101784006.0, + "step": 2665 + }, + { + "epoch": 0.33914260272229996, + "ewc_loss": 0.005859612487256527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.859612429048866e-05, + "grad_norm": 3.3415894508361816, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8625912070274353, + "num_tokens": 101821845.0, + "step": 2666 + }, + { + "epoch": 0.3392698130008905, + "ewc_loss": 0.005945160519331694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.94516059209127e-05, + "grad_norm": 3.2359323501586914, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8618910312652588, + "num_tokens": 101861121.0, + "step": 2667 + }, + { + "epoch": 
0.33939702327948096, + "ewc_loss": 0.005856500007212162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8564997743815184e-05, + "grad_norm": 3.391094207763672, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8535438179969788, + "num_tokens": 101894330.0, + "step": 2668 + }, + { + "epoch": 0.3395242335580715, + "ewc_loss": 0.0059690698981285095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.969070116407238e-05, + "grad_norm": 3.3137710094451904, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8428412079811096, + "num_tokens": 101934835.0, + "step": 2669 + }, + { + "epoch": 0.339651443836662, + "ewc_loss": 0.0058807856403291225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8807858295040205e-05, + "grad_norm": 3.369637966156006, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8378773927688599, + "num_tokens": 101970943.0, + "step": 2670 + }, + { + "epoch": 0.3397786541152525, + "ewc_loss": 0.005937933921813965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.937934110988863e-05, + "grad_norm": 3.4790902137756348, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8449267148971558, + "num_tokens": 102001211.0, + "step": 2671 + }, + { + "epoch": 0.339905864393843, + "ewc_loss": 0.005974968895316124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9749687352450565e-05, + "grad_norm": 3.2877414226531982, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8632878065109253, + "num_tokens": 102039399.0, + "step": 2672 + }, + { + "epoch": 0.34003307467243354, + "ewc_loss": 0.005850025452673435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.850025263498537e-05, + "grad_norm": 3.3756508827209473, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8457669615745544, + "num_tokens": 102070635.0, + "step": 2673 + }, + { + "epoch": 0.340160284951024, + "ewc_loss": 0.005959742236882448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.959742338745855e-05, + "grad_norm": 3.285649538040161, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8610943555831909, + "num_tokens": 102104590.0, + "step": 2674 + }, + { + "epoch": 0.34028749522961454, + "ewc_loss": 0.005888385232537985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8883852034341544e-05, + "grad_norm": 3.2635304927825928, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8624786138534546, + "num_tokens": 102149267.0, + "step": 2675 + }, + { + "epoch": 0.3404147055082051, + "ewc_loss": 0.0058895135298371315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.889513704460114e-05, + "grad_norm": 3.32170033454895, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8402395248413086, + "num_tokens": 102188089.0, + "step": 2676 + }, + { + "epoch": 0.34054191578679555, + "ewc_loss": 0.005942399613559246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.942399729974568e-05, + "grad_norm": 3.290149450302124, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8636422753334045, + "num_tokens": 102224335.0, + "step": 2677 + }, + { + "epoch": 0.3406691260653861, + "ewc_loss": 0.00590467918664217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9046793467132375e-05, + "grad_norm": 3.273646354675293, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8599562048912048, + "num_tokens": 102263421.0, + "step": 2678 + }, + { + "epoch": 0.3407963363439766, + "ewc_loss": 0.0059058149345219135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.905814759898931e-05, + "grad_norm": 3.3104028701782227, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8395507335662842, + "num_tokens": 102301133.0, + "step": 2679 + }, + { + "epoch": 0.34092354662256713, + "ewc_loss": 0.005956328473985195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.95632845943328e-05, + "grad_norm": 3.3200201988220215, + "learning_rate": 1e-06, + "loss": 0.4159, + 
"mean_token_accuracy": 0.8598387241363525, + "num_tokens": 102338516.0, + "step": 2680 + }, + { + "epoch": 0.3410507569011576, + "ewc_loss": 0.0059272125363349915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.927212623646483e-05, + "grad_norm": 3.2867178916931152, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8468849062919617, + "num_tokens": 102376010.0, + "step": 2681 + }, + { + "epoch": 0.34117796717974813, + "ewc_loss": 0.005922546144574881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.922546188230626e-05, + "grad_norm": 3.243610382080078, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.878221333026886, + "num_tokens": 102417080.0, + "step": 2682 + }, + { + "epoch": 0.34130517745833866, + "ewc_loss": 0.005907618440687656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.907618469791487e-05, + "grad_norm": 3.2931971549987793, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8589397072792053, + "num_tokens": 102458042.0, + "step": 2683 + }, + { + "epoch": 0.34143238773692913, + "ewc_loss": 0.005941434297710657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.941434210399166e-05, + "grad_norm": 3.306405782699585, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8654389381408691, + "num_tokens": 102497152.0, + "step": 2684 + }, + { + "epoch": 0.34155959801551966, + "ewc_loss": 0.0059263198636472225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9263198636472225e-05, + "grad_norm": 3.3159844875335693, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8545486927032471, + "num_tokens": 102531486.0, + "step": 2685 + }, + { + "epoch": 0.3416868082941102, + "ewc_loss": 0.005931528750807047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9315287217032164e-05, + "grad_norm": 3.3157074451446533, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8554750084877014, + "num_tokens": 102571552.0, + "step": 2686 + }, + { + "epoch": 
0.34181401857270066, + "ewc_loss": 0.005921435076743364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9214351495029405e-05, + "grad_norm": 3.3319780826568604, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8616165518760681, + "num_tokens": 102607338.0, + "step": 2687 + }, + { + "epoch": 0.3419412288512912, + "ewc_loss": 0.005932895466685295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.932895510341041e-05, + "grad_norm": 3.2317137718200684, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.857524037361145, + "num_tokens": 102654445.0, + "step": 2688 + }, + { + "epoch": 0.3420684391298817, + "ewc_loss": 0.005875683389604092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.875683200429194e-05, + "grad_norm": 3.2655389308929443, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8653438687324524, + "num_tokens": 102695265.0, + "step": 2689 + }, + { + "epoch": 0.3421956494084722, + "ewc_loss": 0.005911160726100206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.911160769755952e-05, + "grad_norm": 3.31201434135437, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8632436990737915, + "num_tokens": 102728621.0, + "step": 2690 + }, + { + "epoch": 0.3423228596870627, + "ewc_loss": 0.005921653471887112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.921653428231366e-05, + "grad_norm": 3.3582603931427, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8510943055152893, + "num_tokens": 102766378.0, + "step": 2691 + }, + { + "epoch": 0.34245006996565325, + "ewc_loss": 0.005934704560786486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9347046772018075e-05, + "grad_norm": 3.294377326965332, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8546549081802368, + "num_tokens": 102806331.0, + "step": 2692 + }, + { + "epoch": 0.3425772802442437, + "ewc_loss": 0.005888311192393303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.88831098866649e-05, + "grad_norm": 3.3142940998077393, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8598001003265381, + "num_tokens": 102845314.0, + "step": 2693 + }, + { + "epoch": 0.34270449052283425, + "ewc_loss": 0.00591727439314127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.91727439314127e-05, + "grad_norm": 3.26670241355896, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8558307886123657, + "num_tokens": 102888614.0, + "step": 2694 + }, + { + "epoch": 0.3428317008014248, + "ewc_loss": 0.005880427546799183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.880427488591522e-05, + "grad_norm": 3.3576440811157227, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8512784838676453, + "num_tokens": 102925988.0, + "step": 2695 + }, + { + "epoch": 0.34295891108001525, + "ewc_loss": 0.005953318905085325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.953318759566173e-05, + "grad_norm": 3.3063066005706787, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8649737238883972, + "num_tokens": 102963150.0, + "step": 2696 + }, + { + "epoch": 0.3430861213586058, + "ewc_loss": 0.005877082236111164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8770820032805204e-05, + "grad_norm": 3.2845571041107178, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8656127452850342, + "num_tokens": 102999164.0, + "step": 2697 + }, + { + "epoch": 0.3432133316371963, + "ewc_loss": 0.005898538511246443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8985384384868667e-05, + "grad_norm": 3.327008008956909, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.864971399307251, + "num_tokens": 103034337.0, + "step": 2698 + }, + { + "epoch": 0.3433405419157868, + "ewc_loss": 0.005923325661569834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.923325807088986e-05, + "grad_norm": 3.278980016708374, + "learning_rate": 1e-06, + "loss": 0.4109, + 
"mean_token_accuracy": 0.8632093667984009, + "num_tokens": 103075062.0, + "step": 2699 + }, + { + "epoch": 0.3434677521943773, + "ewc_loss": 0.005878697149455547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.878697265870869e-05, + "grad_norm": 3.3526759147644043, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8528228998184204, + "num_tokens": 103113209.0, + "step": 2700 + }, + { + "epoch": 0.34359496247296784, + "ewc_loss": 0.005940582603216171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9405825595604256e-05, + "grad_norm": 3.266422748565674, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8609325885772705, + "num_tokens": 103153117.0, + "step": 2701 + }, + { + "epoch": 0.3437221727515583, + "ewc_loss": 0.0058641936630010605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8641937357606366e-05, + "grad_norm": 3.309318780899048, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8559015989303589, + "num_tokens": 103192234.0, + "step": 2702 + }, + { + "epoch": 0.34384938303014884, + "ewc_loss": 0.005917925387620926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.917925591347739e-05, + "grad_norm": 3.3414456844329834, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8550583124160767, + "num_tokens": 103224958.0, + "step": 2703 + }, + { + "epoch": 0.34397659330873936, + "ewc_loss": 0.005924104247242212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.924104334553704e-05, + "grad_norm": 3.35918927192688, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8584009408950806, + "num_tokens": 103259679.0, + "step": 2704 + }, + { + "epoch": 0.34410380358732984, + "ewc_loss": 0.00592898391187191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.928983955527656e-05, + "grad_norm": 3.310641050338745, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8493826985359192, + "num_tokens": 103299987.0, + "step": 2705 + }, + { + "epoch": 
0.34423101386592037, + "ewc_loss": 0.005890566390007734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.890566171729006e-05, + "grad_norm": 3.255798578262329, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8434176445007324, + "num_tokens": 103344457.0, + "step": 2706 + }, + { + "epoch": 0.3443582241445109, + "ewc_loss": 0.005892767105251551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8927671489072964e-05, + "grad_norm": 3.2677907943725586, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8622484803199768, + "num_tokens": 103385173.0, + "step": 2707 + }, + { + "epoch": 0.34448543442310137, + "ewc_loss": 0.005908762104809284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.908761886530556e-05, + "grad_norm": 3.311384439468384, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8419864177703857, + "num_tokens": 103425972.0, + "step": 2708 + }, + { + "epoch": 0.3446126447016919, + "ewc_loss": 0.0059129162691533566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9129164583282545e-05, + "grad_norm": 3.2923636436462402, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8531301021575928, + "num_tokens": 103466819.0, + "step": 2709 + }, + { + "epoch": 0.3447398549802824, + "ewc_loss": 0.0058922842144966125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.892284389119595e-05, + "grad_norm": 3.294196844100952, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8454938530921936, + "num_tokens": 103509660.0, + "step": 2710 + }, + { + "epoch": 0.3448670652588729, + "ewc_loss": 0.005910798441618681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.910798427066766e-05, + "grad_norm": 3.3335914611816406, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.851738452911377, + "num_tokens": 103544557.0, + "step": 2711 + }, + { + "epoch": 0.3449942755374634, + "ewc_loss": 0.00594274140894413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.942741336184554e-05, + "grad_norm": 3.267927408218384, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8673190474510193, + "num_tokens": 103585132.0, + "step": 2712 + }, + { + "epoch": 0.34512148581605395, + "ewc_loss": 0.005885929800570011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.885929931537248e-05, + "grad_norm": 3.250977039337158, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8657857775688171, + "num_tokens": 103626353.0, + "step": 2713 + }, + { + "epoch": 0.3452486960946444, + "ewc_loss": 0.00589572312310338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8957230066880584e-05, + "grad_norm": 3.3690154552459717, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8584904074668884, + "num_tokens": 103659035.0, + "step": 2714 + }, + { + "epoch": 0.34537590637323495, + "ewc_loss": 0.00595963466912508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.959634654573165e-05, + "grad_norm": 3.274890899658203, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8639484643936157, + "num_tokens": 103699497.0, + "step": 2715 + }, + { + "epoch": 0.3455031166518255, + "ewc_loss": 0.005860048811882734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.860048622707836e-05, + "grad_norm": 3.3962481021881104, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8460628390312195, + "num_tokens": 103733685.0, + "step": 2716 + }, + { + "epoch": 0.34563032693041595, + "ewc_loss": 0.0059699066914618015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.969906851532869e-05, + "grad_norm": 3.3184096813201904, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8403111100196838, + "num_tokens": 103769399.0, + "step": 2717 + }, + { + "epoch": 0.3457575372090065, + "ewc_loss": 0.005871044937521219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8710451412480325e-05, + "grad_norm": 3.2877233028411865, + "learning_rate": 1e-06, + "loss": 0.4065, + 
"mean_token_accuracy": 0.8651982545852661, + "num_tokens": 103809078.0, + "step": 2718 + }, + { + "epoch": 0.345884747487597, + "ewc_loss": 0.005890662781894207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.890662578167394e-05, + "grad_norm": 3.2413628101348877, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8738493323326111, + "num_tokens": 103846764.0, + "step": 2719 + }, + { + "epoch": 0.3460119577661875, + "ewc_loss": 0.005871435161679983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.871435132576153e-05, + "grad_norm": 3.317594289779663, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8657753467559814, + "num_tokens": 103883089.0, + "step": 2720 + }, + { + "epoch": 0.346139168044778, + "ewc_loss": 0.0059169684536755085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.916968439123593e-05, + "grad_norm": 3.319744110107422, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8754968643188477, + "num_tokens": 103915775.0, + "step": 2721 + }, + { + "epoch": 0.34626637832336854, + "ewc_loss": 0.005893394351005554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.893394336453639e-05, + "grad_norm": 3.3249361515045166, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8516837358474731, + "num_tokens": 103953703.0, + "step": 2722 + }, + { + "epoch": 0.346393588601959, + "ewc_loss": 0.0058960942551493645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8960944443242624e-05, + "grad_norm": 3.3050923347473145, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8455933928489685, + "num_tokens": 103990775.0, + "step": 2723 + }, + { + "epoch": 0.34652079888054954, + "ewc_loss": 0.005884167272597551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.884167330805212e-05, + "grad_norm": 3.2915987968444824, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8612442016601562, + "num_tokens": 104025927.0, + "step": 2724 + }, + { + "epoch": 
0.34664800915914007, + "ewc_loss": 0.005883183795958757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8831839851336554e-05, + "grad_norm": 3.3042421340942383, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.847472071647644, + "num_tokens": 104065860.0, + "step": 2725 + }, + { + "epoch": 0.34677521943773054, + "ewc_loss": 0.005894078873097897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.894078640267253e-05, + "grad_norm": 3.3929014205932617, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8588515520095825, + "num_tokens": 104102117.0, + "step": 2726 + }, + { + "epoch": 0.34690242971632107, + "ewc_loss": 0.005943578667938709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.943578798905946e-05, + "grad_norm": 3.3215386867523193, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8535715341567993, + "num_tokens": 104140896.0, + "step": 2727 + }, + { + "epoch": 0.3470296399949116, + "ewc_loss": 0.0058770813047885895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.877081275684759e-05, + "grad_norm": 3.3154592514038086, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8561025857925415, + "num_tokens": 104181355.0, + "step": 2728 + }, + { + "epoch": 0.3471568502735021, + "ewc_loss": 0.005892500746995211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.892500848858617e-05, + "grad_norm": 3.3231194019317627, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8580799102783203, + "num_tokens": 104218564.0, + "step": 2729 + }, + { + "epoch": 0.3472840605520926, + "ewc_loss": 0.005907371640205383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.907371451030485e-05, + "grad_norm": 3.2768828868865967, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8613805770874023, + "num_tokens": 104256553.0, + "step": 2730 + }, + { + "epoch": 0.3474112708306831, + "ewc_loss": 0.005870908964425325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.870909080840647e-05, + "grad_norm": 3.3253822326660156, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8702049255371094, + "num_tokens": 104291150.0, + "step": 2731 + }, + { + "epoch": 0.34753848110927366, + "ewc_loss": 0.005927611142396927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.92761134612374e-05, + "grad_norm": 3.3405892848968506, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8495990633964539, + "num_tokens": 104327977.0, + "step": 2732 + }, + { + "epoch": 0.34766569138786413, + "ewc_loss": 0.005906618200242519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9066183894174173e-05, + "grad_norm": 3.284727096557617, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8551837205886841, + "num_tokens": 104365167.0, + "step": 2733 + }, + { + "epoch": 0.34779290166645466, + "ewc_loss": 0.005887063220143318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.887063161935657e-05, + "grad_norm": 3.3389670848846436, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8634229302406311, + "num_tokens": 104400304.0, + "step": 2734 + }, + { + "epoch": 0.3479201119450452, + "ewc_loss": 0.005937998183071613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9379981394158676e-05, + "grad_norm": 3.3398120403289795, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8422645926475525, + "num_tokens": 104437974.0, + "step": 2735 + }, + { + "epoch": 0.34804732222363566, + "ewc_loss": 0.0059070708230137825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.907070590183139e-05, + "grad_norm": 3.2959983348846436, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8618384599685669, + "num_tokens": 104474474.0, + "step": 2736 + }, + { + "epoch": 0.3481745325022262, + "ewc_loss": 0.005897146183997393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8971461839973927e-05, + "grad_norm": 3.319554567337036, + "learning_rate": 1e-06, + "loss": 0.476, + 
"mean_token_accuracy": 0.8435600996017456, + "num_tokens": 104511004.0, + "step": 2737 + }, + { + "epoch": 0.3483017427808167, + "ewc_loss": 0.005939805414527655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.939805487287231e-05, + "grad_norm": 3.2886157035827637, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8515901565551758, + "num_tokens": 104551889.0, + "step": 2738 + }, + { + "epoch": 0.3484289530594072, + "ewc_loss": 0.005927093327045441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.92709329794161e-05, + "grad_norm": 3.315467119216919, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8422407507896423, + "num_tokens": 104591962.0, + "step": 2739 + }, + { + "epoch": 0.3485561633379977, + "ewc_loss": 0.005949714686721563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.949714613961987e-05, + "grad_norm": 3.2709429264068604, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.85520339012146, + "num_tokens": 104636201.0, + "step": 2740 + }, + { + "epoch": 0.34868337361658824, + "ewc_loss": 0.005923596676439047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9235968365101144e-05, + "grad_norm": 3.281388759613037, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8580379486083984, + "num_tokens": 104673448.0, + "step": 2741 + }, + { + "epoch": 0.3488105838951787, + "ewc_loss": 0.005951797589659691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.951797356829047e-05, + "grad_norm": 3.3337528705596924, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.864469587802887, + "num_tokens": 104708879.0, + "step": 2742 + }, + { + "epoch": 0.34893779417376924, + "ewc_loss": 0.005973161198198795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9731610235758126e-05, + "grad_norm": 3.276211738586426, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8671368360519409, + "num_tokens": 104747073.0, + "step": 2743 + }, + { + "epoch": 
0.3490650044523598, + "ewc_loss": 0.005928949918597937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.92895012232475e-05, + "grad_norm": 3.3166425228118896, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8571011424064636, + "num_tokens": 104786950.0, + "step": 2744 + }, + { + "epoch": 0.34919221473095025, + "ewc_loss": 0.005976279266178608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9762794990092516e-05, + "grad_norm": 3.291630983352661, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8619773387908936, + "num_tokens": 104828306.0, + "step": 2745 + }, + { + "epoch": 0.3493194250095408, + "ewc_loss": 0.005938620772212744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.938620597589761e-05, + "grad_norm": 3.2910940647125244, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8502270579338074, + "num_tokens": 104867030.0, + "step": 2746 + }, + { + "epoch": 0.3494466352881313, + "ewc_loss": 0.005948230624198914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9482306824065745e-05, + "grad_norm": 3.2631051540374756, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8629072904586792, + "num_tokens": 104909182.0, + "step": 2747 + }, + { + "epoch": 0.3495738455667218, + "ewc_loss": 0.005929320119321346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.929320104769431e-05, + "grad_norm": 3.2754948139190674, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8539916276931763, + "num_tokens": 104950648.0, + "step": 2748 + }, + { + "epoch": 0.3497010558453123, + "ewc_loss": 0.0059544178657233715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9544177929637954e-05, + "grad_norm": 3.270359754562378, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8634523153305054, + "num_tokens": 104994780.0, + "step": 2749 + }, + { + "epoch": 0.34982826612390283, + "ewc_loss": 0.0059333979152143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.9333979152143e-05, + "grad_norm": 3.3231453895568848, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8650461435317993, + "num_tokens": 105029469.0, + "step": 2750 + }, + { + "epoch": 0.3499554764024933, + "ewc_loss": 0.005958235822618008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.958235851721838e-05, + "grad_norm": 3.326124668121338, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8515526056289673, + "num_tokens": 105066797.0, + "step": 2751 + }, + { + "epoch": 0.35008268668108383, + "ewc_loss": 0.005958282854408026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9582827816484496e-05, + "grad_norm": 3.3515207767486572, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8713728189468384, + "num_tokens": 105100869.0, + "step": 2752 + }, + { + "epoch": 0.35020989695967436, + "ewc_loss": 0.005946511402726173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.946511373622343e-05, + "grad_norm": 3.3436636924743652, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.870436429977417, + "num_tokens": 105135409.0, + "step": 2753 + }, + { + "epoch": 0.35033710723826483, + "ewc_loss": 0.005941291805356741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9412919654278085e-05, + "grad_norm": 3.348208427429199, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8648010492324829, + "num_tokens": 105172839.0, + "step": 2754 + }, + { + "epoch": 0.35046431751685536, + "ewc_loss": 0.005929204635322094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.929204780841246e-05, + "grad_norm": 3.2495131492614746, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8644602298736572, + "num_tokens": 105216656.0, + "step": 2755 + }, + { + "epoch": 0.3505915277954459, + "ewc_loss": 0.005885210819542408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.885210703127086e-05, + "grad_norm": 3.3310763835906982, + "learning_rate": 1e-06, + "loss": 0.449, + 
"mean_token_accuracy": 0.8524386882781982, + "num_tokens": 105253740.0, + "step": 2756 + }, + { + "epoch": 0.35071873807403636, + "ewc_loss": 0.005958347115665674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.958347173873335e-05, + "grad_norm": 3.318370819091797, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8622905611991882, + "num_tokens": 105290073.0, + "step": 2757 + }, + { + "epoch": 0.3508459483526269, + "ewc_loss": 0.00591942248865962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9194226196268573e-05, + "grad_norm": 3.338726758956909, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8653466105461121, + "num_tokens": 105325933.0, + "step": 2758 + }, + { + "epoch": 0.3509731586312174, + "ewc_loss": 0.005933902692049742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9339028666727245e-05, + "grad_norm": 3.323781728744507, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8615113496780396, + "num_tokens": 105362962.0, + "step": 2759 + }, + { + "epoch": 0.3511003689098079, + "ewc_loss": 0.00592694990336895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.926949961576611e-05, + "grad_norm": 3.234651803970337, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8580905199050903, + "num_tokens": 105410074.0, + "step": 2760 + }, + { + "epoch": 0.3512275791883984, + "ewc_loss": 0.005893811117857695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.893811248824932e-05, + "grad_norm": 3.362372875213623, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8430293798446655, + "num_tokens": 105450264.0, + "step": 2761 + }, + { + "epoch": 0.35135478946698895, + "ewc_loss": 0.00598895875737071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9889589465456083e-05, + "grad_norm": 3.326633930206299, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8495416045188904, + "num_tokens": 105486566.0, + "step": 2762 + }, + { + "epoch": 
0.3514819997455794, + "ewc_loss": 0.005923316348344088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9233163483440876e-05, + "grad_norm": 3.304835796356201, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8611304759979248, + "num_tokens": 105527185.0, + "step": 2763 + }, + { + "epoch": 0.35160921002416995, + "ewc_loss": 0.00592711940407753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.927119491389021e-05, + "grad_norm": 3.410855531692505, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8573508262634277, + "num_tokens": 105557401.0, + "step": 2764 + }, + { + "epoch": 0.3517364203027605, + "ewc_loss": 0.0059845782816410065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9845780924661085e-05, + "grad_norm": 3.3834798336029053, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8659672737121582, + "num_tokens": 105586877.0, + "step": 2765 + }, + { + "epoch": 0.35186363058135095, + "ewc_loss": 0.0059533738531172276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.953374056844041e-05, + "grad_norm": 3.28389048576355, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8630588054656982, + "num_tokens": 105626988.0, + "step": 2766 + }, + { + "epoch": 0.3519908408599415, + "ewc_loss": 0.005921126808971167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.921126648900099e-05, + "grad_norm": 3.285942316055298, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.850121259689331, + "num_tokens": 105665796.0, + "step": 2767 + }, + { + "epoch": 0.352118051138532, + "ewc_loss": 0.005950380116701126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.950380000285804e-05, + "grad_norm": 3.2432916164398193, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8494256138801575, + "num_tokens": 105708892.0, + "step": 2768 + }, + { + "epoch": 0.3522452614171225, + "ewc_loss": 0.005940624978393316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.9406251239124686e-05, + "grad_norm": 3.275970220565796, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.852729082107544, + "num_tokens": 105748221.0, + "step": 2769 + }, + { + "epoch": 0.352372471695713, + "ewc_loss": 0.005957962945103645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.957963003311306e-05, + "grad_norm": 3.34829044342041, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8564753532409668, + "num_tokens": 105784568.0, + "step": 2770 + }, + { + "epoch": 0.35249968197430354, + "ewc_loss": 0.0059932745061814785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.993274680804461e-05, + "grad_norm": 3.305192232131958, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8459222316741943, + "num_tokens": 105827223.0, + "step": 2771 + }, + { + "epoch": 0.352626892252894, + "ewc_loss": 0.005941619630903006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9416197473183274e-05, + "grad_norm": 3.2410106658935547, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8770196437835693, + "num_tokens": 105871014.0, + "step": 2772 + }, + { + "epoch": 0.35275410253148454, + "ewc_loss": 0.005936102941632271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9361031162552536e-05, + "grad_norm": 3.343196392059326, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8546838760375977, + "num_tokens": 105908995.0, + "step": 2773 + }, + { + "epoch": 0.35288131281007507, + "ewc_loss": 0.006010683253407478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0106831369921565e-05, + "grad_norm": 3.3140745162963867, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8525370955467224, + "num_tokens": 105948243.0, + "step": 2774 + }, + { + "epoch": 0.35300852308866554, + "ewc_loss": 0.005955482833087444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.955482629360631e-05, + "grad_norm": 3.3221664428710938, + "learning_rate": 1e-06, + "loss": 0.4134, + 
"mean_token_accuracy": 0.86115562915802, + "num_tokens": 105983997.0, + "step": 2775 + }, + { + "epoch": 0.35313573336725607, + "ewc_loss": 0.005970942322164774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9709422203013673e-05, + "grad_norm": 3.357874870300293, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8452485203742981, + "num_tokens": 106020336.0, + "step": 2776 + }, + { + "epoch": 0.3532629436458466, + "ewc_loss": 0.005984910763800144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.984910603729077e-05, + "grad_norm": 3.3615310192108154, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8595249652862549, + "num_tokens": 106053300.0, + "step": 2777 + }, + { + "epoch": 0.35339015392443707, + "ewc_loss": 0.005973116960376501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.973117004032247e-05, + "grad_norm": 3.2392418384552, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8727422952651978, + "num_tokens": 106091790.0, + "step": 2778 + }, + { + "epoch": 0.3535173642030276, + "ewc_loss": 0.005905073136091232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.905072976020165e-05, + "grad_norm": 3.3531062602996826, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8631093502044678, + "num_tokens": 106123768.0, + "step": 2779 + }, + { + "epoch": 0.3536445744816181, + "ewc_loss": 0.006025112699717283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.025112816132605e-05, + "grad_norm": 3.3580501079559326, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8660902380943298, + "num_tokens": 106158219.0, + "step": 2780 + }, + { + "epoch": 0.35377178476020865, + "ewc_loss": 0.005979309789836407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.979309571557678e-05, + "grad_norm": 3.3117516040802, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8526442646980286, + "num_tokens": 106195795.0, + "step": 2781 + }, + { + "epoch": 
0.3538989950387991, + "ewc_loss": 0.005953728687018156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.953728759777732e-05, + "grad_norm": 3.3198766708374023, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8590786457061768, + "num_tokens": 106234973.0, + "step": 2782 + }, + { + "epoch": 0.35402620531738965, + "ewc_loss": 0.0059847962111234665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.984796371194534e-05, + "grad_norm": 3.4242122173309326, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8431069850921631, + "num_tokens": 106267297.0, + "step": 2783 + }, + { + "epoch": 0.3541534155959802, + "ewc_loss": 0.006023617461323738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.02361724304501e-05, + "grad_norm": 3.2646498680114746, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8653596043586731, + "num_tokens": 106309070.0, + "step": 2784 + }, + { + "epoch": 0.35428062587457065, + "ewc_loss": 0.0059211598709225655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9211597545072436e-05, + "grad_norm": 3.284757137298584, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8624882698059082, + "num_tokens": 106345784.0, + "step": 2785 + }, + { + "epoch": 0.3544078361531612, + "ewc_loss": 0.006006061099469662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0060610849177465e-05, + "grad_norm": 3.2820072174072266, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8661182522773743, + "num_tokens": 106383172.0, + "step": 2786 + }, + { + "epoch": 0.3545350464317517, + "ewc_loss": 0.005981349851936102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.981349750072695e-05, + "grad_norm": 3.3784573078155518, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8557401895523071, + "num_tokens": 106416916.0, + "step": 2787 + }, + { + "epoch": 0.3546622567103422, + "ewc_loss": 0.006037663668394089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.037663479219191e-05, + "grad_norm": 3.317289352416992, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8436688184738159, + "num_tokens": 106456969.0, + "step": 2788 + }, + { + "epoch": 0.3547894669889327, + "ewc_loss": 0.005980382207781076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.980382047710009e-05, + "grad_norm": 3.4243595600128174, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8565394878387451, + "num_tokens": 106490476.0, + "step": 2789 + }, + { + "epoch": 0.35491667726752324, + "ewc_loss": 0.0060646794736385345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0646794736385345e-05, + "grad_norm": 3.3226335048675537, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8587362170219421, + "num_tokens": 106525990.0, + "step": 2790 + }, + { + "epoch": 0.3550438875461137, + "ewc_loss": 0.00597049854695797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.970498386886902e-05, + "grad_norm": 3.388134241104126, + "learning_rate": 1e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8342082500457764, + "num_tokens": 106562953.0, + "step": 2791 + }, + { + "epoch": 0.35517109782470424, + "ewc_loss": 0.006050627678632736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.050627780496143e-05, + "grad_norm": 3.3449547290802, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8604288101196289, + "num_tokens": 106599142.0, + "step": 2792 + }, + { + "epoch": 0.35529830810329477, + "ewc_loss": 0.006020217668265104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0202175518497825e-05, + "grad_norm": 3.2764971256256104, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8698534965515137, + "num_tokens": 106636743.0, + "step": 2793 + }, + { + "epoch": 0.35542551838188524, + "ewc_loss": 0.0059868572279810905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9868572861887515e-05, + "grad_norm": 3.2592217922210693, + "learning_rate": 1e-06, + "loss": 0.413, + 
"mean_token_accuracy": 0.8624586462974548, + "num_tokens": 106678992.0, + "step": 2794 + }, + { + "epoch": 0.35555272866047577, + "ewc_loss": 0.006013678852468729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0136790125397965e-05, + "grad_norm": 3.2769086360931396, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8487426042556763, + "num_tokens": 106719884.0, + "step": 2795 + }, + { + "epoch": 0.3556799389390663, + "ewc_loss": 0.006020058412104845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0200585721759126e-05, + "grad_norm": 3.311086893081665, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8534080386161804, + "num_tokens": 106755224.0, + "step": 2796 + }, + { + "epoch": 0.35580714921765677, + "ewc_loss": 0.006041915155947208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.041915185051039e-05, + "grad_norm": 3.3139266967773438, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8708497881889343, + "num_tokens": 106789366.0, + "step": 2797 + }, + { + "epoch": 0.3559343594962473, + "ewc_loss": 0.006017099134624004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0170990764163435e-05, + "grad_norm": 3.3674066066741943, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8504422903060913, + "num_tokens": 106826382.0, + "step": 2798 + }, + { + "epoch": 0.3560615697748378, + "ewc_loss": 0.006054168101400137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.054167897673324e-05, + "grad_norm": 3.323873281478882, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8487817049026489, + "num_tokens": 106860027.0, + "step": 2799 + }, + { + "epoch": 0.3561887800534283, + "ewc_loss": 0.006010662764310837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.010662764310837e-05, + "grad_norm": 3.2822108268737793, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8661643862724304, + "num_tokens": 106898684.0, + "step": 2800 + }, + { + "epoch": 
0.35631599033201883, + "ewc_loss": 0.0060039907693862915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0039907111786306e-05, + "grad_norm": 3.3696017265319824, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8592848777770996, + "num_tokens": 106934496.0, + "step": 2801 + }, + { + "epoch": 0.35644320061060936, + "ewc_loss": 0.0060759722255170345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.075972123653628e-05, + "grad_norm": 3.271749496459961, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8713960647583008, + "num_tokens": 106974335.0, + "step": 2802 + }, + { + "epoch": 0.35657041088919983, + "ewc_loss": 0.005986517760902643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.98651786276605e-05, + "grad_norm": 3.3190431594848633, + "learning_rate": 1e-06, + "loss": 0.5277, + "mean_token_accuracy": 0.8332068920135498, + "num_tokens": 107013914.0, + "step": 2803 + }, + { + "epoch": 0.35669762116779036, + "ewc_loss": 0.00604131817817688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.041318192728795e-05, + "grad_norm": 3.2746033668518066, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8623329401016235, + "num_tokens": 107053747.0, + "step": 2804 + }, + { + "epoch": 0.3568248314463809, + "ewc_loss": 0.005997049622237682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9970494476146996e-05, + "grad_norm": 3.272972345352173, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8487657308578491, + "num_tokens": 107095245.0, + "step": 2805 + }, + { + "epoch": 0.35695204172497136, + "ewc_loss": 0.006020746659487486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0207465139683336e-05, + "grad_norm": 3.3310036659240723, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8581746816635132, + "num_tokens": 107133627.0, + "step": 2806 + }, + { + "epoch": 0.3570792520035619, + "ewc_loss": 0.006045397371053696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.045397458365187e-05, + "grad_norm": 3.3292853832244873, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8341118097305298, + "num_tokens": 107170128.0, + "step": 2807 + }, + { + "epoch": 0.3572064622821524, + "ewc_loss": 0.0060287294909358025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.028729330864735e-05, + "grad_norm": 3.3215649127960205, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8676999807357788, + "num_tokens": 107209137.0, + "step": 2808 + }, + { + "epoch": 0.3573336725607429, + "ewc_loss": 0.006018946412950754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0189464420545846e-05, + "grad_norm": 3.331027030944824, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8522185683250427, + "num_tokens": 107244866.0, + "step": 2809 + }, + { + "epoch": 0.3574608828393334, + "ewc_loss": 0.006029778625816107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0297785239527e-05, + "grad_norm": 3.2685608863830566, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8591166734695435, + "num_tokens": 107287399.0, + "step": 2810 + }, + { + "epoch": 0.35758809311792394, + "ewc_loss": 0.005991295911371708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.991295984131284e-05, + "grad_norm": 3.324472188949585, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8474067449569702, + "num_tokens": 107325627.0, + "step": 2811 + }, + { + "epoch": 0.3577153033965144, + "ewc_loss": 0.006025380454957485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.025380571372807e-05, + "grad_norm": 3.2955973148345947, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8736419677734375, + "num_tokens": 107359375.0, + "step": 2812 + }, + { + "epoch": 0.35784251367510495, + "ewc_loss": 0.006003403104841709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.003403177601285e-05, + "grad_norm": 3.272156000137329, + "learning_rate": 1e-06, + "loss": 0.4692, + 
"mean_token_accuracy": 0.8456772565841675, + "num_tokens": 107400286.0, + "step": 2813 + }, + { + "epoch": 0.3579697239536955, + "ewc_loss": 0.005993772763758898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.993772720103152e-05, + "grad_norm": 3.3929853439331055, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8479313850402832, + "num_tokens": 107431768.0, + "step": 2814 + }, + { + "epoch": 0.35809693423228595, + "ewc_loss": 0.006070279981940985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0702797782141715e-05, + "grad_norm": 3.281013011932373, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8679870367050171, + "num_tokens": 107470875.0, + "step": 2815 + }, + { + "epoch": 0.3582241445108765, + "ewc_loss": 0.005975177977234125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.975177919026464e-05, + "grad_norm": 3.3201303482055664, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8490207195281982, + "num_tokens": 107510988.0, + "step": 2816 + }, + { + "epoch": 0.358351354789467, + "ewc_loss": 0.006030007731169462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.030007716617547e-05, + "grad_norm": 3.255518674850464, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8654249310493469, + "num_tokens": 107552423.0, + "step": 2817 + }, + { + "epoch": 0.3584785650680575, + "ewc_loss": 0.005981934256851673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.981934373266995e-05, + "grad_norm": 3.3325417041778564, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.849999189376831, + "num_tokens": 107591693.0, + "step": 2818 + }, + { + "epoch": 0.358605775346648, + "ewc_loss": 0.00604363065212965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0436304920585826e-05, + "grad_norm": 3.3359932899475098, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8622414469718933, + "num_tokens": 107626017.0, + "step": 2819 + }, + { + "epoch": 
0.35873298562523853, + "ewc_loss": 0.006022462621331215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.022462548571639e-05, + "grad_norm": 3.2959976196289062, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8735244274139404, + "num_tokens": 107664927.0, + "step": 2820 + }, + { + "epoch": 0.358860195903829, + "ewc_loss": 0.0059900106862187386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9900106862187386e-05, + "grad_norm": 3.3865175247192383, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8507102727890015, + "num_tokens": 107702487.0, + "step": 2821 + }, + { + "epoch": 0.35898740618241953, + "ewc_loss": 0.006051951553672552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0519516409840435e-05, + "grad_norm": 3.2497637271881104, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8712934255599976, + "num_tokens": 107741263.0, + "step": 2822 + }, + { + "epoch": 0.35911461646101006, + "ewc_loss": 0.0059510283172130585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.951028288109228e-05, + "grad_norm": 3.293520450592041, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8609805703163147, + "num_tokens": 107781079.0, + "step": 2823 + }, + { + "epoch": 0.35924182673960053, + "ewc_loss": 0.006020860746502876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.020860746502876e-05, + "grad_norm": 3.2611501216888428, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8762393593788147, + "num_tokens": 107819309.0, + "step": 2824 + }, + { + "epoch": 0.35936903701819106, + "ewc_loss": 0.005981877911835909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.981877984595485e-05, + "grad_norm": 3.3947594165802, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8594624400138855, + "num_tokens": 107853251.0, + "step": 2825 + }, + { + "epoch": 0.3594962472967816, + "ewc_loss": 0.00604893546551466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.048935392755084e-05, + "grad_norm": 3.2337801456451416, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8774574398994446, + "num_tokens": 107893476.0, + "step": 2826 + }, + { + "epoch": 0.35962345757537206, + "ewc_loss": 0.005928223952651024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.928223981754854e-05, + "grad_norm": 3.2813310623168945, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8683228492736816, + "num_tokens": 107931204.0, + "step": 2827 + }, + { + "epoch": 0.3597506678539626, + "ewc_loss": 0.006006136536598206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.006136754876934e-05, + "grad_norm": 3.3910741806030273, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8409508466720581, + "num_tokens": 107966471.0, + "step": 2828 + }, + { + "epoch": 0.3598778781325531, + "ewc_loss": 0.006056737154722214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.056737038306892e-05, + "grad_norm": 3.283416271209717, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8603723049163818, + "num_tokens": 108005328.0, + "step": 2829 + }, + { + "epoch": 0.36000508841114365, + "ewc_loss": 0.005947154015302658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9471538406796753e-05, + "grad_norm": 3.285418748855591, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8472161293029785, + "num_tokens": 108046295.0, + "step": 2830 + }, + { + "epoch": 0.3601322986897341, + "ewc_loss": 0.005991588346660137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.991588477627374e-05, + "grad_norm": 3.462233066558838, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8602006435394287, + "num_tokens": 108078164.0, + "step": 2831 + }, + { + "epoch": 0.36025950896832465, + "ewc_loss": 0.006083971820771694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0839720390504226e-05, + "grad_norm": 3.3130853176116943, + "learning_rate": 1e-06, + "loss": 0.4409, + 
"mean_token_accuracy": 0.8532129526138306, + "num_tokens": 108117105.0, + "step": 2832 + }, + { + "epoch": 0.3603867192469152, + "ewc_loss": 0.005934962071478367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.934961882303469e-05, + "grad_norm": 3.308931350708008, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8551532030105591, + "num_tokens": 108155301.0, + "step": 2833 + }, + { + "epoch": 0.36051392952550565, + "ewc_loss": 0.006000010296702385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0000103985657915e-05, + "grad_norm": 3.2911391258239746, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8679962158203125, + "num_tokens": 108193091.0, + "step": 2834 + }, + { + "epoch": 0.3606411398040962, + "ewc_loss": 0.005987293086946011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.987293116049841e-05, + "grad_norm": 3.2740890979766846, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8568993210792542, + "num_tokens": 108233719.0, + "step": 2835 + }, + { + "epoch": 0.3607683500826867, + "ewc_loss": 0.0059751225635409355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9751226217485964e-05, + "grad_norm": 3.3699822425842285, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8642439842224121, + "num_tokens": 108265775.0, + "step": 2836 + }, + { + "epoch": 0.3608955603612772, + "ewc_loss": 0.006037314888089895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0373149608494714e-05, + "grad_norm": 3.2389397621154785, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8690235018730164, + "num_tokens": 108307953.0, + "step": 2837 + }, + { + "epoch": 0.3610227706398677, + "ewc_loss": 0.005941924173384905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.941924246144481e-05, + "grad_norm": 3.2952632904052734, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8534809350967407, + "num_tokens": 108351998.0, + "step": 2838 + }, + { + "epoch": 
0.36114998091845824, + "ewc_loss": 0.0060047185979783535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0047186707379296e-05, + "grad_norm": 3.291799783706665, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.845746636390686, + "num_tokens": 108395968.0, + "step": 2839 + }, + { + "epoch": 0.3612771911970487, + "ewc_loss": 0.005978894419968128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.978894478175789e-05, + "grad_norm": 3.361690044403076, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8387764096260071, + "num_tokens": 108433045.0, + "step": 2840 + }, + { + "epoch": 0.36140440147563924, + "ewc_loss": 0.006023035384714603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.023035530233756e-05, + "grad_norm": 3.3091986179351807, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8530633449554443, + "num_tokens": 108470964.0, + "step": 2841 + }, + { + "epoch": 0.36153161175422976, + "ewc_loss": 0.005972062703222036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9720627177739516e-05, + "grad_norm": 3.2626452445983887, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8592617511749268, + "num_tokens": 108512572.0, + "step": 2842 + }, + { + "epoch": 0.36165882203282024, + "ewc_loss": 0.005965669173747301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.965668970020488e-05, + "grad_norm": 3.320786952972412, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8432530164718628, + "num_tokens": 108548526.0, + "step": 2843 + }, + { + "epoch": 0.36178603231141077, + "ewc_loss": 0.006004784256219864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.004784154356457e-05, + "grad_norm": 3.352067708969116, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8544428944587708, + "num_tokens": 108588447.0, + "step": 2844 + }, + { + "epoch": 0.3619132425900013, + "ewc_loss": 0.00599738210439682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.9973823226755485e-05, + "grad_norm": 3.335819959640503, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8495280742645264, + "num_tokens": 108628426.0, + "step": 2845 + }, + { + "epoch": 0.36204045286859177, + "ewc_loss": 0.005971084348857403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9710844652727246e-05, + "grad_norm": 3.3493053913116455, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8531399965286255, + "num_tokens": 108661355.0, + "step": 2846 + }, + { + "epoch": 0.3621676631471823, + "ewc_loss": 0.005995526909828186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.995526953483932e-05, + "grad_norm": 3.28371000289917, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8755394220352173, + "num_tokens": 108698505.0, + "step": 2847 + }, + { + "epoch": 0.3622948734257728, + "ewc_loss": 0.005951968487352133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.951968705630861e-05, + "grad_norm": 3.271315813064575, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8632119297981262, + "num_tokens": 108737409.0, + "step": 2848 + }, + { + "epoch": 0.3624220837043633, + "ewc_loss": 0.005966066382825375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.966066237306222e-05, + "grad_norm": 3.3053319454193115, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.859580397605896, + "num_tokens": 108772969.0, + "step": 2849 + }, + { + "epoch": 0.3625492939829538, + "ewc_loss": 0.005983192939311266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.983193113934249e-05, + "grad_norm": 3.3666772842407227, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8638871908187866, + "num_tokens": 108805129.0, + "step": 2850 + }, + { + "epoch": 0.36267650426154435, + "ewc_loss": 0.006014850456267595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.01485044171568e-05, + "grad_norm": 3.3467326164245605, + "learning_rate": 1e-06, + "loss": 0.4667, + 
"mean_token_accuracy": 0.8435901403427124, + "num_tokens": 108841197.0, + "step": 2851 + }, + { + "epoch": 0.3628037145401348, + "ewc_loss": 0.005992372520267963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9923724620603025e-05, + "grad_norm": 3.297070026397705, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8711885213851929, + "num_tokens": 108876442.0, + "step": 2852 + }, + { + "epoch": 0.36293092481872535, + "ewc_loss": 0.005997838918119669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9978388890158385e-05, + "grad_norm": 3.4109690189361572, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8457093238830566, + "num_tokens": 108911432.0, + "step": 2853 + }, + { + "epoch": 0.3630581350973159, + "ewc_loss": 0.00606882106512785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.068820948712528e-05, + "grad_norm": 3.302267074584961, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8453161716461182, + "num_tokens": 108952977.0, + "step": 2854 + }, + { + "epoch": 0.36318534537590635, + "ewc_loss": 0.005984399933367968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.984399831504561e-05, + "grad_norm": 3.3453733921051025, + "learning_rate": 1e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.835961103439331, + "num_tokens": 108995602.0, + "step": 2855 + }, + { + "epoch": 0.3633125556544969, + "ewc_loss": 0.006055053789168596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0550537455128506e-05, + "grad_norm": 3.2490768432617188, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.859603226184845, + "num_tokens": 109040226.0, + "step": 2856 + }, + { + "epoch": 0.3634397659330874, + "ewc_loss": 0.005998292006552219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9982918173773214e-05, + "grad_norm": 3.275024652481079, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.859595000743866, + "num_tokens": 109082898.0, + "step": 2857 + }, + { + "epoch": 
0.3635669762116779, + "ewc_loss": 0.006042758002877235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.042758104740642e-05, + "grad_norm": 3.337932586669922, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8490514755249023, + "num_tokens": 109122460.0, + "step": 2858 + }, + { + "epoch": 0.3636941864902684, + "ewc_loss": 0.006062780506908894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.062780448701233e-05, + "grad_norm": 3.3208167552948, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8566391468048096, + "num_tokens": 109161680.0, + "step": 2859 + }, + { + "epoch": 0.36382139676885894, + "ewc_loss": 0.006040174979716539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.040175139787607e-05, + "grad_norm": 3.268400192260742, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8777801990509033, + "num_tokens": 109198943.0, + "step": 2860 + }, + { + "epoch": 0.3639486070474494, + "ewc_loss": 0.0060208518989384174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.020852015353739e-05, + "grad_norm": 3.3212926387786865, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8652849197387695, + "num_tokens": 109235719.0, + "step": 2861 + }, + { + "epoch": 0.36407581732603994, + "ewc_loss": 0.006066984497010708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.066984497010708e-05, + "grad_norm": 3.3368163108825684, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.850502610206604, + "num_tokens": 109272681.0, + "step": 2862 + }, + { + "epoch": 0.36420302760463047, + "ewc_loss": 0.006043539848178625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.043539906386286e-05, + "grad_norm": 3.355210304260254, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.845910370349884, + "num_tokens": 109306114.0, + "step": 2863 + }, + { + "epoch": 0.36433023788322094, + "ewc_loss": 0.006075636483728886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.0756363382097334e-05, + "grad_norm": 3.3948044776916504, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.849979817867279, + "num_tokens": 109337728.0, + "step": 2864 + }, + { + "epoch": 0.36445744816181147, + "ewc_loss": 0.006080427672713995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.080427556298673e-05, + "grad_norm": 3.3373799324035645, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8490438461303711, + "num_tokens": 109377809.0, + "step": 2865 + }, + { + "epoch": 0.364584658440402, + "ewc_loss": 0.006053233053535223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.053232937119901e-05, + "grad_norm": 3.271139621734619, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8719922304153442, + "num_tokens": 109413499.0, + "step": 2866 + }, + { + "epoch": 0.36471186871899247, + "ewc_loss": 0.0060402750968933105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.040275184204802e-05, + "grad_norm": 3.2765018939971924, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8479210734367371, + "num_tokens": 109460961.0, + "step": 2867 + }, + { + "epoch": 0.364839078997583, + "ewc_loss": 0.006057375576347113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0573755035875365e-05, + "grad_norm": 3.239379405975342, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8681207299232483, + "num_tokens": 109500615.0, + "step": 2868 + }, + { + "epoch": 0.36496628927617353, + "ewc_loss": 0.00602388521656394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.023885362083092e-05, + "grad_norm": 3.3469419479370117, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8616785407066345, + "num_tokens": 109533921.0, + "step": 2869 + }, + { + "epoch": 0.365093499554764, + "ewc_loss": 0.0060959504917263985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.095950448070653e-05, + "grad_norm": 3.398989677429199, + "learning_rate": 1e-06, + "loss": 0.462, + 
"mean_token_accuracy": 0.8470675349235535, + "num_tokens": 109567069.0, + "step": 2870 + }, + { + "epoch": 0.36522070983335453, + "ewc_loss": 0.006102355662733316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.1023554735584185e-05, + "grad_norm": 3.3023581504821777, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8700906038284302, + "num_tokens": 109603696.0, + "step": 2871 + }, + { + "epoch": 0.36534792011194506, + "ewc_loss": 0.006035066209733486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.035066326148808e-05, + "grad_norm": 3.2534525394439697, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8744706511497498, + "num_tokens": 109644590.0, + "step": 2872 + }, + { + "epoch": 0.36547513039053553, + "ewc_loss": 0.0060341074131429195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0341073549352586e-05, + "grad_norm": 3.354506015777588, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.852258563041687, + "num_tokens": 109678869.0, + "step": 2873 + }, + { + "epoch": 0.36560234066912606, + "ewc_loss": 0.006108480505645275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.108480738475919e-05, + "grad_norm": 3.290154218673706, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.853238582611084, + "num_tokens": 109722517.0, + "step": 2874 + }, + { + "epoch": 0.3657295509477166, + "ewc_loss": 0.006015857215970755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0158570704516023e-05, + "grad_norm": 3.353876829147339, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8547768592834473, + "num_tokens": 109757869.0, + "step": 2875 + }, + { + "epoch": 0.36585676122630706, + "ewc_loss": 0.00607341667637229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.073416807339527e-05, + "grad_norm": 3.3675975799560547, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8536766767501831, + "num_tokens": 109793162.0, + "step": 2876 + }, + { + "epoch": 
0.3659839715048976, + "ewc_loss": 0.0060753608122467995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.075360579416156e-05, + "grad_norm": 3.280576705932617, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8649518489837646, + "num_tokens": 109832942.0, + "step": 2877 + }, + { + "epoch": 0.3661111817834881, + "ewc_loss": 0.0060106124728918076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.010612560203299e-05, + "grad_norm": 3.2911806106567383, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8565260171890259, + "num_tokens": 109876230.0, + "step": 2878 + }, + { + "epoch": 0.3662383920620786, + "ewc_loss": 0.006059533916413784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.059533916413784e-05, + "grad_norm": 3.38250994682312, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8479360938072205, + "num_tokens": 109911844.0, + "step": 2879 + }, + { + "epoch": 0.3663656023406691, + "ewc_loss": 0.006094710435718298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.094710624893196e-05, + "grad_norm": 3.2847862243652344, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8455480337142944, + "num_tokens": 109957619.0, + "step": 2880 + }, + { + "epoch": 0.36649281261925964, + "ewc_loss": 0.006013117730617523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.013117672409862e-05, + "grad_norm": 3.2838971614837646, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8471895456314087, + "num_tokens": 109999117.0, + "step": 2881 + }, + { + "epoch": 0.3666200228978502, + "ewc_loss": 0.006051565520465374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0515652876347303e-05, + "grad_norm": 3.292383909225464, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8646050095558167, + "num_tokens": 110039557.0, + "step": 2882 + }, + { + "epoch": 0.36674723317644065, + "ewc_loss": 0.006046986673027277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.0469868913060054e-05, + "grad_norm": 3.3677685260772705, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8498435020446777, + "num_tokens": 110074418.0, + "step": 2883 + }, + { + "epoch": 0.3668744434550312, + "ewc_loss": 0.006088875699788332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0888756706845015e-05, + "grad_norm": 3.334660768508911, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8618899583816528, + "num_tokens": 110109255.0, + "step": 2884 + }, + { + "epoch": 0.3670016537336217, + "ewc_loss": 0.006050182040780783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.050182128092274e-05, + "grad_norm": 3.2827024459838867, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.86434006690979, + "num_tokens": 110150180.0, + "step": 2885 + }, + { + "epoch": 0.3671288640122122, + "ewc_loss": 0.006029148586094379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.029148426023312e-05, + "grad_norm": 3.3364169597625732, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8421182632446289, + "num_tokens": 110193653.0, + "step": 2886 + }, + { + "epoch": 0.3672560742908027, + "ewc_loss": 0.006073239259421825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0732392739737406e-05, + "grad_norm": 3.3474044799804688, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8655725717544556, + "num_tokens": 110231020.0, + "step": 2887 + }, + { + "epoch": 0.36738328456939323, + "ewc_loss": 0.006051955744624138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.051955642760731e-05, + "grad_norm": 3.298738718032837, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8737035989761353, + "num_tokens": 110269397.0, + "step": 2888 + }, + { + "epoch": 0.3675104948479837, + "ewc_loss": 0.006028466392308474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.028466304996982e-05, + "grad_norm": 3.287813663482666, + "learning_rate": 1e-06, + "loss": 0.3865, + 
"mean_token_accuracy": 0.8719048500061035, + "num_tokens": 110311188.0, + "step": 2889 + }, + { + "epoch": 0.36763770512657423, + "ewc_loss": 0.006042311433702707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.042311360943131e-05, + "grad_norm": 3.4125077724456787, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8627178072929382, + "num_tokens": 110343801.0, + "step": 2890 + }, + { + "epoch": 0.36776491540516476, + "ewc_loss": 0.006091732997447252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0917329392395914e-05, + "grad_norm": 3.3162760734558105, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8411492109298706, + "num_tokens": 110385830.0, + "step": 2891 + }, + { + "epoch": 0.36789212568375523, + "ewc_loss": 0.0060001760721206665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.000176290399395e-05, + "grad_norm": 3.364312171936035, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8573170304298401, + "num_tokens": 110421494.0, + "step": 2892 + }, + { + "epoch": 0.36801933596234576, + "ewc_loss": 0.006067199166864157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.067199137760326e-05, + "grad_norm": 3.303614616394043, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.856032133102417, + "num_tokens": 110463132.0, + "step": 2893 + }, + { + "epoch": 0.3681465462409363, + "ewc_loss": 0.00601805467158556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0180547734489664e-05, + "grad_norm": 3.336094617843628, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8607079982757568, + "num_tokens": 110498330.0, + "step": 2894 + }, + { + "epoch": 0.36827375651952676, + "ewc_loss": 0.006049070507287979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.049070361768827e-05, + "grad_norm": 3.3151612281799316, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8662512302398682, + "num_tokens": 110534438.0, + "step": 2895 + }, + { + "epoch": 
0.3684009667981173, + "ewc_loss": 0.006029952317476273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.029952419339679e-05, + "grad_norm": 3.3614981174468994, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8531954288482666, + "num_tokens": 110572059.0, + "step": 2896 + }, + { + "epoch": 0.3685281770767078, + "ewc_loss": 0.00605798838660121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0579885030165315e-05, + "grad_norm": 3.2920422554016113, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.860276997089386, + "num_tokens": 110611012.0, + "step": 2897 + }, + { + "epoch": 0.3686553873552983, + "ewc_loss": 0.006010074634104967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.01007450313773e-05, + "grad_norm": 3.3075757026672363, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.865676760673523, + "num_tokens": 110647429.0, + "step": 2898 + }, + { + "epoch": 0.3687825976338888, + "ewc_loss": 0.00604534475132823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.045344707672484e-05, + "grad_norm": 3.3530375957489014, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8525010347366333, + "num_tokens": 110685052.0, + "step": 2899 + }, + { + "epoch": 0.36890980791247935, + "ewc_loss": 0.006069953553378582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0699534515151754e-05, + "grad_norm": 3.3444509506225586, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8376598954200745, + "num_tokens": 110725222.0, + "step": 2900 + }, + { + "epoch": 0.3690370181910698, + "ewc_loss": 0.006042142398655415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.042142558726482e-05, + "grad_norm": 3.356172561645508, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8472223281860352, + "num_tokens": 110764314.0, + "step": 2901 + }, + { + "epoch": 0.36916422846966035, + "ewc_loss": 0.006071853451430798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.0718535678461194e-05, + "grad_norm": 3.2547531127929688, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8700711727142334, + "num_tokens": 110803585.0, + "step": 2902 + }, + { + "epoch": 0.3692914387482509, + "ewc_loss": 0.005999058950692415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.999059067107737e-05, + "grad_norm": 3.2848682403564453, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8530412912368774, + "num_tokens": 110846532.0, + "step": 2903 + }, + { + "epoch": 0.36941864902684135, + "ewc_loss": 0.0060488563030958176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.048856084817089e-05, + "grad_norm": 3.3086771965026855, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.856032133102417, + "num_tokens": 110889807.0, + "step": 2904 + }, + { + "epoch": 0.3695458593054319, + "ewc_loss": 0.006041331682354212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0413316532503814e-05, + "grad_norm": 3.3112120628356934, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8384486436843872, + "num_tokens": 110930567.0, + "step": 2905 + }, + { + "epoch": 0.3696730695840224, + "ewc_loss": 0.006021652836352587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.02165273448918e-05, + "grad_norm": 3.3352465629577637, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.86616450548172, + "num_tokens": 110968725.0, + "step": 2906 + }, + { + "epoch": 0.3698002798626129, + "ewc_loss": 0.006045124027878046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0451242461567745e-05, + "grad_norm": 3.307096242904663, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8813361525535583, + "num_tokens": 111002724.0, + "step": 2907 + }, + { + "epoch": 0.3699274901412034, + "ewc_loss": 0.006025140639394522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.02514082856942e-05, + "grad_norm": 3.361903429031372, + "learning_rate": 1e-06, + "loss": 0.4309, + 
"mean_token_accuracy": 0.8567276000976562, + "num_tokens": 111036668.0, + "step": 2908 + }, + { + "epoch": 0.37005470041979394, + "ewc_loss": 0.006062006112188101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0620062868110836e-05, + "grad_norm": 3.3357417583465576, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8477160930633545, + "num_tokens": 111077712.0, + "step": 2909 + }, + { + "epoch": 0.3701819106983844, + "ewc_loss": 0.0060251797549426556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0251797549426556e-05, + "grad_norm": 3.344326972961426, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8494328856468201, + "num_tokens": 111118856.0, + "step": 2910 + }, + { + "epoch": 0.37030912097697494, + "ewc_loss": 0.0060431682504713535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.043168104952201e-05, + "grad_norm": 3.314340114593506, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8496023416519165, + "num_tokens": 111156712.0, + "step": 2911 + }, + { + "epoch": 0.37043633125556547, + "ewc_loss": 0.00602155365049839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.021553781465627e-05, + "grad_norm": 3.3558037281036377, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8583352565765381, + "num_tokens": 111195542.0, + "step": 2912 + }, + { + "epoch": 0.37056354153415594, + "ewc_loss": 0.006048084236681461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.048084105714224e-05, + "grad_norm": 3.323051929473877, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8395279049873352, + "num_tokens": 111237827.0, + "step": 2913 + }, + { + "epoch": 0.37069075181274647, + "ewc_loss": 0.006012843921780586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.012844096403569e-05, + "grad_norm": 3.289311170578003, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8650879859924316, + "num_tokens": 111278802.0, + "step": 2914 + }, + { + "epoch": 
0.370817962091337, + "ewc_loss": 0.006002665963023901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0026661230949685e-05, + "grad_norm": 3.393723726272583, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8545109033584595, + "num_tokens": 111313795.0, + "step": 2915 + }, + { + "epoch": 0.37094517236992747, + "ewc_loss": 0.006076524965465069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.076524732634425e-05, + "grad_norm": 3.291978359222412, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8662082552909851, + "num_tokens": 111353798.0, + "step": 2916 + }, + { + "epoch": 0.371072382648518, + "ewc_loss": 0.005985313560813665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9853136917809024e-05, + "grad_norm": 3.3596668243408203, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8579497337341309, + "num_tokens": 111390613.0, + "step": 2917 + }, + { + "epoch": 0.3711995929271085, + "ewc_loss": 0.006055910140275955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.05591012572404e-05, + "grad_norm": 3.2789905071258545, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.858108401298523, + "num_tokens": 111432043.0, + "step": 2918 + }, + { + "epoch": 0.371326803205699, + "ewc_loss": 0.0059957485646009445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9957485063932836e-05, + "grad_norm": 3.2575762271881104, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8639490604400635, + "num_tokens": 111476575.0, + "step": 2919 + }, + { + "epoch": 0.3714540134842895, + "ewc_loss": 0.00601544976234436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.015449980623089e-05, + "grad_norm": 3.3456814289093018, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8576529026031494, + "num_tokens": 111515304.0, + "step": 2920 + }, + { + "epoch": 0.37158122376288005, + "ewc_loss": 0.006062516942620277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.062517059035599e-05, + "grad_norm": 3.3516576290130615, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8671555519104004, + "num_tokens": 111550246.0, + "step": 2921 + }, + { + "epoch": 0.3717084340414705, + "ewc_loss": 0.006025291979312897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.025291804689914e-05, + "grad_norm": 3.3213653564453125, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8587559461593628, + "num_tokens": 111592189.0, + "step": 2922 + }, + { + "epoch": 0.37183564432006105, + "ewc_loss": 0.0060147386975586414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0147387557663023e-05, + "grad_norm": 3.276585817337036, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8613069653511047, + "num_tokens": 111638177.0, + "step": 2923 + }, + { + "epoch": 0.3719628545986516, + "ewc_loss": 0.005999175366014242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.999175482429564e-05, + "grad_norm": 3.3822712898254395, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.858020544052124, + "num_tokens": 111674365.0, + "step": 2924 + }, + { + "epoch": 0.37209006487724205, + "ewc_loss": 0.006059091538190842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.059091538190842e-05, + "grad_norm": 3.3359036445617676, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8534755110740662, + "num_tokens": 111714571.0, + "step": 2925 + }, + { + "epoch": 0.3722172751558326, + "ewc_loss": 0.005995185114443302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.995184983476065e-05, + "grad_norm": 3.309771776199341, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8587955832481384, + "num_tokens": 111755214.0, + "step": 2926 + }, + { + "epoch": 0.3723444854344231, + "ewc_loss": 0.005992605816572905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.992606020299718e-05, + "grad_norm": 3.339901924133301, + "learning_rate": 1e-06, + "loss": 0.5115, + 
"mean_token_accuracy": 0.8317698240280151, + "num_tokens": 111796516.0, + "step": 2927 + }, + { + "epoch": 0.3724716957130136, + "ewc_loss": 0.0060243066400289536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0243066400289536e-05, + "grad_norm": 3.360016107559204, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.843868613243103, + "num_tokens": 111832615.0, + "step": 2928 + }, + { + "epoch": 0.3725989059916041, + "ewc_loss": 0.006020992062985897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.020992077537812e-05, + "grad_norm": 3.2956366539001465, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8481549024581909, + "num_tokens": 111874682.0, + "step": 2929 + }, + { + "epoch": 0.37272611627019464, + "ewc_loss": 0.005995102226734161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9951020375592634e-05, + "grad_norm": 3.326869249343872, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.846569299697876, + "num_tokens": 111916369.0, + "step": 2930 + }, + { + "epoch": 0.37285332654878517, + "ewc_loss": 0.006024165544658899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0241654864512384e-05, + "grad_norm": 3.3461050987243652, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8513424396514893, + "num_tokens": 111950671.0, + "step": 2931 + }, + { + "epoch": 0.37298053682737564, + "ewc_loss": 0.006017300765961409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.017300984240137e-05, + "grad_norm": 3.348006248474121, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8421775698661804, + "num_tokens": 111988846.0, + "step": 2932 + }, + { + "epoch": 0.37310774710596617, + "ewc_loss": 0.006029484793543816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.029484575265087e-05, + "grad_norm": 3.4017510414123535, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8682266473770142, + "num_tokens": 112022853.0, + "step": 2933 + }, + { + "epoch": 
0.3732349573845567, + "ewc_loss": 0.0060525513254106045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.052551179891452e-05, + "grad_norm": 3.2780423164367676, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8546739816665649, + "num_tokens": 112065631.0, + "step": 2934 + }, + { + "epoch": 0.37336216766314717, + "ewc_loss": 0.005978237371891737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9782374592032284e-05, + "grad_norm": 3.3210651874542236, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8464111089706421, + "num_tokens": 112107236.0, + "step": 2935 + }, + { + "epoch": 0.3734893779417377, + "ewc_loss": 0.006042039021849632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.042039240128361e-05, + "grad_norm": 3.280982494354248, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8362147808074951, + "num_tokens": 112153914.0, + "step": 2936 + }, + { + "epoch": 0.3736165882203282, + "ewc_loss": 0.006014061160385609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.014061000314541e-05, + "grad_norm": 3.4105329513549805, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8598469495773315, + "num_tokens": 112187596.0, + "step": 2937 + }, + { + "epoch": 0.3737437984989187, + "ewc_loss": 0.006090730894356966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0907306760782376e-05, + "grad_norm": 3.3331828117370605, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.857019305229187, + "num_tokens": 112222409.0, + "step": 2938 + }, + { + "epoch": 0.37387100877750923, + "ewc_loss": 0.006021690089255571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.021690205670893e-05, + "grad_norm": 3.404726505279541, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8453697562217712, + "num_tokens": 112257699.0, + "step": 2939 + }, + { + "epoch": 0.37399821905609976, + "ewc_loss": 0.006087036337703466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.087036308599636e-05, + "grad_norm": 3.3196451663970947, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8650667071342468, + "num_tokens": 112292431.0, + "step": 2940 + }, + { + "epoch": 0.37412542933469023, + "ewc_loss": 0.006031816825270653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.031816883478314e-05, + "grad_norm": 3.3078413009643555, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8581935167312622, + "num_tokens": 112331241.0, + "step": 2941 + }, + { + "epoch": 0.37425263961328076, + "ewc_loss": 0.006061595398932695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.061595559003763e-05, + "grad_norm": 3.3125364780426025, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8536905646324158, + "num_tokens": 112368486.0, + "step": 2942 + }, + { + "epoch": 0.3743798498918713, + "ewc_loss": 0.0060748811811208725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.074881093809381e-05, + "grad_norm": 3.3204386234283447, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8477915525436401, + "num_tokens": 112409638.0, + "step": 2943 + }, + { + "epoch": 0.37450706017046176, + "ewc_loss": 0.006069409195333719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0694092098856345e-05, + "grad_norm": 3.2744808197021484, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8529649972915649, + "num_tokens": 112448257.0, + "step": 2944 + }, + { + "epoch": 0.3746342704490523, + "ewc_loss": 0.00605802470818162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0580248828046024e-05, + "grad_norm": 3.2464780807495117, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.871125340461731, + "num_tokens": 112488739.0, + "step": 2945 + }, + { + "epoch": 0.3747614807276428, + "ewc_loss": 0.0060551841743290424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0551839851541445e-05, + "grad_norm": 3.3503739833831787, + "learning_rate": 1e-06, + "loss": 0.476, + 
"mean_token_accuracy": 0.8423784971237183, + "num_tokens": 112528621.0, + "step": 2946 + }, + { + "epoch": 0.3748886910062333, + "ewc_loss": 0.006114520598202944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.114520510891452e-05, + "grad_norm": 3.358229875564575, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.852406919002533, + "num_tokens": 112564615.0, + "step": 2947 + }, + { + "epoch": 0.3750159012848238, + "ewc_loss": 0.006088635418564081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.088635200285353e-05, + "grad_norm": 3.400409460067749, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8474633693695068, + "num_tokens": 112598190.0, + "step": 2948 + }, + { + "epoch": 0.37514311156341434, + "ewc_loss": 0.006109106354415417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.109106470830739e-05, + "grad_norm": 3.373891830444336, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8557429313659668, + "num_tokens": 112632078.0, + "step": 2949 + }, + { + "epoch": 0.3752703218420048, + "ewc_loss": 0.006074090022593737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.074089833418839e-05, + "grad_norm": 3.3550002574920654, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8338863253593445, + "num_tokens": 112667848.0, + "step": 2950 + }, + { + "epoch": 0.37539753212059535, + "ewc_loss": 0.006094689480960369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.094689524616115e-05, + "grad_norm": 3.300847053527832, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8548918962478638, + "num_tokens": 112707037.0, + "step": 2951 + }, + { + "epoch": 0.3755247423991859, + "ewc_loss": 0.006067883223295212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.06788344157394e-05, + "grad_norm": 3.2281510829925537, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8667829036712646, + "num_tokens": 112751921.0, + "step": 2952 + }, + { + "epoch": 
0.37565195267777635, + "ewc_loss": 0.0060453568585217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0453567130025476e-05, + "grad_norm": 3.2997186183929443, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.861478328704834, + "num_tokens": 112793061.0, + "step": 2953 + }, + { + "epoch": 0.3757791629563669, + "ewc_loss": 0.006096395198255777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.09639537287876e-05, + "grad_norm": 3.292124032974243, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8670127987861633, + "num_tokens": 112833852.0, + "step": 2954 + }, + { + "epoch": 0.3759063732349574, + "ewc_loss": 0.006080235354602337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.080235471017659e-05, + "grad_norm": 3.3802788257598877, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8540738821029663, + "num_tokens": 112866718.0, + "step": 2955 + }, + { + "epoch": 0.3760335835135479, + "ewc_loss": 0.006129909306764603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.12990916124545e-05, + "grad_norm": 3.299882650375366, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8584451675415039, + "num_tokens": 112909420.0, + "step": 2956 + }, + { + "epoch": 0.3761607937921384, + "ewc_loss": 0.006050231400877237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0502316046040505e-05, + "grad_norm": 3.3474462032318115, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8597827553749084, + "num_tokens": 112944570.0, + "step": 2957 + }, + { + "epoch": 0.37628800407072893, + "ewc_loss": 0.0061141615733504295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.114161806181073e-05, + "grad_norm": 3.28562593460083, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8684536814689636, + "num_tokens": 112984107.0, + "step": 2958 + }, + { + "epoch": 0.3764152143493194, + "ewc_loss": 0.006059057544916868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.059057704987936e-05, + "grad_norm": 3.2592058181762695, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8802086710929871, + "num_tokens": 113023066.0, + "step": 2959 + }, + { + "epoch": 0.37654242462790993, + "ewc_loss": 0.006072891876101494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0728918469976634e-05, + "grad_norm": 3.36930775642395, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8566240668296814, + "num_tokens": 113060093.0, + "step": 2960 + }, + { + "epoch": 0.37666963490650046, + "ewc_loss": 0.006125187035650015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.125187064753845e-05, + "grad_norm": 3.4239695072174072, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8484355211257935, + "num_tokens": 113095940.0, + "step": 2961 + }, + { + "epoch": 0.37679684518509093, + "ewc_loss": 0.00612177699804306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.121776823420078e-05, + "grad_norm": 3.3274874687194824, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8677332997322083, + "num_tokens": 113130699.0, + "step": 2962 + }, + { + "epoch": 0.37692405546368146, + "ewc_loss": 0.0060597630217671394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.059763109078631e-05, + "grad_norm": 3.305298089981079, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8566210269927979, + "num_tokens": 113169145.0, + "step": 2963 + }, + { + "epoch": 0.377051265742272, + "ewc_loss": 0.006087825167924166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0878250224050134e-05, + "grad_norm": 3.371107816696167, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.872275710105896, + "num_tokens": 113201987.0, + "step": 2964 + }, + { + "epoch": 0.37717847602086246, + "ewc_loss": 0.006121359299868345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.121359183453023e-05, + "grad_norm": 3.3389980792999268, + "learning_rate": 1e-06, + "loss": 0.4001, + 
"mean_token_accuracy": 0.8651555180549622, + "num_tokens": 113239588.0, + "step": 2965 + }, + { + "epoch": 0.377305686299453, + "ewc_loss": 0.006089568138122559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.089567978051491e-05, + "grad_norm": 3.31658935546875, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8581968545913696, + "num_tokens": 113277075.0, + "step": 2966 + }, + { + "epoch": 0.3774328965780435, + "ewc_loss": 0.006092917639762163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0929174651391804e-05, + "grad_norm": 3.303520441055298, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.884125292301178, + "num_tokens": 113313708.0, + "step": 2967 + }, + { + "epoch": 0.377560106856634, + "ewc_loss": 0.006101270671933889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.101270628278144e-05, + "grad_norm": 3.4253973960876465, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8411617875099182, + "num_tokens": 113344027.0, + "step": 2968 + }, + { + "epoch": 0.3776873171352245, + "ewc_loss": 0.006182405166327953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.182405195431784e-05, + "grad_norm": 3.2919039726257324, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8611037135124207, + "num_tokens": 113388444.0, + "step": 2969 + }, + { + "epoch": 0.37781452741381505, + "ewc_loss": 0.006061244290322065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.061244130250998e-05, + "grad_norm": 3.292950391769409, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8538705110549927, + "num_tokens": 113428660.0, + "step": 2970 + }, + { + "epoch": 0.3779417376924055, + "ewc_loss": 0.006111144553869963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.111144466558471e-05, + "grad_norm": 3.351712465286255, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8366389870643616, + "num_tokens": 113469160.0, + "step": 2971 + }, + { + "epoch": 
0.37806894797099605, + "ewc_loss": 0.006148168351501226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.148168176878244e-05, + "grad_norm": 3.388017416000366, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8341158032417297, + "num_tokens": 113504291.0, + "step": 2972 + }, + { + "epoch": 0.3781961582495866, + "ewc_loss": 0.006132268812507391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.13226875429973e-05, + "grad_norm": 3.245192766189575, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8590795993804932, + "num_tokens": 113548085.0, + "step": 2973 + }, + { + "epoch": 0.37832336852817705, + "ewc_loss": 0.006062713451683521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.062713509891182e-05, + "grad_norm": 3.3920671939849854, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.862615704536438, + "num_tokens": 113581664.0, + "step": 2974 + }, + { + "epoch": 0.3784505788067676, + "ewc_loss": 0.0061913467943668365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.191346619743854e-05, + "grad_norm": 3.3123886585235596, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8700860738754272, + "num_tokens": 113618552.0, + "step": 2975 + }, + { + "epoch": 0.3785777890853581, + "ewc_loss": 0.006087950896471739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.087950896471739e-05, + "grad_norm": 3.3910834789276123, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8381763696670532, + "num_tokens": 113653406.0, + "step": 2976 + }, + { + "epoch": 0.3787049993639486, + "ewc_loss": 0.006162410136312246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.162410136312246e-05, + "grad_norm": 3.2728521823883057, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8657439351081848, + "num_tokens": 113693803.0, + "step": 2977 + }, + { + "epoch": 0.3788322096425391, + "ewc_loss": 0.00606429623439908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.064296030672267e-05, + "grad_norm": 3.2945098876953125, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8719303607940674, + "num_tokens": 113734637.0, + "step": 2978 + }, + { + "epoch": 0.37895941992112964, + "ewc_loss": 0.006114650052040815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.114650022936985e-05, + "grad_norm": 3.30649733543396, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8496903777122498, + "num_tokens": 113776579.0, + "step": 2979 + }, + { + "epoch": 0.37908663019972016, + "ewc_loss": 0.006113667041063309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.113667041063309e-05, + "grad_norm": 3.3078930377960205, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8477507829666138, + "num_tokens": 113818857.0, + "step": 2980 + }, + { + "epoch": 0.37921384047831064, + "ewc_loss": 0.006106534507125616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.106534419814125e-05, + "grad_norm": 3.378138780593872, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8499626517295837, + "num_tokens": 113853248.0, + "step": 2981 + }, + { + "epoch": 0.37934105075690117, + "ewc_loss": 0.006160884164273739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.160884368000552e-05, + "grad_norm": 3.293365478515625, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8629558086395264, + "num_tokens": 113894206.0, + "step": 2982 + }, + { + "epoch": 0.3794682610354917, + "ewc_loss": 0.006089251022785902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0892511100973934e-05, + "grad_norm": 3.3582956790924072, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.864591658115387, + "num_tokens": 113931309.0, + "step": 2983 + }, + { + "epoch": 0.37959547131408217, + "ewc_loss": 0.006136631593108177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.136631418485194e-05, + "grad_norm": 3.2836849689483643, + "learning_rate": 1e-06, + "loss": 0.458, + 
"mean_token_accuracy": 0.8467444181442261, + "num_tokens": 113976559.0, + "step": 2984 + }, + { + "epoch": 0.3797226815926727, + "ewc_loss": 0.006078108213841915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.078108344809152e-05, + "grad_norm": 3.3804991245269775, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8575127124786377, + "num_tokens": 114010814.0, + "step": 2985 + }, + { + "epoch": 0.3798498918712632, + "ewc_loss": 0.006149984896183014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.149984983494505e-05, + "grad_norm": 3.287752151489258, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8752155900001526, + "num_tokens": 114049389.0, + "step": 2986 + }, + { + "epoch": 0.3799771021498537, + "ewc_loss": 0.006079425103962421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.079424929339439e-05, + "grad_norm": 3.3704171180725098, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8754544854164124, + "num_tokens": 114081463.0, + "step": 2987 + }, + { + "epoch": 0.3801043124284442, + "ewc_loss": 0.0061448416672647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.144841609057039e-05, + "grad_norm": 3.328540325164795, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8742795586585999, + "num_tokens": 114117175.0, + "step": 2988 + }, + { + "epoch": 0.38023152270703475, + "ewc_loss": 0.006093548145145178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0935479268664494e-05, + "grad_norm": 3.361685276031494, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8453142046928406, + "num_tokens": 114153764.0, + "step": 2989 + }, + { + "epoch": 0.3803587329856252, + "ewc_loss": 0.00611724192276597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.117241719039157e-05, + "grad_norm": 3.3237392902374268, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.85625159740448, + "num_tokens": 114191487.0, + "step": 2990 + }, + { + "epoch": 
0.38048594326421575, + "ewc_loss": 0.0060865082778036594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.086508437874727e-05, + "grad_norm": 3.3527746200561523, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8339626789093018, + "num_tokens": 114233880.0, + "step": 2991 + }, + { + "epoch": 0.3806131535428063, + "ewc_loss": 0.006107914727181196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.107914668973535e-05, + "grad_norm": 3.350316047668457, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8552277088165283, + "num_tokens": 114269238.0, + "step": 2992 + }, + { + "epoch": 0.38074036382139675, + "ewc_loss": 0.006103403400629759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.103403211454861e-05, + "grad_norm": 3.326596736907959, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8699957132339478, + "num_tokens": 114307645.0, + "step": 2993 + }, + { + "epoch": 0.3808675740999873, + "ewc_loss": 0.006088678725063801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.088678492233157e-05, + "grad_norm": 3.380632162094116, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8616716861724854, + "num_tokens": 114341800.0, + "step": 2994 + }, + { + "epoch": 0.3809947843785778, + "ewc_loss": 0.0061287106946110725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.128710811026394e-05, + "grad_norm": 3.3239808082580566, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8671067357063293, + "num_tokens": 114378363.0, + "step": 2995 + }, + { + "epoch": 0.3811219946571683, + "ewc_loss": 0.006074296310544014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.074296106817201e-05, + "grad_norm": 3.4167532920837402, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8463579416275024, + "num_tokens": 114414489.0, + "step": 2996 + }, + { + "epoch": 0.3812492049357588, + "ewc_loss": 0.006151242181658745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.151242268970236e-05, + "grad_norm": 3.3415403366088867, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8532077074050903, + "num_tokens": 114449197.0, + "step": 2997 + }, + { + "epoch": 0.38137641521434934, + "ewc_loss": 0.0060893092304468155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.089309317758307e-05, + "grad_norm": 3.367985725402832, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8544996976852417, + "num_tokens": 114478654.0, + "step": 2998 + }, + { + "epoch": 0.3815036254929398, + "ewc_loss": 0.006134296767413616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.134296563686803e-05, + "grad_norm": 3.263237476348877, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8665802478790283, + "num_tokens": 114521807.0, + "step": 2999 + }, + { + "epoch": 0.38163083577153034, + "ewc_loss": 0.006088050082325935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.088049849495292e-05, + "grad_norm": 3.3324689865112305, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.871203601360321, + "num_tokens": 114558297.0, + "step": 3000 + }, + { + "epoch": 0.38175804605012087, + "ewc_loss": 0.006148630753159523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.148630927782506e-05, + "grad_norm": 3.318845748901367, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8637872338294983, + "num_tokens": 114599630.0, + "step": 3001 + }, + { + "epoch": 0.38188525632871134, + "ewc_loss": 0.0061219483613967896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.121948536019772e-05, + "grad_norm": 3.352278470993042, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8539224863052368, + "num_tokens": 114638697.0, + "step": 3002 + }, + { + "epoch": 0.38201246660730187, + "ewc_loss": 0.006141050718724728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.14105083514005e-05, + "grad_norm": 3.2908592224121094, + "learning_rate": 1e-06, + "loss": 0.4231, + 
"mean_token_accuracy": 0.8579757213592529, + "num_tokens": 114676106.0, + "step": 3003 + }, + { + "epoch": 0.3821396768858924, + "ewc_loss": 0.0061074672266840935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.107467197580263e-05, + "grad_norm": 3.3726699352264404, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8554903864860535, + "num_tokens": 114712026.0, + "step": 3004 + }, + { + "epoch": 0.38226688716448287, + "ewc_loss": 0.006174259819090366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.174259760882705e-05, + "grad_norm": 3.336611032485962, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8367695808410645, + "num_tokens": 114752348.0, + "step": 3005 + }, + { + "epoch": 0.3823940974430734, + "ewc_loss": 0.006131480913609266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.131480768090114e-05, + "grad_norm": 3.314680337905884, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8626114726066589, + "num_tokens": 114789922.0, + "step": 3006 + }, + { + "epoch": 0.38252130772166393, + "ewc_loss": 0.006143996026366949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.143996142782271e-05, + "grad_norm": 3.276156187057495, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.852089524269104, + "num_tokens": 114838834.0, + "step": 3007 + }, + { + "epoch": 0.3826485180002544, + "ewc_loss": 0.006118535529822111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.118535384302959e-05, + "grad_norm": 3.42010235786438, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8373463153839111, + "num_tokens": 114873320.0, + "step": 3008 + }, + { + "epoch": 0.38277572827884493, + "ewc_loss": 0.006201175041496754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.201174983289093e-05, + "grad_norm": 3.322357177734375, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8688225746154785, + "num_tokens": 114908293.0, + "step": 3009 + }, + { + "epoch": 
0.38290293855743546, + "ewc_loss": 0.0061055137775838375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.105513602960855e-05, + "grad_norm": 3.3019933700561523, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8489130139350891, + "num_tokens": 114950083.0, + "step": 3010 + }, + { + "epoch": 0.38303014883602593, + "ewc_loss": 0.0061334422789514065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.133442366262898e-05, + "grad_norm": 3.3799686431884766, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8629318475723267, + "num_tokens": 114982353.0, + "step": 3011 + }, + { + "epoch": 0.38315735911461646, + "ewc_loss": 0.006176832597702742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.176832539495081e-05, + "grad_norm": 3.322626829147339, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8633652329444885, + "num_tokens": 115017668.0, + "step": 3012 + }, + { + "epoch": 0.383284569393207, + "ewc_loss": 0.006121970247477293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.121970363892615e-05, + "grad_norm": 3.3093039989471436, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8702182769775391, + "num_tokens": 115055266.0, + "step": 3013 + }, + { + "epoch": 0.38341177967179746, + "ewc_loss": 0.006132907699793577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.132907583378255e-05, + "grad_norm": 3.3783793449401855, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8532720804214478, + "num_tokens": 115089728.0, + "step": 3014 + }, + { + "epoch": 0.383538989950388, + "ewc_loss": 0.006174451671540737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.17445184616372e-05, + "grad_norm": 3.278195858001709, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8421797156333923, + "num_tokens": 115135192.0, + "step": 3015 + }, + { + "epoch": 0.3836662002289785, + "ewc_loss": 0.006092578638345003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.09257876931224e-05, + "grad_norm": 3.438075542449951, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8409393429756165, + "num_tokens": 115170350.0, + "step": 3016 + }, + { + "epoch": 0.383793410507569, + "ewc_loss": 0.006231776438653469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.231776205822825e-05, + "grad_norm": 3.4089128971099854, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8651061654090881, + "num_tokens": 115202724.0, + "step": 3017 + }, + { + "epoch": 0.3839206207861595, + "ewc_loss": 0.006155205424875021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.155205483082682e-05, + "grad_norm": 3.375002145767212, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8571187257766724, + "num_tokens": 115239274.0, + "step": 3018 + }, + { + "epoch": 0.38404783106475004, + "ewc_loss": 0.006136614829301834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.136614683782682e-05, + "grad_norm": 3.3421359062194824, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8660115599632263, + "num_tokens": 115278330.0, + "step": 3019 + }, + { + "epoch": 0.3841750413433405, + "ewc_loss": 0.006147691514343023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.147691601654515e-05, + "grad_norm": 3.308654308319092, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8537410497665405, + "num_tokens": 115321258.0, + "step": 3020 + }, + { + "epoch": 0.38430225162193105, + "ewc_loss": 0.006128693465143442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.12869334872812e-05, + "grad_norm": 3.331521987915039, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8593249320983887, + "num_tokens": 115362609.0, + "step": 3021 + }, + { + "epoch": 0.3844294619005216, + "ewc_loss": 0.006158304866403341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.158305041026324e-05, + "grad_norm": 3.3745267391204834, + "learning_rate": 1e-06, + "loss": 0.5193, + 
"mean_token_accuracy": 0.8311181664466858, + "num_tokens": 115405814.0, + "step": 3022 + }, + { + "epoch": 0.38455667217911205, + "ewc_loss": 0.006165916565805674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.165916420286521e-05, + "grad_norm": 3.3782825469970703, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8515315651893616, + "num_tokens": 115440856.0, + "step": 3023 + }, + { + "epoch": 0.3846838824577026, + "ewc_loss": 0.006163795478641987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.163795478641987e-05, + "grad_norm": 3.316394090652466, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8637312650680542, + "num_tokens": 115482582.0, + "step": 3024 + }, + { + "epoch": 0.3848110927362931, + "ewc_loss": 0.006136185489594936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.136185402283445e-05, + "grad_norm": 3.2927968502044678, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8488766551017761, + "num_tokens": 115527268.0, + "step": 3025 + }, + { + "epoch": 0.3849383030148836, + "ewc_loss": 0.006134555675089359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.134555587777868e-05, + "grad_norm": 3.394371509552002, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8530300855636597, + "num_tokens": 115563757.0, + "step": 3026 + }, + { + "epoch": 0.3850655132934741, + "ewc_loss": 0.006207006052136421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.2070059357211e-05, + "grad_norm": 3.308029890060425, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8609778881072998, + "num_tokens": 115599629.0, + "step": 3027 + }, + { + "epoch": 0.38519272357206463, + "ewc_loss": 0.006123743019998074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.12374278716743e-05, + "grad_norm": 3.382369041442871, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8619803786277771, + "num_tokens": 115634261.0, + "step": 3028 + }, + { + "epoch": 
0.3853199338506551, + "ewc_loss": 0.006205519661307335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.205519457580522e-05, + "grad_norm": 3.312228202819824, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.870331883430481, + "num_tokens": 115671150.0, + "step": 3029 + }, + { + "epoch": 0.38544714412924563, + "ewc_loss": 0.006152323447167873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.152323476271704e-05, + "grad_norm": 3.284677743911743, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8715643286705017, + "num_tokens": 115714328.0, + "step": 3030 + }, + { + "epoch": 0.38557435440783616, + "ewc_loss": 0.006142238155007362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.142238271422684e-05, + "grad_norm": 3.358748197555542, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8728477358818054, + "num_tokens": 115747118.0, + "step": 3031 + }, + { + "epoch": 0.3857015646864267, + "ewc_loss": 0.006210533436387777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.210533319972456e-05, + "grad_norm": 3.3068127632141113, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8375778198242188, + "num_tokens": 115790309.0, + "step": 3032 + }, + { + "epoch": 0.38582877496501716, + "ewc_loss": 0.006132109556347132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.13210941082798e-05, + "grad_norm": 3.315014362335205, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8698150515556335, + "num_tokens": 115825630.0, + "step": 3033 + }, + { + "epoch": 0.3859559852436077, + "ewc_loss": 0.006162954960018396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.162955105537549e-05, + "grad_norm": 3.3788228034973145, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8660500049591064, + "num_tokens": 115858848.0, + "step": 3034 + }, + { + "epoch": 0.3860831955221982, + "ewc_loss": 0.006190611980855465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.190611748024821e-05, + "grad_norm": 3.277298927307129, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8680377006530762, + "num_tokens": 115900024.0, + "step": 3035 + }, + { + "epoch": 0.3862104058007887, + "ewc_loss": 0.006113157141953707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.113156996434554e-05, + "grad_norm": 3.391601085662842, + "learning_rate": 1e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8330344557762146, + "num_tokens": 115938899.0, + "step": 3036 + }, + { + "epoch": 0.3863376160793792, + "ewc_loss": 0.006213740445673466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.213740562088788e-05, + "grad_norm": 3.4030299186706543, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8648855090141296, + "num_tokens": 115972199.0, + "step": 3037 + }, + { + "epoch": 0.38646482635796975, + "ewc_loss": 0.006176568567752838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.176568422233686e-05, + "grad_norm": 3.305166006088257, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.869097113609314, + "num_tokens": 116011506.0, + "step": 3038 + }, + { + "epoch": 0.3865920366365602, + "ewc_loss": 0.006117815151810646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.117815064499155e-05, + "grad_norm": 3.3475983142852783, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8351382613182068, + "num_tokens": 116051428.0, + "step": 3039 + }, + { + "epoch": 0.38671924691515075, + "ewc_loss": 0.006182612385600805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.182612560223788e-05, + "grad_norm": 3.3023598194122314, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8669837713241577, + "num_tokens": 116090307.0, + "step": 3040 + }, + { + "epoch": 0.3868464571937413, + "ewc_loss": 0.006128870882093906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.128870882093906e-05, + "grad_norm": 3.3876359462738037, + "learning_rate": 1e-06, + "loss": 0.4376, + 
"mean_token_accuracy": 0.8495529890060425, + "num_tokens": 116123381.0, + "step": 3041 + }, + { + "epoch": 0.38697366747233175, + "ewc_loss": 0.006196026690304279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.196026515681297e-05, + "grad_norm": 3.3855738639831543, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8613792657852173, + "num_tokens": 116157755.0, + "step": 3042 + }, + { + "epoch": 0.3871008777509223, + "ewc_loss": 0.006161686033010483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.161686178529635e-05, + "grad_norm": 3.329310894012451, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.859742283821106, + "num_tokens": 116197371.0, + "step": 3043 + }, + { + "epoch": 0.3872280880295128, + "ewc_loss": 0.006148113403469324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.148113607196137e-05, + "grad_norm": 3.338273525238037, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8434058427810669, + "num_tokens": 116240544.0, + "step": 3044 + }, + { + "epoch": 0.3873552983081033, + "ewc_loss": 0.006171329878270626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.171329732751474e-05, + "grad_norm": 3.448089838027954, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8552472591400146, + "num_tokens": 116272079.0, + "step": 3045 + }, + { + "epoch": 0.3874825085866938, + "ewc_loss": 0.006242100615054369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.242100789677352e-05, + "grad_norm": 3.292227029800415, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8561873435974121, + "num_tokens": 116309641.0, + "step": 3046 + }, + { + "epoch": 0.38760971886528434, + "ewc_loss": 0.006129089742898941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.129089888418093e-05, + "grad_norm": 3.354240894317627, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8529831171035767, + "num_tokens": 116344470.0, + "step": 3047 + }, + { + "epoch": 
0.3877369291438748, + "ewc_loss": 0.006217284593731165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.217284681042656e-05, + "grad_norm": 3.3451099395751953, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8600610494613647, + "num_tokens": 116380177.0, + "step": 3048 + }, + { + "epoch": 0.38786413942246534, + "ewc_loss": 0.006188856903463602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.18885678704828e-05, + "grad_norm": 3.3328723907470703, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8690037131309509, + "num_tokens": 116416482.0, + "step": 3049 + }, + { + "epoch": 0.38799134970105587, + "ewc_loss": 0.00619706604629755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.197066250024363e-05, + "grad_norm": 3.331190824508667, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8444840312004089, + "num_tokens": 116456103.0, + "step": 3050 + }, + { + "epoch": 0.38811855997964634, + "ewc_loss": 0.006197376176714897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.197376205818728e-05, + "grad_norm": 3.2816832065582275, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8643622398376465, + "num_tokens": 116496810.0, + "step": 3051 + }, + { + "epoch": 0.38824577025823687, + "ewc_loss": 0.006163704674690962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.16370452917181e-05, + "grad_norm": 3.3671646118164062, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8538440465927124, + "num_tokens": 116532999.0, + "step": 3052 + }, + { + "epoch": 0.3883729805368274, + "ewc_loss": 0.006233538966625929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.233539170352742e-05, + "grad_norm": 3.4516613483428955, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8672474026679993, + "num_tokens": 116565700.0, + "step": 3053 + }, + { + "epoch": 0.38850019081541787, + "ewc_loss": 0.0062709166668355465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.270916492212564e-05, + "grad_norm": 3.314243793487549, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8629376292228699, + "num_tokens": 116602798.0, + "step": 3054 + }, + { + "epoch": 0.3886274010940084, + "ewc_loss": 0.006164217367768288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.16421748418361e-05, + "grad_norm": 3.3202807903289795, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.870453953742981, + "num_tokens": 116638743.0, + "step": 3055 + }, + { + "epoch": 0.3887546113725989, + "ewc_loss": 0.006221516523510218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.221516377991065e-05, + "grad_norm": 3.3540427684783936, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8710140585899353, + "num_tokens": 116672290.0, + "step": 3056 + }, + { + "epoch": 0.3888818216511894, + "ewc_loss": 0.006219713948667049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.219714123290032e-05, + "grad_norm": 3.312798500061035, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8593935966491699, + "num_tokens": 116712593.0, + "step": 3057 + }, + { + "epoch": 0.3890090319297799, + "ewc_loss": 0.006188476458191872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.188476254465058e-05, + "grad_norm": 3.3888485431671143, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8574525117874146, + "num_tokens": 116748663.0, + "step": 3058 + }, + { + "epoch": 0.38913624220837045, + "ewc_loss": 0.006251349113881588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.25134925940074e-05, + "grad_norm": 3.2662038803100586, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8545144200325012, + "num_tokens": 116796018.0, + "step": 3059 + }, + { + "epoch": 0.3892634524869609, + "ewc_loss": 0.006163584999740124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.163585203466937e-05, + "grad_norm": 3.3152213096618652, + "learning_rate": 1e-06, + "loss": 0.4347, + 
"mean_token_accuracy": 0.8560066223144531, + "num_tokens": 116837287.0, + "step": 3060 + }, + { + "epoch": 0.38939066276555145, + "ewc_loss": 0.006219570059329271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.219570059329271e-05, + "grad_norm": 3.4411730766296387, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8422487378120422, + "num_tokens": 116872163.0, + "step": 3061 + }, + { + "epoch": 0.389517873044142, + "ewc_loss": 0.006284496281296015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.284496339503676e-05, + "grad_norm": 3.2859952449798584, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8654299974441528, + "num_tokens": 116913445.0, + "step": 3062 + }, + { + "epoch": 0.38964508332273246, + "ewc_loss": 0.0061552636325359344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.155263690743595e-05, + "grad_norm": 3.2854838371276855, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8497800827026367, + "num_tokens": 116954469.0, + "step": 3063 + }, + { + "epoch": 0.389772293601323, + "ewc_loss": 0.006220304872840643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.220304931048304e-05, + "grad_norm": 3.325255870819092, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8576364517211914, + "num_tokens": 116994339.0, + "step": 3064 + }, + { + "epoch": 0.3898995038799135, + "ewc_loss": 0.006220946554094553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.220946670509875e-05, + "grad_norm": 3.3186986446380615, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8712201118469238, + "num_tokens": 117030867.0, + "step": 3065 + }, + { + "epoch": 0.390026714158504, + "ewc_loss": 0.006202880293130875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.202880467753857e-05, + "grad_norm": 3.3430278301239014, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8667097091674805, + "num_tokens": 117068309.0, + "step": 3066 + }, + { + "epoch": 
0.3901539244370945, + "ewc_loss": 0.006210876628756523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.210876745171845e-05, + "grad_norm": 3.3472423553466797, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8537667393684387, + "num_tokens": 117105278.0, + "step": 3067 + }, + { + "epoch": 0.39028113471568504, + "ewc_loss": 0.006214506924152374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.214506720425561e-05, + "grad_norm": 3.370677947998047, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8494960069656372, + "num_tokens": 117143096.0, + "step": 3068 + }, + { + "epoch": 0.3904083449942755, + "ewc_loss": 0.006213964894413948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.213964661583304e-05, + "grad_norm": 3.3180694580078125, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8583278656005859, + "num_tokens": 117179638.0, + "step": 3069 + }, + { + "epoch": 0.39053555527286604, + "ewc_loss": 0.006169163156300783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.16916295257397e-05, + "grad_norm": 3.438044548034668, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8707056641578674, + "num_tokens": 117211560.0, + "step": 3070 + }, + { + "epoch": 0.39066276555145657, + "ewc_loss": 0.006250329315662384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.250329170143232e-05, + "grad_norm": 3.325150966644287, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8501695394515991, + "num_tokens": 117250622.0, + "step": 3071 + }, + { + "epoch": 0.39078997583004704, + "ewc_loss": 0.006149821914732456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.149822002043948e-05, + "grad_norm": 3.2854316234588623, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8789167404174805, + "num_tokens": 117287600.0, + "step": 3072 + }, + { + "epoch": 0.39091718610863757, + "ewc_loss": 0.006167666986584663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.167667015688494e-05, + "grad_norm": 3.3419086933135986, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8430241346359253, + "num_tokens": 117328188.0, + "step": 3073 + }, + { + "epoch": 0.3910443963872281, + "ewc_loss": 0.006217106245458126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.217106420081109e-05, + "grad_norm": 3.3366477489471436, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8668252229690552, + "num_tokens": 117367282.0, + "step": 3074 + }, + { + "epoch": 0.39117160666581857, + "ewc_loss": 0.006185980513691902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.185980601003394e-05, + "grad_norm": 3.351304292678833, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8703111410140991, + "num_tokens": 117403376.0, + "step": 3075 + }, + { + "epoch": 0.3912988169444091, + "ewc_loss": 0.0061889695934951305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.1889695643913e-05, + "grad_norm": 3.3930513858795166, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8509343266487122, + "num_tokens": 117438661.0, + "step": 3076 + }, + { + "epoch": 0.39142602722299963, + "ewc_loss": 0.006214634981006384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.214634777279571e-05, + "grad_norm": 3.393913507461548, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8624746799468994, + "num_tokens": 117477351.0, + "step": 3077 + }, + { + "epoch": 0.3915532375015901, + "ewc_loss": 0.006183614954352379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.183615187183022e-05, + "grad_norm": 3.3348922729492188, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8574601411819458, + "num_tokens": 117512773.0, + "step": 3078 + }, + { + "epoch": 0.39168044778018063, + "ewc_loss": 0.006161389406770468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.161389319458976e-05, + "grad_norm": 3.364896774291992, + "learning_rate": 1e-06, + "loss": 0.4785, + 
"mean_token_accuracy": 0.8424219489097595, + "num_tokens": 117555370.0, + "step": 3079 + }, + { + "epoch": 0.39180765805877116, + "ewc_loss": 0.006190516520291567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.190516432980075e-05, + "grad_norm": 3.318866491317749, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8659494519233704, + "num_tokens": 117594208.0, + "step": 3080 + }, + { + "epoch": 0.3919348683373617, + "ewc_loss": 0.0061356984078884125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.135698640719056e-05, + "grad_norm": 3.327220916748047, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8618577718734741, + "num_tokens": 117632715.0, + "step": 3081 + }, + { + "epoch": 0.39206207861595216, + "ewc_loss": 0.006159798242151737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.159798067528754e-05, + "grad_norm": 3.3299245834350586, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8533278703689575, + "num_tokens": 117669221.0, + "step": 3082 + }, + { + "epoch": 0.3921892888945427, + "ewc_loss": 0.006156235001981258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.156235031085089e-05, + "grad_norm": 3.3067421913146973, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8513272404670715, + "num_tokens": 117709559.0, + "step": 3083 + }, + { + "epoch": 0.3923164991731332, + "ewc_loss": 0.006134543102234602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.134543218649924e-05, + "grad_norm": 3.3231303691864014, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8646400570869446, + "num_tokens": 117745240.0, + "step": 3084 + }, + { + "epoch": 0.3924437094517237, + "ewc_loss": 0.006167554296553135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.167554238345474e-05, + "grad_norm": 3.287404775619507, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8616762161254883, + "num_tokens": 117787897.0, + "step": 3085 + }, + { + "epoch": 
0.3925709197303142, + "ewc_loss": 0.006131549831479788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.131549889687449e-05, + "grad_norm": 3.355630874633789, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8545349836349487, + "num_tokens": 117826862.0, + "step": 3086 + }, + { + "epoch": 0.39269813000890474, + "ewc_loss": 0.006172689609229565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.172689609229565e-05, + "grad_norm": 3.3183722496032715, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8544323444366455, + "num_tokens": 117866932.0, + "step": 3087 + }, + { + "epoch": 0.3928253402874952, + "ewc_loss": 0.006124632433056831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.124632636783645e-05, + "grad_norm": 3.3113553524017334, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8600696325302124, + "num_tokens": 117905538.0, + "step": 3088 + }, + { + "epoch": 0.39295255056608575, + "ewc_loss": 0.006142615806311369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.14261589362286e-05, + "grad_norm": 3.3471007347106934, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8468233346939087, + "num_tokens": 117945123.0, + "step": 3089 + }, + { + "epoch": 0.3930797608446763, + "ewc_loss": 0.006154260132461786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.1542603361886e-05, + "grad_norm": 3.2678112983703613, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8611980676651001, + "num_tokens": 117988743.0, + "step": 3090 + }, + { + "epoch": 0.39320697112326675, + "ewc_loss": 0.006101259496062994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.1012597143417224e-05, + "grad_norm": 3.2916104793548584, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8723556995391846, + "num_tokens": 118026078.0, + "step": 3091 + }, + { + "epoch": 0.3933341814018573, + "ewc_loss": 0.00613265810534358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.132658018032089e-05, + "grad_norm": 3.3430981636047363, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8707540035247803, + "num_tokens": 118060105.0, + "step": 3092 + }, + { + "epoch": 0.3934613916804478, + "ewc_loss": 0.00614666473120451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.146664964035153e-05, + "grad_norm": 3.331209659576416, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8875865936279297, + "num_tokens": 118089677.0, + "step": 3093 + }, + { + "epoch": 0.3935886019590383, + "ewc_loss": 0.006116051226854324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.116051372373477e-05, + "grad_norm": 3.2886104583740234, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8721545338630676, + "num_tokens": 118128449.0, + "step": 3094 + }, + { + "epoch": 0.3937158122376288, + "ewc_loss": 0.006108633708208799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.108633533585817e-05, + "grad_norm": 3.3492865562438965, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8685483932495117, + "num_tokens": 118168411.0, + "step": 3095 + }, + { + "epoch": 0.39384302251621933, + "ewc_loss": 0.00615327525883913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.153275171527639e-05, + "grad_norm": 3.3261313438415527, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.869696855545044, + "num_tokens": 118206097.0, + "step": 3096 + }, + { + "epoch": 0.3939702327948098, + "ewc_loss": 0.006118934601545334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.118934834375978e-05, + "grad_norm": 3.314417600631714, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8716798424720764, + "num_tokens": 118244429.0, + "step": 3097 + }, + { + "epoch": 0.39409744307340033, + "ewc_loss": 0.0061291358433663845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.129135726951063e-05, + "grad_norm": 3.362872838973999, + "learning_rate": 1e-06, + "loss": 0.4093, + 
"mean_token_accuracy": 0.8651412725448608, + "num_tokens": 118279125.0, + "step": 3098 + }, + { + "epoch": 0.39422465335199086, + "ewc_loss": 0.006152893416583538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.152893183752894e-05, + "grad_norm": 3.3922598361968994, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8584701418876648, + "num_tokens": 118313156.0, + "step": 3099 + }, + { + "epoch": 0.39435186363058133, + "ewc_loss": 0.006153042893856764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.153043068479747e-05, + "grad_norm": 3.3543624877929688, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8464046120643616, + "num_tokens": 118349868.0, + "step": 3100 + }, + { + "epoch": 0.39447907390917186, + "ewc_loss": 0.006132188718765974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.132188718765974e-05, + "grad_norm": 3.3010244369506836, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8495386838912964, + "num_tokens": 118394998.0, + "step": 3101 + }, + { + "epoch": 0.3946062841877624, + "ewc_loss": 0.006113666109740734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.113666313467547e-05, + "grad_norm": 3.2943904399871826, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8710289001464844, + "num_tokens": 118436002.0, + "step": 3102 + }, + { + "epoch": 0.39473349446635286, + "ewc_loss": 0.006124717649072409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.12471776548773e-05, + "grad_norm": 3.3461663722991943, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8637079000473022, + "num_tokens": 118475094.0, + "step": 3103 + }, + { + "epoch": 0.3948607047449434, + "ewc_loss": 0.006151553709059954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.151553679956123e-05, + "grad_norm": 3.3424789905548096, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8564885258674622, + "num_tokens": 118514164.0, + "step": 3104 + }, + { + "epoch": 
0.3949879150235339, + "ewc_loss": 0.006134750787168741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.134750583441928e-05, + "grad_norm": 3.3283979892730713, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8553944826126099, + "num_tokens": 118554271.0, + "step": 3105 + }, + { + "epoch": 0.3951151253021244, + "ewc_loss": 0.006132064387202263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.132064299890772e-05, + "grad_norm": 3.2973978519439697, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8588792085647583, + "num_tokens": 118597123.0, + "step": 3106 + }, + { + "epoch": 0.3952423355807149, + "ewc_loss": 0.006123632192611694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.123632192611694e-05, + "grad_norm": 3.322774887084961, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8539036512374878, + "num_tokens": 118636683.0, + "step": 3107 + }, + { + "epoch": 0.39536954585930545, + "ewc_loss": 0.006137243937700987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.137244054116309e-05, + "grad_norm": 3.4586000442504883, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8624412417411804, + "num_tokens": 118665770.0, + "step": 3108 + }, + { + "epoch": 0.3954967561378959, + "ewc_loss": 0.006211168598383665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.211168511072174e-05, + "grad_norm": 3.3242697715759277, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8690714836120605, + "num_tokens": 118701652.0, + "step": 3109 + }, + { + "epoch": 0.39562396641648645, + "ewc_loss": 0.006088850554078817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0888505686307326e-05, + "grad_norm": 3.281629800796509, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8652095794677734, + "num_tokens": 118741756.0, + "step": 3110 + }, + { + "epoch": 0.395751176695077, + "ewc_loss": 0.006114794872701168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.114794814493507e-05, + "grad_norm": 3.272336721420288, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8600070476531982, + "num_tokens": 118784320.0, + "step": 3111 + }, + { + "epoch": 0.39587838697366745, + "ewc_loss": 0.006134998053312302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.13499796600081e-05, + "grad_norm": 3.350928783416748, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8541697263717651, + "num_tokens": 118821456.0, + "step": 3112 + }, + { + "epoch": 0.396005597252258, + "ewc_loss": 0.006172689609229565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.172689609229565e-05, + "grad_norm": 3.331975221633911, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8581200242042542, + "num_tokens": 118861746.0, + "step": 3113 + }, + { + "epoch": 0.3961328075308485, + "ewc_loss": 0.006139207631349564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.139207835076377e-05, + "grad_norm": 3.3514790534973145, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8650656938552856, + "num_tokens": 118900346.0, + "step": 3114 + }, + { + "epoch": 0.396260017809439, + "ewc_loss": 0.006161998491734266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.161998317111284e-05, + "grad_norm": 3.306833505630493, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.859796404838562, + "num_tokens": 118939707.0, + "step": 3115 + }, + { + "epoch": 0.3963872280880295, + "ewc_loss": 0.006137436721473932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.137436866993085e-05, + "grad_norm": 3.3922181129455566, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8586685657501221, + "num_tokens": 118971154.0, + "step": 3116 + }, + { + "epoch": 0.39651443836662004, + "ewc_loss": 0.0061972183175385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.1972183175385e-05, + "grad_norm": 3.3059709072113037, + "learning_rate": 1e-06, + "loss": 0.4813, + 
"mean_token_accuracy": 0.8436683416366577, + "num_tokens": 119012344.0, + "step": 3117 + }, + { + "epoch": 0.3966416486452105, + "ewc_loss": 0.0061287712305784225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.128771201474592e-05, + "grad_norm": 3.3550844192504883, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8578776717185974, + "num_tokens": 119047677.0, + "step": 3118 + }, + { + "epoch": 0.39676885892380104, + "ewc_loss": 0.006201603449881077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.201603537192568e-05, + "grad_norm": 3.3032467365264893, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8718783259391785, + "num_tokens": 119086357.0, + "step": 3119 + }, + { + "epoch": 0.39689606920239157, + "ewc_loss": 0.00615667225793004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.156672316137701e-05, + "grad_norm": 3.3379032611846924, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8671018481254578, + "num_tokens": 119122854.0, + "step": 3120 + }, + { + "epoch": 0.39702327948098204, + "ewc_loss": 0.0062045743688941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.204574310686439e-05, + "grad_norm": 3.3820931911468506, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8670002222061157, + "num_tokens": 119155449.0, + "step": 3121 + }, + { + "epoch": 0.39715048975957257, + "ewc_loss": 0.006220431532710791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.22043153271079e-05, + "grad_norm": 3.31486177444458, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8538491725921631, + "num_tokens": 119197287.0, + "step": 3122 + }, + { + "epoch": 0.3972777000381631, + "ewc_loss": 0.006169626489281654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.169626431073993e-05, + "grad_norm": 3.338912010192871, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8478050231933594, + "num_tokens": 119235749.0, + "step": 3123 + }, + { + "epoch": 
0.39740491031675357, + "ewc_loss": 0.006214124150574207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.214124005055055e-05, + "grad_norm": 3.3855230808258057, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8563515543937683, + "num_tokens": 119267282.0, + "step": 3124 + }, + { + "epoch": 0.3975321205953441, + "ewc_loss": 0.006243328098207712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.243328243726864e-05, + "grad_norm": 3.3307108879089355, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8519396781921387, + "num_tokens": 119304387.0, + "step": 3125 + }, + { + "epoch": 0.3976593308739346, + "ewc_loss": 0.00619416031986475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.194160232553259e-05, + "grad_norm": 3.3219377994537354, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8491442203521729, + "num_tokens": 119341470.0, + "step": 3126 + }, + { + "epoch": 0.3977865411525251, + "ewc_loss": 0.006224749609827995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.224749813554808e-05, + "grad_norm": 3.3116977214813232, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.848892331123352, + "num_tokens": 119381067.0, + "step": 3127 + }, + { + "epoch": 0.3979137514311156, + "ewc_loss": 0.006209655664861202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.209655839484185e-05, + "grad_norm": 3.3911755084991455, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.858741283416748, + "num_tokens": 119415680.0, + "step": 3128 + }, + { + "epoch": 0.39804096170970615, + "ewc_loss": 0.006257900036871433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.25789980404079e-05, + "grad_norm": 3.345170736312866, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.848918080329895, + "num_tokens": 119454300.0, + "step": 3129 + }, + { + "epoch": 0.3981681719882967, + "ewc_loss": 0.006211812607944012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.211812433321029e-05, + "grad_norm": 3.4104201793670654, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8700239658355713, + "num_tokens": 119485599.0, + "step": 3130 + }, + { + "epoch": 0.39829538226688715, + "ewc_loss": 0.0062676286324858665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.267628486966714e-05, + "grad_norm": 3.3232641220092773, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8438898324966431, + "num_tokens": 119525820.0, + "step": 3131 + }, + { + "epoch": 0.3984225925454777, + "ewc_loss": 0.006205163896083832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.205163663253188e-05, + "grad_norm": 3.295646905899048, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8646655082702637, + "num_tokens": 119568216.0, + "step": 3132 + }, + { + "epoch": 0.3985498028240682, + "ewc_loss": 0.0062138112261891365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.213811138877645e-05, + "grad_norm": 3.4220783710479736, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8455349206924438, + "num_tokens": 119602914.0, + "step": 3133 + }, + { + "epoch": 0.3986770131026587, + "ewc_loss": 0.006296737585216761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.296737410593778e-05, + "grad_norm": 3.3736040592193604, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8552525043487549, + "num_tokens": 119637024.0, + "step": 3134 + }, + { + "epoch": 0.3988042233812492, + "ewc_loss": 0.006223234347999096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.223234231583774e-05, + "grad_norm": 3.3317205905914307, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8508687019348145, + "num_tokens": 119679265.0, + "step": 3135 + }, + { + "epoch": 0.39893143365983974, + "ewc_loss": 0.00622129812836647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.22129809926264e-05, + "grad_norm": 3.371656656265259, + "learning_rate": 1e-06, + "loss": 0.4419, + 
"mean_token_accuracy": 0.8492510318756104, + "num_tokens": 119716254.0, + "step": 3136 + }, + { + "epoch": 0.3990586439384302, + "ewc_loss": 0.006256005726754665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.256005872273818e-05, + "grad_norm": 3.2513294219970703, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8515833020210266, + "num_tokens": 119761325.0, + "step": 3137 + }, + { + "epoch": 0.39918585421702074, + "ewc_loss": 0.0061768339946866035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.176833994686604e-05, + "grad_norm": 3.3379571437835693, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8805396556854248, + "num_tokens": 119795468.0, + "step": 3138 + }, + { + "epoch": 0.39931306449561127, + "ewc_loss": 0.006255522835999727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.255522748688236e-05, + "grad_norm": 3.338045358657837, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8505972623825073, + "num_tokens": 119833804.0, + "step": 3139 + }, + { + "epoch": 0.39944027477420174, + "ewc_loss": 0.006234173662960529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.234173633856699e-05, + "grad_norm": 3.3504927158355713, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8539755344390869, + "num_tokens": 119870852.0, + "step": 3140 + }, + { + "epoch": 0.39956748505279227, + "ewc_loss": 0.006235350854694843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.235350883798674e-05, + "grad_norm": 3.321669101715088, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8616139888763428, + "num_tokens": 119909258.0, + "step": 3141 + }, + { + "epoch": 0.3996946953313828, + "ewc_loss": 0.006219150964170694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.219150964170694e-05, + "grad_norm": 3.4319472312927246, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8477092981338501, + "num_tokens": 119940701.0, + "step": 3142 + }, + { + "epoch": 
0.39982190560997327, + "ewc_loss": 0.006289070006459951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.289070006459951e-05, + "grad_norm": 3.3394060134887695, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8593025207519531, + "num_tokens": 119976780.0, + "step": 3143 + }, + { + "epoch": 0.3999491158885638, + "ewc_loss": 0.006210329011082649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.210328865563497e-05, + "grad_norm": 3.309211015701294, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8526096343994141, + "num_tokens": 120017543.0, + "step": 3144 + }, + { + "epoch": 0.40007632616715433, + "ewc_loss": 0.006230227183550596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.230227154446766e-05, + "grad_norm": 3.4434969425201416, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8561095595359802, + "num_tokens": 120050036.0, + "step": 3145 + }, + { + "epoch": 0.4002035364457448, + "ewc_loss": 0.006319031585007906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.319031672319397e-05, + "grad_norm": 3.3669066429138184, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8731030821800232, + "num_tokens": 120084582.0, + "step": 3146 + }, + { + "epoch": 0.40033074672433533, + "ewc_loss": 0.006233008578419685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.233008753042668e-05, + "grad_norm": 3.3498246669769287, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8542816638946533, + "num_tokens": 120120218.0, + "step": 3147 + }, + { + "epoch": 0.40045795700292586, + "ewc_loss": 0.006246349774301052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.246349948924035e-05, + "grad_norm": 3.2437903881073, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8803460597991943, + "num_tokens": 120161650.0, + "step": 3148 + }, + { + "epoch": 0.40058516728151633, + "ewc_loss": 0.006208682432770729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.208682316355407e-05, + "grad_norm": 3.3597843647003174, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.868450403213501, + "num_tokens": 120197409.0, + "step": 3149 + }, + { + "epoch": 0.40071237756010686, + "ewc_loss": 0.006293041165918112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.293041224125773e-05, + "grad_norm": 3.292989492416382, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8459564447402954, + "num_tokens": 120240700.0, + "step": 3150 + }, + { + "epoch": 0.4008395878386974, + "ewc_loss": 0.006205650977790356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.205651152413338e-05, + "grad_norm": 3.3121209144592285, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8488831520080566, + "num_tokens": 120286741.0, + "step": 3151 + }, + { + "epoch": 0.40096679811728786, + "ewc_loss": 0.006232962943613529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.232962914509699e-05, + "grad_norm": 3.368065118789673, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8604699373245239, + "num_tokens": 120323236.0, + "step": 3152 + }, + { + "epoch": 0.4010940083958784, + "ewc_loss": 0.006271324120461941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.271323945838958e-05, + "grad_norm": 3.359022855758667, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8568634986877441, + "num_tokens": 120362185.0, + "step": 3153 + }, + { + "epoch": 0.4012212186744689, + "ewc_loss": 0.006230575498193502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.230575672816485e-05, + "grad_norm": 3.375675916671753, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8502954244613647, + "num_tokens": 120397524.0, + "step": 3154 + }, + { + "epoch": 0.4013484289530594, + "ewc_loss": 0.0062408181838691235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.240818038349971e-05, + "grad_norm": 3.2918317317962646, + "learning_rate": 1e-06, + "loss": 0.4169, + 
"mean_token_accuracy": 0.8619201183319092, + "num_tokens": 120439723.0, + "step": 3155 + }, + { + "epoch": 0.4014756392316499, + "ewc_loss": 0.006201269570738077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.201269570738077e-05, + "grad_norm": 3.3144302368164062, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.861435055732727, + "num_tokens": 120483693.0, + "step": 3156 + }, + { + "epoch": 0.40160284951024044, + "ewc_loss": 0.006233873311430216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.233873136807233e-05, + "grad_norm": 3.325939416885376, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8696684837341309, + "num_tokens": 120520558.0, + "step": 3157 + }, + { + "epoch": 0.4017300597888309, + "ewc_loss": 0.006217099726200104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.217099871719256e-05, + "grad_norm": 3.341703176498413, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.867718517780304, + "num_tokens": 120557901.0, + "step": 3158 + }, + { + "epoch": 0.40185727006742145, + "ewc_loss": 0.006223693490028381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.223693344509229e-05, + "grad_norm": 3.340386152267456, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8650092482566833, + "num_tokens": 120593851.0, + "step": 3159 + }, + { + "epoch": 0.401984480346012, + "ewc_loss": 0.006210843101143837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.21084327576682e-05, + "grad_norm": 3.4815993309020996, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8511542677879333, + "num_tokens": 120624533.0, + "step": 3160 + }, + { + "epoch": 0.40211169062460245, + "ewc_loss": 0.006279020570218563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.279020453803241e-05, + "grad_norm": 3.345125675201416, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8546419143676758, + "num_tokens": 120660820.0, + "step": 3161 + }, + { + "epoch": 
0.402238900903193, + "ewc_loss": 0.0061773331835865974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.177333125378937e-05, + "grad_norm": 3.294638156890869, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8712433576583862, + "num_tokens": 120701485.0, + "step": 3162 + }, + { + "epoch": 0.4023661111817835, + "ewc_loss": 0.006186651531606913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.186651444295421e-05, + "grad_norm": 3.328155994415283, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8679242134094238, + "num_tokens": 120736677.0, + "step": 3163 + }, + { + "epoch": 0.402493321460374, + "ewc_loss": 0.006223627831786871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.223627860890701e-05, + "grad_norm": 3.3850913047790527, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.857417106628418, + "num_tokens": 120773933.0, + "step": 3164 + }, + { + "epoch": 0.4026205317389645, + "ewc_loss": 0.0062449658289551735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.244966061785817e-05, + "grad_norm": 3.398303985595703, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8643044233322144, + "num_tokens": 120804671.0, + "step": 3165 + }, + { + "epoch": 0.40274774201755503, + "ewc_loss": 0.0062259952537715435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.225995457498357e-05, + "grad_norm": 3.4663610458374023, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8640203475952148, + "num_tokens": 120837042.0, + "step": 3166 + }, + { + "epoch": 0.4028749522961455, + "ewc_loss": 0.006268922705203295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.268922879826277e-05, + "grad_norm": 3.322958469390869, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8565057516098022, + "num_tokens": 120875127.0, + "step": 3167 + }, + { + "epoch": 0.40300216257473603, + "ewc_loss": 0.006177890580147505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.177890463732183e-05, + "grad_norm": 3.3312039375305176, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8570908308029175, + "num_tokens": 120915937.0, + "step": 3168 + }, + { + "epoch": 0.40312937285332656, + "ewc_loss": 0.006232711020857096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.232711166376248e-05, + "grad_norm": 3.352311849594116, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8446196913719177, + "num_tokens": 120953334.0, + "step": 3169 + }, + { + "epoch": 0.40325658313191703, + "ewc_loss": 0.006246277131140232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.246277189347893e-05, + "grad_norm": 3.378103733062744, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8601753115653992, + "num_tokens": 120986674.0, + "step": 3170 + }, + { + "epoch": 0.40338379341050756, + "ewc_loss": 0.006254943087697029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.254942854866385e-05, + "grad_norm": 3.3101329803466797, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8651927709579468, + "num_tokens": 121026305.0, + "step": 3171 + }, + { + "epoch": 0.4035110036890981, + "ewc_loss": 0.006227546837180853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.2275466916617e-05, + "grad_norm": 3.3363654613494873, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8720711469650269, + "num_tokens": 121061644.0, + "step": 3172 + }, + { + "epoch": 0.40363821396768856, + "ewc_loss": 0.00626266049221158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.26266046310775e-05, + "grad_norm": 3.2969870567321777, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8552538752555847, + "num_tokens": 121102860.0, + "step": 3173 + }, + { + "epoch": 0.4037654242462791, + "ewc_loss": 0.006223437376320362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.22343723080121e-05, + "grad_norm": 3.326890468597412, + "learning_rate": 1e-06, + "loss": 0.3796, + 
"mean_token_accuracy": 0.8713655471801758, + "num_tokens": 121136566.0, + "step": 3174 + }, + { + "epoch": 0.4038926345248696, + "ewc_loss": 0.006265320349484682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.265320553211495e-05, + "grad_norm": 3.3351526260375977, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8690512776374817, + "num_tokens": 121173759.0, + "step": 3175 + }, + { + "epoch": 0.4040198448034601, + "ewc_loss": 0.0062421830371022224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.242183007998392e-05, + "grad_norm": 3.285242795944214, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8601159453392029, + "num_tokens": 121219881.0, + "step": 3176 + }, + { + "epoch": 0.4041470550820506, + "ewc_loss": 0.006221058778464794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.221058720257133e-05, + "grad_norm": 3.3683559894561768, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.857562780380249, + "num_tokens": 121258719.0, + "step": 3177 + }, + { + "epoch": 0.40427426536064115, + "ewc_loss": 0.006283903960138559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.283904076553881e-05, + "grad_norm": 3.355403423309326, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8366842269897461, + "num_tokens": 121298013.0, + "step": 3178 + }, + { + "epoch": 0.4044014756392316, + "ewc_loss": 0.006238989066332579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.238988862605765e-05, + "grad_norm": 3.4381613731384277, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8712557554244995, + "num_tokens": 121329615.0, + "step": 3179 + }, + { + "epoch": 0.40452868591782215, + "ewc_loss": 0.006282959133386612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.282958929659799e-05, + "grad_norm": 3.319530725479126, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8694037795066833, + "num_tokens": 121365199.0, + "step": 3180 + }, + { + "epoch": 
0.4046558961964127, + "ewc_loss": 0.006188265047967434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.188265251694247e-05, + "grad_norm": 3.4085850715637207, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8561751842498779, + "num_tokens": 121402393.0, + "step": 3181 + }, + { + "epoch": 0.4047831064750032, + "ewc_loss": 0.006278380285948515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.278380169533193e-05, + "grad_norm": 3.321417808532715, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8595414161682129, + "num_tokens": 121439678.0, + "step": 3182 + }, + { + "epoch": 0.4049103167535937, + "ewc_loss": 0.006190044805407524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.190044950926676e-05, + "grad_norm": 3.31968092918396, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8541715145111084, + "num_tokens": 121480781.0, + "step": 3183 + }, + { + "epoch": 0.4050375270321842, + "ewc_loss": 0.00622805580496788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.228056008694693e-05, + "grad_norm": 3.289365768432617, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8656487464904785, + "num_tokens": 121517614.0, + "step": 3184 + }, + { + "epoch": 0.40516473731077474, + "ewc_loss": 0.006197712384164333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.197712355060503e-05, + "grad_norm": 3.368530511856079, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8574665188789368, + "num_tokens": 121552225.0, + "step": 3185 + }, + { + "epoch": 0.4052919475893652, + "ewc_loss": 0.006254149135202169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.254149047890678e-05, + "grad_norm": 3.4028122425079346, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8534858226776123, + "num_tokens": 121587177.0, + "step": 3186 + }, + { + "epoch": 0.40541915786795574, + "ewc_loss": 0.006263709627091885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.263709656195715e-05, + "grad_norm": 3.421346426010132, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8409315943717957, + "num_tokens": 121619547.0, + "step": 3187 + }, + { + "epoch": 0.40554636814654627, + "ewc_loss": 0.006258082576096058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.258082430576906e-05, + "grad_norm": 3.304453134536743, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8655412197113037, + "num_tokens": 121657396.0, + "step": 3188 + }, + { + "epoch": 0.40567357842513674, + "ewc_loss": 0.006198402959853411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.198402843438089e-05, + "grad_norm": 3.3925676345825195, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8424578905105591, + "num_tokens": 121691867.0, + "step": 3189 + }, + { + "epoch": 0.40580078870372727, + "ewc_loss": 0.006303609348833561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.303609552560374e-05, + "grad_norm": 3.4159417152404785, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8553767204284668, + "num_tokens": 121723546.0, + "step": 3190 + }, + { + "epoch": 0.4059279989823178, + "ewc_loss": 0.006279686000198126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.279686203924939e-05, + "grad_norm": 3.3113412857055664, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8589626550674438, + "num_tokens": 121764589.0, + "step": 3191 + }, + { + "epoch": 0.40605520926090827, + "ewc_loss": 0.006237940397113562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.237940397113562e-05, + "grad_norm": 3.3546977043151855, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8482745885848999, + "num_tokens": 121803281.0, + "step": 3192 + }, + { + "epoch": 0.4061824195394988, + "ewc_loss": 0.006290512625128031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.290512828854844e-05, + "grad_norm": 3.3690836429595947, + "learning_rate": 1e-06, + "loss": 0.4649, + 
"mean_token_accuracy": 0.8464187383651733, + "num_tokens": 121841710.0, + "step": 3193 + }, + { + "epoch": 0.4063096298180893, + "ewc_loss": 0.006277794484049082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.277794454945251e-05, + "grad_norm": 3.3357603549957275, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8570213913917542, + "num_tokens": 121881979.0, + "step": 3194 + }, + { + "epoch": 0.4064368400966798, + "ewc_loss": 0.006256251595914364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.256251799641177e-05, + "grad_norm": 3.341923475265503, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8627956509590149, + "num_tokens": 121921599.0, + "step": 3195 + }, + { + "epoch": 0.4065640503752703, + "ewc_loss": 0.0062765683978796005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.276568456087261e-05, + "grad_norm": 3.334399461746216, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8679397106170654, + "num_tokens": 121960545.0, + "step": 3196 + }, + { + "epoch": 0.40669126065386085, + "ewc_loss": 0.006258795969188213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.258796202018857e-05, + "grad_norm": 3.3499205112457275, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8408653140068054, + "num_tokens": 121999259.0, + "step": 3197 + }, + { + "epoch": 0.4068184709324513, + "ewc_loss": 0.0062742577865719795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.274257611948997e-05, + "grad_norm": 3.3023335933685303, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8711993098258972, + "num_tokens": 122039066.0, + "step": 3198 + }, + { + "epoch": 0.40694568121104185, + "ewc_loss": 0.006228461395949125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.228461279533803e-05, + "grad_norm": 3.2686920166015625, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8661715388298035, + "num_tokens": 122085035.0, + "step": 3199 + }, + { + "epoch": 
0.4070728914896324, + "ewc_loss": 0.006217525340616703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.217525515239686e-05, + "grad_norm": 3.2841241359710693, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8726816177368164, + "num_tokens": 122128007.0, + "step": 3200 + }, + { + "epoch": 0.40720010176822286, + "ewc_loss": 0.0062256548553705215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.225654942682013e-05, + "grad_norm": 3.3198049068450928, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8575774431228638, + "num_tokens": 122169623.0, + "step": 3201 + }, + { + "epoch": 0.4073273120468134, + "ewc_loss": 0.006223164498806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.223164382390678e-05, + "grad_norm": 3.4628329277038574, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8530102372169495, + "num_tokens": 122201755.0, + "step": 3202 + }, + { + "epoch": 0.4074545223254039, + "ewc_loss": 0.006301598623394966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.301598477875814e-05, + "grad_norm": 3.4160804748535156, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8701832890510559, + "num_tokens": 122233680.0, + "step": 3203 + }, + { + "epoch": 0.4075817326039944, + "ewc_loss": 0.006238146219402552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.238146306714043e-05, + "grad_norm": 3.4793219566345215, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8562729358673096, + "num_tokens": 122265107.0, + "step": 3204 + }, + { + "epoch": 0.4077089428825849, + "ewc_loss": 0.006283729337155819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.283729453571141e-05, + "grad_norm": 3.301950693130493, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8638631105422974, + "num_tokens": 122302750.0, + "step": 3205 + }, + { + "epoch": 0.40783615316117544, + "ewc_loss": 0.00617770804092288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.177707837196067e-05, + "grad_norm": 3.275542736053467, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8493612408638, + "num_tokens": 122346820.0, + "step": 3206 + }, + { + "epoch": 0.4079633634397659, + "ewc_loss": 0.006218137219548225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.218137423275039e-05, + "grad_norm": 3.37135648727417, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8639678955078125, + "num_tokens": 122383254.0, + "step": 3207 + }, + { + "epoch": 0.40809057371835644, + "ewc_loss": 0.006280533969402313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.280533852986991e-05, + "grad_norm": 3.345521926879883, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8712838292121887, + "num_tokens": 122416693.0, + "step": 3208 + }, + { + "epoch": 0.40821778399694697, + "ewc_loss": 0.006229829974472523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.229829887161031e-05, + "grad_norm": 3.388343572616577, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8649462461471558, + "num_tokens": 122452403.0, + "step": 3209 + }, + { + "epoch": 0.40834499427553744, + "ewc_loss": 0.006264756433665752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.264756666496396e-05, + "grad_norm": 3.3400166034698486, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8491074442863464, + "num_tokens": 122489685.0, + "step": 3210 + }, + { + "epoch": 0.40847220455412797, + "ewc_loss": 0.006240888498723507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.240888615138829e-05, + "grad_norm": 3.3304710388183594, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8430461883544922, + "num_tokens": 122529079.0, + "step": 3211 + }, + { + "epoch": 0.4085994148327185, + "ewc_loss": 0.006240504328161478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.2405044445768e-05, + "grad_norm": 3.299995183944702, + "learning_rate": 1e-06, + "loss": 0.4419, + 
"mean_token_accuracy": 0.8531074523925781, + "num_tokens": 122569933.0, + "step": 3212 + }, + { + "epoch": 0.40872662511130897, + "ewc_loss": 0.0062378523871302605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.23785235802643e-05, + "grad_norm": 3.399631977081299, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8407674431800842, + "num_tokens": 122607884.0, + "step": 3213 + }, + { + "epoch": 0.4088538353898995, + "ewc_loss": 0.006289436481893063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.289436714723706e-05, + "grad_norm": 3.264268398284912, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8601782321929932, + "num_tokens": 122651968.0, + "step": 3214 + }, + { + "epoch": 0.40898104566849003, + "ewc_loss": 0.006195493042469025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.195493187988177e-05, + "grad_norm": 3.406991958618164, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8628579378128052, + "num_tokens": 122688680.0, + "step": 3215 + }, + { + "epoch": 0.4091082559470805, + "ewc_loss": 0.006317671854048967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.317671795841306e-05, + "grad_norm": 3.3058011531829834, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8646202087402344, + "num_tokens": 122728271.0, + "step": 3216 + }, + { + "epoch": 0.40923546622567103, + "ewc_loss": 0.006211788393557072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.211788422660902e-05, + "grad_norm": 3.484304428100586, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8595836162567139, + "num_tokens": 122759691.0, + "step": 3217 + }, + { + "epoch": 0.40936267650426156, + "ewc_loss": 0.006342495791614056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.342495908029377e-05, + "grad_norm": 3.305887460708618, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8576184511184692, + "num_tokens": 122800185.0, + "step": 3218 + }, + { + "epoch": 
0.40948988678285203, + "ewc_loss": 0.006205890327692032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.205890531418845e-05, + "grad_norm": 3.3268234729766846, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8693743348121643, + "num_tokens": 122839548.0, + "step": 3219 + }, + { + "epoch": 0.40961709706144256, + "ewc_loss": 0.00624935794621706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.249357829801738e-05, + "grad_norm": 3.3146555423736572, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8457911014556885, + "num_tokens": 122879621.0, + "step": 3220 + }, + { + "epoch": 0.4097443073400331, + "ewc_loss": 0.006240318063646555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.240318180061877e-05, + "grad_norm": 3.3472611904144287, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8545967936515808, + "num_tokens": 122921495.0, + "step": 3221 + }, + { + "epoch": 0.40987151761862356, + "ewc_loss": 0.006234646774828434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.234646571101621e-05, + "grad_norm": 3.3663511276245117, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8657639026641846, + "num_tokens": 122957197.0, + "step": 3222 + }, + { + "epoch": 0.4099987278972141, + "ewc_loss": 0.006234942004084587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.234941974980757e-05, + "grad_norm": 3.3739137649536133, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8406462669372559, + "num_tokens": 122999999.0, + "step": 3223 + }, + { + "epoch": 0.4101259381758046, + "ewc_loss": 0.0062265461310744286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.22654624748975e-05, + "grad_norm": 3.347517490386963, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8636962175369263, + "num_tokens": 123040402.0, + "step": 3224 + }, + { + "epoch": 0.4102531484543951, + "ewc_loss": 0.0062110950239002705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.21109502390027e-05, + "grad_norm": 3.281372308731079, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8628376126289368, + "num_tokens": 123080086.0, + "step": 3225 + }, + { + "epoch": 0.4103803587329856, + "ewc_loss": 0.006186325568705797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.186325481394306e-05, + "grad_norm": 3.3096680641174316, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8606934547424316, + "num_tokens": 123120058.0, + "step": 3226 + }, + { + "epoch": 0.41050756901157615, + "ewc_loss": 0.006210209336131811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.210209539858624e-05, + "grad_norm": 3.3047492504119873, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8607126474380493, + "num_tokens": 123162003.0, + "step": 3227 + }, + { + "epoch": 0.4106347792901666, + "ewc_loss": 0.006184076424688101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.184076482895762e-05, + "grad_norm": 3.4222970008850098, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.827053427696228, + "num_tokens": 123202249.0, + "step": 3228 + }, + { + "epoch": 0.41076198956875715, + "ewc_loss": 0.006252797320485115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.252797174965963e-05, + "grad_norm": 3.43048095703125, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8531802892684937, + "num_tokens": 123237401.0, + "step": 3229 + }, + { + "epoch": 0.4108891998473477, + "ewc_loss": 0.006212878972291946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.212879088707268e-05, + "grad_norm": 3.3370139598846436, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8753140568733215, + "num_tokens": 123269364.0, + "step": 3230 + }, + { + "epoch": 0.4110164101259382, + "ewc_loss": 0.006169281434267759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.169281550683081e-05, + "grad_norm": 3.353888750076294, + "learning_rate": 1e-06, + "loss": 0.4185, + 
"mean_token_accuracy": 0.862205982208252, + "num_tokens": 123307743.0, + "step": 3231 + }, + { + "epoch": 0.4111436204045287, + "ewc_loss": 0.006215987261384726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.215987377800047e-05, + "grad_norm": 3.384335994720459, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8640154600143433, + "num_tokens": 123340102.0, + "step": 3232 + }, + { + "epoch": 0.4112708306831192, + "ewc_loss": 0.006220517214387655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.220517389010638e-05, + "grad_norm": 3.371464967727661, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8537558317184448, + "num_tokens": 123374956.0, + "step": 3233 + }, + { + "epoch": 0.41139804096170973, + "ewc_loss": 0.0062054963782429695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.205496174516156e-05, + "grad_norm": 3.3041813373565674, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8684637546539307, + "num_tokens": 123418070.0, + "step": 3234 + }, + { + "epoch": 0.4115252512403002, + "ewc_loss": 0.006193450186401606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.193450099090114e-05, + "grad_norm": 3.385718584060669, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8577466011047363, + "num_tokens": 123453183.0, + "step": 3235 + }, + { + "epoch": 0.41165246151889073, + "ewc_loss": 0.00626097759231925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.260977534111589e-05, + "grad_norm": 3.3611788749694824, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.860566258430481, + "num_tokens": 123491526.0, + "step": 3236 + }, + { + "epoch": 0.41177967179748126, + "ewc_loss": 0.006222161930054426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.222161755431443e-05, + "grad_norm": 3.3387608528137207, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8612951040267944, + "num_tokens": 123529743.0, + "step": 3237 + }, + { + "epoch": 
0.41190688207607173, + "ewc_loss": 0.006223815493285656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.223815580597147e-05, + "grad_norm": 3.3230600357055664, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8614914417266846, + "num_tokens": 123573402.0, + "step": 3238 + }, + { + "epoch": 0.41203409235466226, + "ewc_loss": 0.006223505828529596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.223505624802783e-05, + "grad_norm": 3.453787326812744, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8571093082427979, + "num_tokens": 123604167.0, + "step": 3239 + }, + { + "epoch": 0.4121613026332528, + "ewc_loss": 0.006302876863628626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.302876863628626e-05, + "grad_norm": 3.271589994430542, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.855849027633667, + "num_tokens": 123648104.0, + "step": 3240 + }, + { + "epoch": 0.41228851291184326, + "ewc_loss": 0.006161817815154791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.161817873362452e-05, + "grad_norm": 3.302644968032837, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8491805791854858, + "num_tokens": 123689678.0, + "step": 3241 + }, + { + "epoch": 0.4124157231904338, + "ewc_loss": 0.006254356354475021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.254356412682682e-05, + "grad_norm": 3.2902157306671143, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8679589033126831, + "num_tokens": 123729257.0, + "step": 3242 + }, + { + "epoch": 0.4125429334690243, + "ewc_loss": 0.006234229542315006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.234229658730328e-05, + "grad_norm": 3.3064279556274414, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8607147932052612, + "num_tokens": 123772251.0, + "step": 3243 + }, + { + "epoch": 0.4126701437476148, + "ewc_loss": 0.00623093219473958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.23093219473958e-05, + "grad_norm": 3.329355239868164, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8709454536437988, + "num_tokens": 123810345.0, + "step": 3244 + }, + { + "epoch": 0.4127973540262053, + "ewc_loss": 0.006247666198760271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.24766616965644e-05, + "grad_norm": 3.3107097148895264, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8557045459747314, + "num_tokens": 123855276.0, + "step": 3245 + }, + { + "epoch": 0.41292456430479585, + "ewc_loss": 0.006210995838046074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.210996070876718e-05, + "grad_norm": 3.316053867340088, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8626716136932373, + "num_tokens": 123896613.0, + "step": 3246 + }, + { + "epoch": 0.4130517745833863, + "ewc_loss": 0.00621730787679553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.217307964107022e-05, + "grad_norm": 3.3531813621520996, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8440966606140137, + "num_tokens": 123940713.0, + "step": 3247 + }, + { + "epoch": 0.41317898486197685, + "ewc_loss": 0.006229817401617765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.229817518033087e-05, + "grad_norm": 3.3337864875793457, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8487762212753296, + "num_tokens": 123981485.0, + "step": 3248 + }, + { + "epoch": 0.4133061951405674, + "ewc_loss": 0.00620501721277833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.205017416505143e-05, + "grad_norm": 3.365081310272217, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8594017028808594, + "num_tokens": 124018159.0, + "step": 3249 + }, + { + "epoch": 0.41343340541915785, + "ewc_loss": 0.006218164227902889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.218164344318211e-05, + "grad_norm": 3.3936612606048584, + "learning_rate": 1e-06, + "loss": 0.3709, + 
"mean_token_accuracy": 0.8695599436759949, + "num_tokens": 124049911.0, + "step": 3250 + }, + { + "epoch": 0.4135606156977484, + "ewc_loss": 0.0062289126217365265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.228912388905883e-05, + "grad_norm": 3.3542542457580566, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8796471357345581, + "num_tokens": 124084260.0, + "step": 3251 + }, + { + "epoch": 0.4136878259763389, + "ewc_loss": 0.006193237844854593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.19323764112778e-05, + "grad_norm": 3.3531739711761475, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8613770008087158, + "num_tokens": 124122741.0, + "step": 3252 + }, + { + "epoch": 0.4138150362549294, + "ewc_loss": 0.006210454739630222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.210454739630222e-05, + "grad_norm": 3.3525776863098145, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.857191801071167, + "num_tokens": 124162793.0, + "step": 3253 + }, + { + "epoch": 0.4139422465335199, + "ewc_loss": 0.006205297540873289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.205297540873289e-05, + "grad_norm": 3.3338301181793213, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8536123037338257, + "num_tokens": 124206889.0, + "step": 3254 + }, + { + "epoch": 0.41406945681211044, + "ewc_loss": 0.006196620408445597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.196620233822614e-05, + "grad_norm": 3.3629140853881836, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8693777918815613, + "num_tokens": 124244551.0, + "step": 3255 + }, + { + "epoch": 0.4141966670907009, + "ewc_loss": 0.006221245042979717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.221244984772056e-05, + "grad_norm": 3.2764089107513428, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.862777829170227, + "num_tokens": 124290485.0, + "step": 3256 + }, + { + "epoch": 
0.41432387736929144, + "ewc_loss": 0.00616802554577589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.168025720398873e-05, + "grad_norm": 3.3836522102355957, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8683176636695862, + "num_tokens": 124323763.0, + "step": 3257 + }, + { + "epoch": 0.41445108764788197, + "ewc_loss": 0.006249859929084778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.249859870877117e-05, + "grad_norm": 3.3481218814849854, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8600555062294006, + "num_tokens": 124363318.0, + "step": 3258 + }, + { + "epoch": 0.41457829792647244, + "ewc_loss": 0.006187630817294121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.18763078819029e-05, + "grad_norm": 3.323903799057007, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8814088702201843, + "num_tokens": 124400845.0, + "step": 3259 + }, + { + "epoch": 0.41470550820506297, + "ewc_loss": 0.006183130666613579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.183130608405918e-05, + "grad_norm": 3.2686071395874023, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8669500946998596, + "num_tokens": 124442546.0, + "step": 3260 + }, + { + "epoch": 0.4148327184836535, + "ewc_loss": 0.006162640638649464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.162640784168616e-05, + "grad_norm": 3.3775618076324463, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8553866147994995, + "num_tokens": 124477737.0, + "step": 3261 + }, + { + "epoch": 0.41495992876224397, + "ewc_loss": 0.006256820634007454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.256820779526606e-05, + "grad_norm": 3.353163957595825, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8476362228393555, + "num_tokens": 124518081.0, + "step": 3262 + }, + { + "epoch": 0.4150871390408345, + "ewc_loss": 0.006215706001967192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.215705798240378e-05, + "grad_norm": 3.357234001159668, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8493673205375671, + "num_tokens": 124556361.0, + "step": 3263 + }, + { + "epoch": 0.415214349319425, + "ewc_loss": 0.006224285811185837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.224285607459024e-05, + "grad_norm": 3.4435553550720215, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.858750581741333, + "num_tokens": 124588490.0, + "step": 3264 + }, + { + "epoch": 0.4153415595980155, + "ewc_loss": 0.0062726750038564205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.272675091167912e-05, + "grad_norm": 3.3641135692596436, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8579496145248413, + "num_tokens": 124624656.0, + "step": 3265 + }, + { + "epoch": 0.415468769876606, + "ewc_loss": 0.006201731041073799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.201730866450816e-05, + "grad_norm": 3.3831350803375244, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8466064929962158, + "num_tokens": 124660681.0, + "step": 3266 + }, + { + "epoch": 0.41559598015519655, + "ewc_loss": 0.0062426128424704075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.24261301709339e-05, + "grad_norm": 3.2758779525756836, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.859931230545044, + "num_tokens": 124703448.0, + "step": 3267 + }, + { + "epoch": 0.415723190433787, + "ewc_loss": 0.006180815398693085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.180815398693085e-05, + "grad_norm": 3.3237383365631104, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8578671216964722, + "num_tokens": 124742078.0, + "step": 3268 + }, + { + "epoch": 0.41585040071237755, + "ewc_loss": 0.006248351652175188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.248351564863697e-05, + "grad_norm": 3.380523204803467, + "learning_rate": 1e-06, + "loss": 0.4704, + 
"mean_token_accuracy": 0.8482083082199097, + "num_tokens": 124779894.0, + "step": 3269 + }, + { + "epoch": 0.4159776109909681, + "ewc_loss": 0.006267276126891375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.267276330618188e-05, + "grad_norm": 3.3452465534210205, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8539425134658813, + "num_tokens": 124817918.0, + "step": 3270 + }, + { + "epoch": 0.41610482126955856, + "ewc_loss": 0.00622909190133214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.229092105058953e-05, + "grad_norm": 3.3302950859069824, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8758797645568848, + "num_tokens": 124854839.0, + "step": 3271 + }, + { + "epoch": 0.4162320315481491, + "ewc_loss": 0.006256197113543749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.256197229959071e-05, + "grad_norm": 3.4692463874816895, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8439313173294067, + "num_tokens": 124886779.0, + "step": 3272 + }, + { + "epoch": 0.4163592418267396, + "ewc_loss": 0.0063350084237754345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.335008220048621e-05, + "grad_norm": 3.3265671730041504, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8673092722892761, + "num_tokens": 124927919.0, + "step": 3273 + }, + { + "epoch": 0.4164864521053301, + "ewc_loss": 0.0062090628780424595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.209062848938629e-05, + "grad_norm": 3.3180835247039795, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8571348190307617, + "num_tokens": 124966475.0, + "step": 3274 + }, + { + "epoch": 0.4166136623839206, + "ewc_loss": 0.0062674423679709435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.267442222451791e-05, + "grad_norm": 3.372971773147583, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8351154923439026, + "num_tokens": 125004110.0, + "step": 3275 + }, + { + "epoch": 
0.41674087266251114, + "ewc_loss": 0.006300958339124918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.300958193605766e-05, + "grad_norm": 3.3074769973754883, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.872765839099884, + "num_tokens": 125043164.0, + "step": 3276 + }, + { + "epoch": 0.4168680829411016, + "ewc_loss": 0.006249239202588797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.249239231692627e-05, + "grad_norm": 3.2992351055145264, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8635133504867554, + "num_tokens": 125084255.0, + "step": 3277 + }, + { + "epoch": 0.41699529321969214, + "ewc_loss": 0.006267394404858351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.267394201131538e-05, + "grad_norm": 3.314950466156006, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.872660219669342, + "num_tokens": 125124971.0, + "step": 3278 + }, + { + "epoch": 0.41712250349828267, + "ewc_loss": 0.006279598921537399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.279598892433569e-05, + "grad_norm": 3.3614394664764404, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.856230616569519, + "num_tokens": 125161470.0, + "step": 3279 + }, + { + "epoch": 0.4172497137768732, + "ewc_loss": 0.006290595978498459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.290595774771646e-05, + "grad_norm": 3.398585796356201, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8440248966217041, + "num_tokens": 125199117.0, + "step": 3280 + }, + { + "epoch": 0.41737692405546367, + "ewc_loss": 0.006303915288299322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.30391514278017e-05, + "grad_norm": 3.3443713188171387, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8403282761573792, + "num_tokens": 125238136.0, + "step": 3281 + }, + { + "epoch": 0.4175041343340542, + "ewc_loss": 0.006272245664149523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.272245809668675e-05, + "grad_norm": 3.3046207427978516, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8566719889640808, + "num_tokens": 125281041.0, + "step": 3282 + }, + { + "epoch": 0.41763134461264473, + "ewc_loss": 0.006272397004067898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.27239714958705e-05, + "grad_norm": 3.3809993267059326, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.862413763999939, + "num_tokens": 125314739.0, + "step": 3283 + }, + { + "epoch": 0.4177585548912352, + "ewc_loss": 0.006320179905742407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.320179818430915e-05, + "grad_norm": 3.3505702018737793, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8536864519119263, + "num_tokens": 125353129.0, + "step": 3284 + }, + { + "epoch": 0.41788576516982573, + "ewc_loss": 0.006278342567384243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.2783423345536e-05, + "grad_norm": 3.3700125217437744, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8564381003379822, + "num_tokens": 125394092.0, + "step": 3285 + }, + { + "epoch": 0.41801297544841626, + "ewc_loss": 0.006300594191998243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.300594395725057e-05, + "grad_norm": 3.3180336952209473, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8529367446899414, + "num_tokens": 125438262.0, + "step": 3286 + }, + { + "epoch": 0.41814018572700673, + "ewc_loss": 0.006261980161070824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.261980161070824e-05, + "grad_norm": 3.3360750675201416, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8581516146659851, + "num_tokens": 125477842.0, + "step": 3287 + }, + { + "epoch": 0.41826739600559726, + "ewc_loss": 0.006275476887822151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.275477062445134e-05, + "grad_norm": 3.297299861907959, + "learning_rate": 1e-06, + "loss": 0.3817, + 
"mean_token_accuracy": 0.8700978755950928, + "num_tokens": 125517236.0, + "step": 3288 + }, + { + "epoch": 0.4183946062841878, + "ewc_loss": 0.006258922629058361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.258922803681344e-05, + "grad_norm": 3.343714714050293, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8527841567993164, + "num_tokens": 125561598.0, + "step": 3289 + }, + { + "epoch": 0.41852181656277826, + "ewc_loss": 0.0062809959053993225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.280995876295492e-05, + "grad_norm": 3.334263801574707, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8572244644165039, + "num_tokens": 125599104.0, + "step": 3290 + }, + { + "epoch": 0.4186490268413688, + "ewc_loss": 0.006267573218792677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.267573189688846e-05, + "grad_norm": 3.3996634483337402, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8587538003921509, + "num_tokens": 125635286.0, + "step": 3291 + }, + { + "epoch": 0.4187762371199593, + "ewc_loss": 0.006298635620623827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.298635707935318e-05, + "grad_norm": 3.358715057373047, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8641948699951172, + "num_tokens": 125669534.0, + "step": 3292 + }, + { + "epoch": 0.4189034473985498, + "ewc_loss": 0.006249952130019665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.249952275538817e-05, + "grad_norm": 3.3415350914001465, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8586806058883667, + "num_tokens": 125707966.0, + "step": 3293 + }, + { + "epoch": 0.4190306576771403, + "ewc_loss": 0.00625893147662282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.258931534830481e-05, + "grad_norm": 3.406092405319214, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.853951096534729, + "num_tokens": 125741448.0, + "step": 3294 + }, + { + "epoch": 
0.41915786795573085, + "ewc_loss": 0.006302323192358017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.302323163254187e-05, + "grad_norm": 3.3568079471588135, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8511302471160889, + "num_tokens": 125776887.0, + "step": 3295 + }, + { + "epoch": 0.4192850782343213, + "ewc_loss": 0.006248227320611477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.248227145988494e-05, + "grad_norm": 3.453035831451416, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8635056614875793, + "num_tokens": 125807899.0, + "step": 3296 + }, + { + "epoch": 0.41941228851291185, + "ewc_loss": 0.0063325753435492516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.332575139822438e-05, + "grad_norm": 3.283968210220337, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8667203187942505, + "num_tokens": 125847711.0, + "step": 3297 + }, + { + "epoch": 0.4195394987915024, + "ewc_loss": 0.006223732605576515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.223732634680346e-05, + "grad_norm": 3.3506317138671875, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.871957540512085, + "num_tokens": 125883248.0, + "step": 3298 + }, + { + "epoch": 0.41966670907009285, + "ewc_loss": 0.006322909612208605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.322909757727757e-05, + "grad_norm": 3.323974132537842, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8475545048713684, + "num_tokens": 125925771.0, + "step": 3299 + }, + { + "epoch": 0.4197939193486834, + "ewc_loss": 0.006280763074755669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.280763045651838e-05, + "grad_norm": 3.3412704467773438, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8480265140533447, + "num_tokens": 125968815.0, + "step": 3300 + }, + { + "epoch": 0.4199211296272739, + "ewc_loss": 0.00629767682403326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.297676736721769e-05, + "grad_norm": 3.3695566654205322, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8544991612434387, + "num_tokens": 126007761.0, + "step": 3301 + }, + { + "epoch": 0.4200483399058644, + "ewc_loss": 0.006316100712865591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.316100916592404e-05, + "grad_norm": 3.363722324371338, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.839827299118042, + "num_tokens": 126047872.0, + "step": 3302 + }, + { + "epoch": 0.4201755501844549, + "ewc_loss": 0.00628495030105114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.284950359258801e-05, + "grad_norm": 3.351328134536743, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8665065169334412, + "num_tokens": 126081273.0, + "step": 3303 + }, + { + "epoch": 0.42030276046304543, + "ewc_loss": 0.006288567092269659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.288567237788811e-05, + "grad_norm": 3.3381075859069824, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8431699275970459, + "num_tokens": 126121590.0, + "step": 3304 + }, + { + "epoch": 0.4204299707416359, + "ewc_loss": 0.006278420332819223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.278420187300071e-05, + "grad_norm": 3.291221857070923, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8421797752380371, + "num_tokens": 126168396.0, + "step": 3305 + }, + { + "epoch": 0.42055718102022643, + "ewc_loss": 0.006262022536247969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.262022361624986e-05, + "grad_norm": 3.4189820289611816, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8539483547210693, + "num_tokens": 126205220.0, + "step": 3306 + }, + { + "epoch": 0.42068439129881696, + "ewc_loss": 0.006354673765599728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.354673678288236e-05, + "grad_norm": 3.3906354904174805, + "learning_rate": 1e-06, + "loss": 0.4288, + 
"mean_token_accuracy": 0.8526120185852051, + "num_tokens": 126239131.0, + "step": 3307 + }, + { + "epoch": 0.42081160157740743, + "ewc_loss": 0.006299314554780722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.299314554780722e-05, + "grad_norm": 3.375210762023926, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8609902858734131, + "num_tokens": 126271931.0, + "step": 3308 + }, + { + "epoch": 0.42093881185599796, + "ewc_loss": 0.0062992386519908905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.299238884821534e-05, + "grad_norm": 3.3531928062438965, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8664730787277222, + "num_tokens": 126307730.0, + "step": 3309 + }, + { + "epoch": 0.4210660221345885, + "ewc_loss": 0.006302522495388985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.302522524492815e-05, + "grad_norm": 3.3707571029663086, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8612587451934814, + "num_tokens": 126344528.0, + "step": 3310 + }, + { + "epoch": 0.42119323241317896, + "ewc_loss": 0.006303051486611366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.303051486611366e-05, + "grad_norm": 3.357008457183838, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8679949045181274, + "num_tokens": 126377260.0, + "step": 3311 + }, + { + "epoch": 0.4213204426917695, + "ewc_loss": 0.00630432553589344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.30432550678961e-05, + "grad_norm": 3.396831512451172, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8549460768699646, + "num_tokens": 126409723.0, + "step": 3312 + }, + { + "epoch": 0.42144765297036, + "ewc_loss": 0.0063324603252112865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.332460179692134e-05, + "grad_norm": 3.323206901550293, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8641276955604553, + "num_tokens": 126450929.0, + "step": 3313 + }, + { + "epoch": 
0.4215748632489505, + "ewc_loss": 0.0062799956649541855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.279995432123542e-05, + "grad_norm": 3.314922571182251, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8682829141616821, + "num_tokens": 126486878.0, + "step": 3314 + }, + { + "epoch": 0.421702073527541, + "ewc_loss": 0.006305874325335026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.30587455816567e-05, + "grad_norm": 3.4328064918518066, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8474938273429871, + "num_tokens": 126522884.0, + "step": 3315 + }, + { + "epoch": 0.42182928380613155, + "ewc_loss": 0.006382480263710022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.382480205502361e-05, + "grad_norm": 3.385915517807007, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8597303628921509, + "num_tokens": 126560848.0, + "step": 3316 + }, + { + "epoch": 0.421956494084722, + "ewc_loss": 0.006304606329649687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.304606358753517e-05, + "grad_norm": 3.3917508125305176, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8603114485740662, + "num_tokens": 126592947.0, + "step": 3317 + }, + { + "epoch": 0.42208370436331255, + "ewc_loss": 0.006331891752779484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.331891927402467e-05, + "grad_norm": 3.332505226135254, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8693770170211792, + "num_tokens": 126630631.0, + "step": 3318 + }, + { + "epoch": 0.4222109146419031, + "ewc_loss": 0.006306137423962355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.306137220235541e-05, + "grad_norm": 3.3415334224700928, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8563823699951172, + "num_tokens": 126673395.0, + "step": 3319 + }, + { + "epoch": 0.42233812492049355, + "ewc_loss": 0.0063218832947313786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.321883120108396e-05, + "grad_norm": 3.3401989936828613, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8745677471160889, + "num_tokens": 126711504.0, + "step": 3320 + }, + { + "epoch": 0.4224653351990841, + "ewc_loss": 0.006323825102299452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.323825073195621e-05, + "grad_norm": 3.3190555572509766, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8713394403457642, + "num_tokens": 126753217.0, + "step": 3321 + }, + { + "epoch": 0.4225925454776746, + "ewc_loss": 0.006291529163718224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.291529280133545e-05, + "grad_norm": 3.31732439994812, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8648136854171753, + "num_tokens": 126792982.0, + "step": 3322 + }, + { + "epoch": 0.4227197557562651, + "ewc_loss": 0.006287772674113512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.287772703217342e-05, + "grad_norm": 3.489565849304199, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8542430996894836, + "num_tokens": 126823771.0, + "step": 3323 + }, + { + "epoch": 0.4228469660348556, + "ewc_loss": 0.00638769194483757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.3876919739414e-05, + "grad_norm": 3.3072726726531982, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8631768226623535, + "num_tokens": 126867137.0, + "step": 3324 + }, + { + "epoch": 0.42297417631344614, + "ewc_loss": 0.006223918870091438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.223918899195269e-05, + "grad_norm": 3.372612237930298, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8639699220657349, + "num_tokens": 126902680.0, + "step": 3325 + }, + { + "epoch": 0.4231013865920366, + "ewc_loss": 0.0063359299674630165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.335930083878338e-05, + "grad_norm": 3.349884510040283, + "learning_rate": 1e-06, + "loss": 0.384, + 
"mean_token_accuracy": 0.8704256415367126, + "num_tokens": 126941037.0, + "step": 3326 + }, + { + "epoch": 0.42322859687062714, + "ewc_loss": 0.006274201441556215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.274201587075368e-05, + "grad_norm": 3.297654867172241, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8633159399032593, + "num_tokens": 126980009.0, + "step": 3327 + }, + { + "epoch": 0.42335580714921767, + "ewc_loss": 0.006248753052204847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.248753197724e-05, + "grad_norm": 3.3445827960968018, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8721613883972168, + "num_tokens": 127020022.0, + "step": 3328 + }, + { + "epoch": 0.42348301742780814, + "ewc_loss": 0.006290934979915619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.290934834396467e-05, + "grad_norm": 3.3845865726470947, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8407901525497437, + "num_tokens": 127060500.0, + "step": 3329 + }, + { + "epoch": 0.42361022770639867, + "ewc_loss": 0.006291738711297512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.291738827712834e-05, + "grad_norm": 3.3924267292022705, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8565003871917725, + "num_tokens": 127095886.0, + "step": 3330 + }, + { + "epoch": 0.4237374379849892, + "ewc_loss": 0.006279472261667252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.279472290771082e-05, + "grad_norm": 3.3468756675720215, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8504488468170166, + "num_tokens": 127134334.0, + "step": 3331 + }, + { + "epoch": 0.4238646482635797, + "ewc_loss": 0.0062657613307237625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.265761476242915e-05, + "grad_norm": 3.3718297481536865, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8520560264587402, + "num_tokens": 127174288.0, + "step": 3332 + }, + { + "epoch": 
0.4239918585421702, + "ewc_loss": 0.006286573596298695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.286573625402525e-05, + "grad_norm": 3.4225211143493652, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8565162420272827, + "num_tokens": 127210311.0, + "step": 3333 + }, + { + "epoch": 0.4241190688207607, + "ewc_loss": 0.006297661457210779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.297661457210779e-05, + "grad_norm": 3.3165602684020996, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8805561661720276, + "num_tokens": 127252813.0, + "step": 3334 + }, + { + "epoch": 0.42424627909935125, + "ewc_loss": 0.006213390734046698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.213390588527545e-05, + "grad_norm": 3.4342544078826904, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8478252291679382, + "num_tokens": 127285847.0, + "step": 3335 + }, + { + "epoch": 0.4243734893779417, + "ewc_loss": 0.006324455142021179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.32445517112501e-05, + "grad_norm": 3.3077213764190674, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8726049661636353, + "num_tokens": 127325360.0, + "step": 3336 + }, + { + "epoch": 0.42450069965653225, + "ewc_loss": 0.006214313209056854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.214313179953024e-05, + "grad_norm": 3.3706812858581543, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8625991344451904, + "num_tokens": 127366446.0, + "step": 3337 + }, + { + "epoch": 0.4246279099351228, + "ewc_loss": 0.006289293989539146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.289294105954468e-05, + "grad_norm": 3.3752660751342773, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8690291047096252, + "num_tokens": 127402608.0, + "step": 3338 + }, + { + "epoch": 0.42475512021371326, + "ewc_loss": 0.006265435367822647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.2654355133418e-05, + "grad_norm": 3.302570343017578, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8651701211929321, + "num_tokens": 127445355.0, + "step": 3339 + }, + { + "epoch": 0.4248823304923038, + "ewc_loss": 0.0062166121788322926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.216612382559106e-05, + "grad_norm": 3.4052510261535645, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8544652462005615, + "num_tokens": 127480523.0, + "step": 3340 + }, + { + "epoch": 0.4250095407708943, + "ewc_loss": 0.006311394274234772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.311394099611789e-05, + "grad_norm": 3.3330631256103516, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8789678812026978, + "num_tokens": 127520787.0, + "step": 3341 + }, + { + "epoch": 0.4251367510494848, + "ewc_loss": 0.006231476552784443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.231476436369121e-05, + "grad_norm": 3.311276435852051, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8560881018638611, + "num_tokens": 127565576.0, + "step": 3342 + }, + { + "epoch": 0.4252639613280753, + "ewc_loss": 0.006237028166651726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.237027992028743e-05, + "grad_norm": 3.3726837635040283, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8710507154464722, + "num_tokens": 127599144.0, + "step": 3343 + }, + { + "epoch": 0.42539117160666584, + "ewc_loss": 0.0062855747528374195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.285574636422098e-05, + "grad_norm": 3.312358856201172, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8782180547714233, + "num_tokens": 127637361.0, + "step": 3344 + }, + { + "epoch": 0.4255183818852563, + "ewc_loss": 0.006222554482519627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.22255465714261e-05, + "grad_norm": 3.3638973236083984, + "learning_rate": 1e-06, + "loss": 0.4512, + 
"mean_token_accuracy": 0.8502639532089233, + "num_tokens": 127675139.0, + "step": 3345 + }, + { + "epoch": 0.42564559216384684, + "ewc_loss": 0.006270614452660084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.270614539971575e-05, + "grad_norm": 3.2866997718811035, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8664631843566895, + "num_tokens": 127718822.0, + "step": 3346 + }, + { + "epoch": 0.42577280244243737, + "ewc_loss": 0.006211514119058847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.211514119058847e-05, + "grad_norm": 3.2835488319396973, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8644067645072937, + "num_tokens": 127763332.0, + "step": 3347 + }, + { + "epoch": 0.42590001272102784, + "ewc_loss": 0.0062210094183683395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.221009243745357e-05, + "grad_norm": 3.321319818496704, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8715446591377258, + "num_tokens": 127800346.0, + "step": 3348 + }, + { + "epoch": 0.42602722299961837, + "ewc_loss": 0.006249584723263979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.249584839679301e-05, + "grad_norm": 3.314885377883911, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8575385808944702, + "num_tokens": 127845142.0, + "step": 3349 + }, + { + "epoch": 0.4261544332782089, + "ewc_loss": 0.006216614507138729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.21661456534639e-05, + "grad_norm": 3.4330978393554688, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8437292575836182, + "num_tokens": 127881131.0, + "step": 3350 + }, + { + "epoch": 0.4262816435567994, + "ewc_loss": 0.006282004993408918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.282005051616579e-05, + "grad_norm": 3.332180976867676, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.874793529510498, + "num_tokens": 127917941.0, + "step": 3351 + }, + { + "epoch": 
0.4264088538353899, + "ewc_loss": 0.006184970494359732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.184970698086545e-05, + "grad_norm": 3.3263180255889893, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.847536563873291, + "num_tokens": 127963345.0, + "step": 3352 + }, + { + "epoch": 0.42653606411398043, + "ewc_loss": 0.006221604999154806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.221605144673958e-05, + "grad_norm": 3.3064208030700684, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8689114451408386, + "num_tokens": 128004839.0, + "step": 3353 + }, + { + "epoch": 0.4266632743925709, + "ewc_loss": 0.00621014041826129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.21014041826129e-05, + "grad_norm": 3.3479840755462646, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8550209999084473, + "num_tokens": 128042709.0, + "step": 3354 + }, + { + "epoch": 0.42679048467116143, + "ewc_loss": 0.0062217083759605885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.22170846327208e-05, + "grad_norm": 3.3727331161499023, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.845679759979248, + "num_tokens": 128085069.0, + "step": 3355 + }, + { + "epoch": 0.42691769494975196, + "ewc_loss": 0.006230559665709734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.230559665709734e-05, + "grad_norm": 3.36848521232605, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8549509048461914, + "num_tokens": 128124300.0, + "step": 3356 + }, + { + "epoch": 0.42704490522834243, + "ewc_loss": 0.006225257646292448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.225257675396279e-05, + "grad_norm": 3.3496363162994385, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8578574657440186, + "num_tokens": 128164410.0, + "step": 3357 + }, + { + "epoch": 0.42717211550693296, + "ewc_loss": 0.0062087359838187695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.208736158441752e-05, + "grad_norm": 3.2971925735473633, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8699462413787842, + "num_tokens": 128206524.0, + "step": 3358 + }, + { + "epoch": 0.4272993257855235, + "ewc_loss": 0.006196204107254744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.196204049047083e-05, + "grad_norm": 3.3525354862213135, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8507465124130249, + "num_tokens": 128246387.0, + "step": 3359 + }, + { + "epoch": 0.42742653606411396, + "ewc_loss": 0.006235100794583559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.235100590856746e-05, + "grad_norm": 3.3137009143829346, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8696544170379639, + "num_tokens": 128287794.0, + "step": 3360 + }, + { + "epoch": 0.4275537463427045, + "ewc_loss": 0.006199859548360109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.199859490152448e-05, + "grad_norm": 3.3348445892333984, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8572138547897339, + "num_tokens": 128328467.0, + "step": 3361 + }, + { + "epoch": 0.427680956621295, + "ewc_loss": 0.006228478159755468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.228478014236316e-05, + "grad_norm": 3.352491855621338, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8697957992553711, + "num_tokens": 128364319.0, + "step": 3362 + }, + { + "epoch": 0.4278081668998855, + "ewc_loss": 0.006226597353816032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.22659717919305e-05, + "grad_norm": 3.320183753967285, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8549762964248657, + "num_tokens": 128403892.0, + "step": 3363 + }, + { + "epoch": 0.427935377178476, + "ewc_loss": 0.006215197034180164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.215197208803147e-05, + "grad_norm": 3.368751287460327, + "learning_rate": 1e-06, + "loss": 0.482, + 
"mean_token_accuracy": 0.8400655388832092, + "num_tokens": 128446760.0, + "step": 3364 + }, + { + "epoch": 0.42806258745706655, + "ewc_loss": 0.006250962149351835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.250962178455666e-05, + "grad_norm": 3.39624285697937, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8363775014877319, + "num_tokens": 128485498.0, + "step": 3365 + }, + { + "epoch": 0.428189797735657, + "ewc_loss": 0.006255049724131823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.255049811443314e-05, + "grad_norm": 3.3478736877441406, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8488665819168091, + "num_tokens": 128527871.0, + "step": 3366 + }, + { + "epoch": 0.42831700801424755, + "ewc_loss": 0.006207452155649662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.207451951922849e-05, + "grad_norm": 3.345261812210083, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8468790054321289, + "num_tokens": 128567792.0, + "step": 3367 + }, + { + "epoch": 0.4284442182928381, + "ewc_loss": 0.006236945278942585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.236945046111941e-05, + "grad_norm": 3.3327274322509766, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8488584160804749, + "num_tokens": 128607217.0, + "step": 3368 + }, + { + "epoch": 0.42857142857142855, + "ewc_loss": 0.006232911255210638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.232911255210638e-05, + "grad_norm": 3.3848443031311035, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.866493821144104, + "num_tokens": 128642787.0, + "step": 3369 + }, + { + "epoch": 0.4286986388500191, + "ewc_loss": 0.006258961278945208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.258961366256699e-05, + "grad_norm": 3.322432041168213, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8714342713356018, + "num_tokens": 128680462.0, + "step": 3370 + }, + { + "epoch": 
0.4288258491286096, + "ewc_loss": 0.00622490094974637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.224901153473184e-05, + "grad_norm": 3.441587448120117, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.856306791305542, + "num_tokens": 128715101.0, + "step": 3371 + }, + { + "epoch": 0.4289530594072001, + "ewc_loss": 0.006316340994089842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.316341023193672e-05, + "grad_norm": 3.432208299636841, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8516370058059692, + "num_tokens": 128749601.0, + "step": 3372 + }, + { + "epoch": 0.4290802696857906, + "ewc_loss": 0.006268517579883337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.268517608987167e-05, + "grad_norm": 3.350128650665283, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8559440970420837, + "num_tokens": 128788744.0, + "step": 3373 + }, + { + "epoch": 0.42920747996438113, + "ewc_loss": 0.0062324972823262215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.232497253222391e-05, + "grad_norm": 3.3399975299835205, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8566427230834961, + "num_tokens": 128828102.0, + "step": 3374 + }, + { + "epoch": 0.4293346902429716, + "ewc_loss": 0.006264481693506241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.26448163529858e-05, + "grad_norm": 3.387143850326538, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8558834791183472, + "num_tokens": 128863610.0, + "step": 3375 + }, + { + "epoch": 0.42946190052156213, + "ewc_loss": 0.006301672663539648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.301672692643479e-05, + "grad_norm": 3.358504295349121, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8736487627029419, + "num_tokens": 128898836.0, + "step": 3376 + }, + { + "epoch": 0.42958911080015266, + "ewc_loss": 0.006281127687543631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.281127571128309e-05, + "grad_norm": 3.3618698120117188, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8610175848007202, + "num_tokens": 128934821.0, + "step": 3377 + }, + { + "epoch": 0.42971632107874314, + "ewc_loss": 0.006293111015111208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.293111073318869e-05, + "grad_norm": 3.4004805088043213, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8591588735580444, + "num_tokens": 128966914.0, + "step": 3378 + }, + { + "epoch": 0.42984353135733366, + "ewc_loss": 0.006337396334856749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.337396189337596e-05, + "grad_norm": 3.3194539546966553, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.865264892578125, + "num_tokens": 129008305.0, + "step": 3379 + }, + { + "epoch": 0.4299707416359242, + "ewc_loss": 0.006288706324994564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.288706208579242e-05, + "grad_norm": 3.362049102783203, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8789978623390198, + "num_tokens": 129043331.0, + "step": 3380 + }, + { + "epoch": 0.4300979519145147, + "ewc_loss": 0.006337836384773254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.337836384773254e-05, + "grad_norm": 3.2956690788269043, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8648797869682312, + "num_tokens": 129083513.0, + "step": 3381 + }, + { + "epoch": 0.4302251621931052, + "ewc_loss": 0.006294936407357454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.294936611084267e-05, + "grad_norm": 3.4915056228637695, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.858303427696228, + "num_tokens": 129113463.0, + "step": 3382 + }, + { + "epoch": 0.4303523724716957, + "ewc_loss": 0.00643360847607255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.433608359657228e-05, + "grad_norm": 3.3399269580841064, + "learning_rate": 1e-06, + "loss": 0.4819, + 
"mean_token_accuracy": 0.8361886739730835, + "num_tokens": 129153080.0, + "step": 3383 + }, + { + "epoch": 0.43047958275028625, + "ewc_loss": 0.006291581783443689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.291581667028368e-05, + "grad_norm": 3.3596606254577637, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8562938570976257, + "num_tokens": 129193338.0, + "step": 3384 + }, + { + "epoch": 0.4306067930288767, + "ewc_loss": 0.0063450220040977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.345022120513022e-05, + "grad_norm": 3.3266870975494385, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8496919870376587, + "num_tokens": 129234816.0, + "step": 3385 + }, + { + "epoch": 0.43073400330746725, + "ewc_loss": 0.006320231594145298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.320231477729976e-05, + "grad_norm": 3.316298484802246, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8620858192443848, + "num_tokens": 129275228.0, + "step": 3386 + }, + { + "epoch": 0.4308612135860578, + "ewc_loss": 0.006317703053355217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.317703082459047e-05, + "grad_norm": 3.4239768981933594, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8421633243560791, + "num_tokens": 129309747.0, + "step": 3387 + }, + { + "epoch": 0.43098842386464825, + "ewc_loss": 0.006374774966388941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.374774966388941e-05, + "grad_norm": 3.3871371746063232, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8633148074150085, + "num_tokens": 129342580.0, + "step": 3388 + }, + { + "epoch": 0.4311156341432388, + "ewc_loss": 0.006334642414003611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.334642239380628e-05, + "grad_norm": 3.4091298580169678, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8553590774536133, + "num_tokens": 129376599.0, + "step": 3389 + }, + { + "epoch": 
0.4312428444218293, + "ewc_loss": 0.006360291503369808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.360291445162147e-05, + "grad_norm": 3.3493266105651855, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8572194576263428, + "num_tokens": 129414783.0, + "step": 3390 + }, + { + "epoch": 0.4313700547004198, + "ewc_loss": 0.006338796578347683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.338796811178327e-05, + "grad_norm": 3.3790183067321777, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.858608067035675, + "num_tokens": 129450287.0, + "step": 3391 + }, + { + "epoch": 0.4314972649790103, + "ewc_loss": 0.0063608568161726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.36085678706877e-05, + "grad_norm": 3.438124895095825, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8406128883361816, + "num_tokens": 129486733.0, + "step": 3392 + }, + { + "epoch": 0.43162447525760084, + "ewc_loss": 0.006392157170921564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.392157229129225e-05, + "grad_norm": 3.382042646408081, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8592109084129333, + "num_tokens": 129524392.0, + "step": 3393 + }, + { + "epoch": 0.4317516855361913, + "ewc_loss": 0.006358557380735874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.358557584462687e-05, + "grad_norm": 3.397289991378784, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8469287157058716, + "num_tokens": 129558452.0, + "step": 3394 + }, + { + "epoch": 0.43187889581478184, + "ewc_loss": 0.00637997779995203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.379978003678843e-05, + "grad_norm": 3.3205809593200684, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8570712804794312, + "num_tokens": 129597076.0, + "step": 3395 + }, + { + "epoch": 0.43200610609337237, + "ewc_loss": 0.0063494350761175156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.349434988806024e-05, + "grad_norm": 3.392272472381592, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8528611063957214, + "num_tokens": 129635861.0, + "step": 3396 + }, + { + "epoch": 0.43213331637196284, + "ewc_loss": 0.006419587414711714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.419587589334697e-05, + "grad_norm": 3.3568954467773438, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8550803661346436, + "num_tokens": 129673760.0, + "step": 3397 + }, + { + "epoch": 0.43226052665055337, + "ewc_loss": 0.00637844717130065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.378447142196819e-05, + "grad_norm": 3.340940475463867, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8612685203552246, + "num_tokens": 129710730.0, + "step": 3398 + }, + { + "epoch": 0.4323877369291439, + "ewc_loss": 0.006391740869730711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.391741044353694e-05, + "grad_norm": 3.387636423110962, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8572896122932434, + "num_tokens": 129748724.0, + "step": 3399 + }, + { + "epoch": 0.43251494720773437, + "ewc_loss": 0.006417460273951292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.417460099328309e-05, + "grad_norm": 3.3331496715545654, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8723520040512085, + "num_tokens": 129787478.0, + "step": 3400 + }, + { + "epoch": 0.4326421574863249, + "ewc_loss": 0.006370644550770521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.370644405251369e-05, + "grad_norm": 3.3543834686279297, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8699944615364075, + "num_tokens": 129829852.0, + "step": 3401 + }, + { + "epoch": 0.4327693677649154, + "ewc_loss": 0.006391493137925863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.39149293419905e-05, + "grad_norm": 3.334578037261963, + "learning_rate": 1e-06, + "loss": 0.4303, + 
"mean_token_accuracy": 0.8579416871070862, + "num_tokens": 129869279.0, + "step": 3402 + }, + { + "epoch": 0.4328965780435059, + "ewc_loss": 0.0063821012154221535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.382101128110662e-05, + "grad_norm": 3.4038825035095215, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8533790111541748, + "num_tokens": 129903347.0, + "step": 3403 + }, + { + "epoch": 0.4330237883220964, + "ewc_loss": 0.006423210259526968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.423210288630798e-05, + "grad_norm": 3.3706259727478027, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8694575428962708, + "num_tokens": 129940028.0, + "step": 3404 + }, + { + "epoch": 0.43315099860068695, + "ewc_loss": 0.006372631993144751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.372632196871564e-05, + "grad_norm": 3.357006311416626, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.868105411529541, + "num_tokens": 129976438.0, + "step": 3405 + }, + { + "epoch": 0.4332782088792774, + "ewc_loss": 0.006381536368280649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.381536513799801e-05, + "grad_norm": 3.3494873046875, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8628950119018555, + "num_tokens": 130015698.0, + "step": 3406 + }, + { + "epoch": 0.43340541915786795, + "ewc_loss": 0.006373908370733261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.373908399837092e-05, + "grad_norm": 3.364262580871582, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8748353719711304, + "num_tokens": 130056000.0, + "step": 3407 + }, + { + "epoch": 0.4335326294364585, + "ewc_loss": 0.006375123281031847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.37512348475866e-05, + "grad_norm": 3.3830316066741943, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8567677140235901, + "num_tokens": 130096990.0, + "step": 3408 + }, + { + "epoch": 
0.43365983971504896, + "ewc_loss": 0.006364674307405949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.364674482028931e-05, + "grad_norm": 3.3861582279205322, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8691210746765137, + "num_tokens": 130128556.0, + "step": 3409 + }, + { + "epoch": 0.4337870499936395, + "ewc_loss": 0.006358349695801735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.358349492074922e-05, + "grad_norm": 3.292170524597168, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8652629852294922, + "num_tokens": 130170607.0, + "step": 3410 + }, + { + "epoch": 0.43391426027223, + "ewc_loss": 0.006303866393864155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.303866393864155e-05, + "grad_norm": 3.377195358276367, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8559844493865967, + "num_tokens": 130212511.0, + "step": 3411 + }, + { + "epoch": 0.4340414705508205, + "ewc_loss": 0.006371544674038887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.371544441208243e-05, + "grad_norm": 3.4080190658569336, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8524259328842163, + "num_tokens": 130249246.0, + "step": 3412 + }, + { + "epoch": 0.434168680829411, + "ewc_loss": 0.006358920596539974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.358920654747635e-05, + "grad_norm": 3.2951011657714844, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8784090876579285, + "num_tokens": 130291250.0, + "step": 3413 + }, + { + "epoch": 0.43429589110800154, + "ewc_loss": 0.0062713646329939365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.271364691201597e-05, + "grad_norm": 3.4799094200134277, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8397555351257324, + "num_tokens": 130327094.0, + "step": 3414 + }, + { + "epoch": 0.434423101386592, + "ewc_loss": 0.006418346427381039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.418346310965717e-05, + "grad_norm": 3.3512048721313477, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8519607782363892, + "num_tokens": 130366482.0, + "step": 3415 + }, + { + "epoch": 0.43455031166518254, + "ewc_loss": 0.006282744463533163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.28274428891018e-05, + "grad_norm": 3.3378419876098633, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8587851524353027, + "num_tokens": 130407416.0, + "step": 3416 + }, + { + "epoch": 0.43467752194377307, + "ewc_loss": 0.00629717530682683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.297175423242152e-05, + "grad_norm": 3.3050520420074463, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8662624955177307, + "num_tokens": 130447125.0, + "step": 3417 + }, + { + "epoch": 0.43480473222236354, + "ewc_loss": 0.00629700580611825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.297005893429741e-05, + "grad_norm": 3.396484851837158, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8658164739608765, + "num_tokens": 130480955.0, + "step": 3418 + }, + { + "epoch": 0.43493194250095407, + "ewc_loss": 0.00635747192427516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.357472011586651e-05, + "grad_norm": 3.3814525604248047, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8720183372497559, + "num_tokens": 130511771.0, + "step": 3419 + }, + { + "epoch": 0.4350591527795446, + "ewc_loss": 0.006324686575680971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.324686546577141e-05, + "grad_norm": 3.3407540321350098, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.859065055847168, + "num_tokens": 130550080.0, + "step": 3420 + }, + { + "epoch": 0.4351863630581351, + "ewc_loss": 0.006310008000582457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.310008029686287e-05, + "grad_norm": 3.3741915225982666, + "learning_rate": 1e-06, + "loss": 0.458, + 
"mean_token_accuracy": 0.8492693305015564, + "num_tokens": 130588109.0, + "step": 3421 + }, + { + "epoch": 0.4353135733367256, + "ewc_loss": 0.006347414571791887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.347414455376565e-05, + "grad_norm": 3.397068500518799, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8469573855400085, + "num_tokens": 130626832.0, + "step": 3422 + }, + { + "epoch": 0.43544078361531613, + "ewc_loss": 0.006345183122903109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.345182919176295e-05, + "grad_norm": 3.348358631134033, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.864616870880127, + "num_tokens": 130662202.0, + "step": 3423 + }, + { + "epoch": 0.4355679938939066, + "ewc_loss": 0.006326943635940552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.32694354862906e-05, + "grad_norm": 3.368098020553589, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8561326265335083, + "num_tokens": 130700747.0, + "step": 3424 + }, + { + "epoch": 0.43569520417249713, + "ewc_loss": 0.006342997774481773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.342997949104756e-05, + "grad_norm": 3.326780319213867, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8578299283981323, + "num_tokens": 130742849.0, + "step": 3425 + }, + { + "epoch": 0.43582241445108766, + "ewc_loss": 0.006311310455203056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.311310426099226e-05, + "grad_norm": 3.3197741508483887, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8731464743614197, + "num_tokens": 130780156.0, + "step": 3426 + }, + { + "epoch": 0.43594962472967813, + "ewc_loss": 0.006329129915684462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.329129973892123e-05, + "grad_norm": 3.3393328189849854, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8594198822975159, + "num_tokens": 130825824.0, + "step": 3427 + }, + { + "epoch": 
0.43607683500826866, + "ewc_loss": 0.00632890360429883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.328903691610321e-05, + "grad_norm": 3.355618715286255, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8684930801391602, + "num_tokens": 130863603.0, + "step": 3428 + }, + { + "epoch": 0.4362040452868592, + "ewc_loss": 0.006337250117212534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.337249942589551e-05, + "grad_norm": 3.3214213848114014, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8608956336975098, + "num_tokens": 130906689.0, + "step": 3429 + }, + { + "epoch": 0.4363312555654497, + "ewc_loss": 0.006303554866462946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.303554982878268e-05, + "grad_norm": 3.3310348987579346, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.857532262802124, + "num_tokens": 130947785.0, + "step": 3430 + }, + { + "epoch": 0.4364584658440402, + "ewc_loss": 0.006323117297142744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.323117122519761e-05, + "grad_norm": 3.349992275238037, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.869659423828125, + "num_tokens": 130985489.0, + "step": 3431 + }, + { + "epoch": 0.4365856761226307, + "ewc_loss": 0.006326432805508375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.326432776404545e-05, + "grad_norm": 3.3745856285095215, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8693695664405823, + "num_tokens": 131025274.0, + "step": 3432 + }, + { + "epoch": 0.43671288640122125, + "ewc_loss": 0.006310771219432354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.310771277640015e-05, + "grad_norm": 3.362635612487793, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8601652979850769, + "num_tokens": 131060132.0, + "step": 3433 + }, + { + "epoch": 0.4368400966798117, + "ewc_loss": 0.006313578225672245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.313578342087567e-05, + "grad_norm": 3.3249025344848633, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8483489751815796, + "num_tokens": 131105109.0, + "step": 3434 + }, + { + "epoch": 0.43696730695840225, + "ewc_loss": 0.006285952404141426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.285952258622274e-05, + "grad_norm": 3.369696617126465, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8749979734420776, + "num_tokens": 131142094.0, + "step": 3435 + }, + { + "epoch": 0.4370945172369928, + "ewc_loss": 0.0063146851025521755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.314685015240684e-05, + "grad_norm": 3.436326742172241, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8586220145225525, + "num_tokens": 131174648.0, + "step": 3436 + }, + { + "epoch": 0.43722172751558325, + "ewc_loss": 0.006333694327622652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.3336941821035e-05, + "grad_norm": 3.3216512203216553, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8741531372070312, + "num_tokens": 131213447.0, + "step": 3437 + }, + { + "epoch": 0.4373489377941738, + "ewc_loss": 0.006245080381631851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.24508029432036e-05, + "grad_norm": 3.395869255065918, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8572651147842407, + "num_tokens": 131249622.0, + "step": 3438 + }, + { + "epoch": 0.4374761480727643, + "ewc_loss": 0.00633960822597146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.339608080452308e-05, + "grad_norm": 3.323002338409424, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8752500414848328, + "num_tokens": 131285883.0, + "step": 3439 + }, + { + "epoch": 0.4376033583513548, + "ewc_loss": 0.006249021738767624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.249021680559963e-05, + "grad_norm": 3.371661424636841, + "learning_rate": 1e-06, + "loss": 0.4148, + 
"mean_token_accuracy": 0.8637139201164246, + "num_tokens": 131322774.0, + "step": 3440 + }, + { + "epoch": 0.4377305686299453, + "ewc_loss": 0.006310494150966406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.310494063654914e-05, + "grad_norm": 3.322553873062134, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8728642463684082, + "num_tokens": 131364513.0, + "step": 3441 + }, + { + "epoch": 0.43785777890853583, + "ewc_loss": 0.006258436944335699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.258436769712716e-05, + "grad_norm": 3.382277727127075, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8508961200714111, + "num_tokens": 131402299.0, + "step": 3442 + }, + { + "epoch": 0.4379849891871263, + "ewc_loss": 0.006313201505690813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.313201447483152e-05, + "grad_norm": 3.2926251888275146, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8678311109542847, + "num_tokens": 131447830.0, + "step": 3443 + }, + { + "epoch": 0.43811219946571683, + "ewc_loss": 0.006250720005482435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.250719889067113e-05, + "grad_norm": 3.4922587871551514, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8536463975906372, + "num_tokens": 131481058.0, + "step": 3444 + }, + { + "epoch": 0.43823940974430736, + "ewc_loss": 0.006381030194461346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.381030107149854e-05, + "grad_norm": 3.399845838546753, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.856980562210083, + "num_tokens": 131518376.0, + "step": 3445 + }, + { + "epoch": 0.43836662002289783, + "ewc_loss": 0.0062767015770077705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.276701606111601e-05, + "grad_norm": 3.426037549972534, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8460512161254883, + "num_tokens": 131553702.0, + "step": 3446 + }, + { + "epoch": 
0.43849383030148836, + "ewc_loss": 0.006319419480860233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.319419480860233e-05, + "grad_norm": 3.4096505641937256, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8683922290802002, + "num_tokens": 131587026.0, + "step": 3447 + }, + { + "epoch": 0.4386210405800789, + "ewc_loss": 0.006310689263045788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.310689059318975e-05, + "grad_norm": 3.362494707107544, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8670142889022827, + "num_tokens": 131621145.0, + "step": 3448 + }, + { + "epoch": 0.43874825085866936, + "ewc_loss": 0.00628562830388546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.285628478508443e-05, + "grad_norm": 3.28883695602417, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8658990263938904, + "num_tokens": 131661618.0, + "step": 3449 + }, + { + "epoch": 0.4388754611372599, + "ewc_loss": 0.0062688253819942474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.268825381994247e-05, + "grad_norm": 3.437873125076294, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8552674651145935, + "num_tokens": 131693840.0, + "step": 3450 + }, + { + "epoch": 0.4390026714158504, + "ewc_loss": 0.006388562731444836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.388562906067818e-05, + "grad_norm": 3.357722520828247, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8638282418251038, + "num_tokens": 131730856.0, + "step": 3451 + }, + { + "epoch": 0.4391298816944409, + "ewc_loss": 0.006290346384048462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.290346209425479e-05, + "grad_norm": 3.383978843688965, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8368492722511292, + "num_tokens": 131769172.0, + "step": 3452 + }, + { + "epoch": 0.4392570919730314, + "ewc_loss": 0.006329348776489496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.32934898021631e-05, + "grad_norm": 3.3051724433898926, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8635736107826233, + "num_tokens": 131810481.0, + "step": 3453 + }, + { + "epoch": 0.43938430225162195, + "ewc_loss": 0.006294637452811003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.294637569226325e-05, + "grad_norm": 3.364919424057007, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8636130094528198, + "num_tokens": 131850742.0, + "step": 3454 + }, + { + "epoch": 0.4395115125302124, + "ewc_loss": 0.00635575270280242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.35575270280242e-05, + "grad_norm": 3.423307418823242, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8582563400268555, + "num_tokens": 131886675.0, + "step": 3455 + }, + { + "epoch": 0.43963872280880295, + "ewc_loss": 0.006366935092955828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.366935122059658e-05, + "grad_norm": 3.3233704566955566, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.86688631772995, + "num_tokens": 131926831.0, + "step": 3456 + }, + { + "epoch": 0.4397659330873935, + "ewc_loss": 0.006296815350651741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.29681526334025e-05, + "grad_norm": 3.411064386367798, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8595138788223267, + "num_tokens": 131962265.0, + "step": 3457 + }, + { + "epoch": 0.43989314336598395, + "ewc_loss": 0.006387874484062195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.387874600477517e-05, + "grad_norm": 3.3652169704437256, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8562252521514893, + "num_tokens": 131999906.0, + "step": 3458 + }, + { + "epoch": 0.4400203536445745, + "ewc_loss": 0.006329199764877558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.329199823085219e-05, + "grad_norm": 3.44315242767334, + "learning_rate": 1e-06, + "loss": 0.369, + 
"mean_token_accuracy": 0.8776431083679199, + "num_tokens": 132031330.0, + "step": 3459 + }, + { + "epoch": 0.440147563923165, + "ewc_loss": 0.006396064534783363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.396064418368042e-05, + "grad_norm": 3.4284727573394775, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8598876595497131, + "num_tokens": 132065049.0, + "step": 3460 + }, + { + "epoch": 0.4402747742017555, + "ewc_loss": 0.006356263533234596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.356263475026935e-05, + "grad_norm": 3.357835054397583, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8657090067863464, + "num_tokens": 132100993.0, + "step": 3461 + }, + { + "epoch": 0.440401984480346, + "ewc_loss": 0.0063269296661019325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.326929724309593e-05, + "grad_norm": 3.3558902740478516, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8594503998756409, + "num_tokens": 132138169.0, + "step": 3462 + }, + { + "epoch": 0.44052919475893654, + "ewc_loss": 0.0063675157725811005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.36751574347727e-05, + "grad_norm": 3.403601884841919, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8676925301551819, + "num_tokens": 132170498.0, + "step": 3463 + }, + { + "epoch": 0.440656405037527, + "ewc_loss": 0.006398103199899197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.398103141691536e-05, + "grad_norm": 3.3463873863220215, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8417600393295288, + "num_tokens": 132213583.0, + "step": 3464 + }, + { + "epoch": 0.44078361531611754, + "ewc_loss": 0.006354887504130602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.354887591442093e-05, + "grad_norm": 3.3315911293029785, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8555167317390442, + "num_tokens": 132255427.0, + "step": 3465 + }, + { + "epoch": 
0.44091082559470807, + "ewc_loss": 0.006380384787917137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.380384729709476e-05, + "grad_norm": 3.372551202774048, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8585335612297058, + "num_tokens": 132295795.0, + "step": 3466 + }, + { + "epoch": 0.44103803587329854, + "ewc_loss": 0.006382180377840996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.382180436048657e-05, + "grad_norm": 3.3618369102478027, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8432962894439697, + "num_tokens": 132336881.0, + "step": 3467 + }, + { + "epoch": 0.44116524615188907, + "ewc_loss": 0.006378880701959133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.378880789270625e-05, + "grad_norm": 3.3838605880737305, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8602941036224365, + "num_tokens": 132375136.0, + "step": 3468 + }, + { + "epoch": 0.4412924564304796, + "ewc_loss": 0.006386016961187124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.386017048498616e-05, + "grad_norm": 3.394270658493042, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8569243550300598, + "num_tokens": 132412893.0, + "step": 3469 + }, + { + "epoch": 0.44141966670907007, + "ewc_loss": 0.006371486932039261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.371486961143091e-05, + "grad_norm": 3.4615321159362793, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8563693761825562, + "num_tokens": 132444360.0, + "step": 3470 + }, + { + "epoch": 0.4415468769876606, + "ewc_loss": 0.006417484022676945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.417484109988436e-05, + "grad_norm": 3.399984836578369, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8354579210281372, + "num_tokens": 132485022.0, + "step": 3471 + }, + { + "epoch": 0.4416740872662511, + "ewc_loss": 0.0063641257584095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.364125874824822e-05, + "grad_norm": 3.3750550746917725, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8458045125007629, + "num_tokens": 132522800.0, + "step": 3472 + }, + { + "epoch": 0.4418012975448416, + "ewc_loss": 0.006375971250236034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.375971133820713e-05, + "grad_norm": 3.3651936054229736, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8620350360870361, + "num_tokens": 132560527.0, + "step": 3473 + }, + { + "epoch": 0.4419285078234321, + "ewc_loss": 0.006380767561495304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.380767445079982e-05, + "grad_norm": 3.314100980758667, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8579282760620117, + "num_tokens": 132601075.0, + "step": 3474 + }, + { + "epoch": 0.44205571810202265, + "ewc_loss": 0.006359965540468693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.359965482261032e-05, + "grad_norm": 3.3913354873657227, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8505409955978394, + "num_tokens": 132639851.0, + "step": 3475 + }, + { + "epoch": 0.4421829283806131, + "ewc_loss": 0.006423134822398424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.423134618671611e-05, + "grad_norm": 3.3305776119232178, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8619712591171265, + "num_tokens": 132679849.0, + "step": 3476 + }, + { + "epoch": 0.44231013865920366, + "ewc_loss": 0.006371957715600729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.371957715600729e-05, + "grad_norm": 3.3417978286743164, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8649272918701172, + "num_tokens": 132721020.0, + "step": 3477 + }, + { + "epoch": 0.4424373489377942, + "ewc_loss": 0.006396767217665911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.396767275873572e-05, + "grad_norm": 3.407534122467041, + "learning_rate": 1e-06, + "loss": 0.4522, + 
"mean_token_accuracy": 0.8459484577178955, + "num_tokens": 132760933.0, + "step": 3478 + }, + { + "epoch": 0.44256455921638466, + "ewc_loss": 0.0064215282909572124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.421528087230399e-05, + "grad_norm": 3.3151557445526123, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.871492862701416, + "num_tokens": 132800627.0, + "step": 3479 + }, + { + "epoch": 0.4426917694949752, + "ewc_loss": 0.0063437195494771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.343719724100083e-05, + "grad_norm": 3.407662868499756, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.860339343547821, + "num_tokens": 132833232.0, + "step": 3480 + }, + { + "epoch": 0.4428189797735657, + "ewc_loss": 0.00643132021650672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.431320070987567e-05, + "grad_norm": 3.344825029373169, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8584815263748169, + "num_tokens": 132874670.0, + "step": 3481 + }, + { + "epoch": 0.44294619005215624, + "ewc_loss": 0.006365744862705469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.365744775393978e-05, + "grad_norm": 3.3559422492980957, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8628950119018555, + "num_tokens": 132916187.0, + "step": 3482 + }, + { + "epoch": 0.4430734003307467, + "ewc_loss": 0.006391406524926424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.391406350303441e-05, + "grad_norm": 3.3556957244873047, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8635432720184326, + "num_tokens": 132956948.0, + "step": 3483 + }, + { + "epoch": 0.44320061060933724, + "ewc_loss": 0.006368331611156464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.36833137832582e-05, + "grad_norm": 3.349703788757324, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8655687570571899, + "num_tokens": 132995213.0, + "step": 3484 + }, + { + "epoch": 
0.44332782088792777, + "ewc_loss": 0.0063679697923362255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.367969763232395e-05, + "grad_norm": 3.351243019104004, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8567821979522705, + "num_tokens": 133038850.0, + "step": 3485 + }, + { + "epoch": 0.44345503116651824, + "ewc_loss": 0.006371539551764727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.371539348037913e-05, + "grad_norm": 3.324070692062378, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8617193698883057, + "num_tokens": 133080766.0, + "step": 3486 + }, + { + "epoch": 0.44358224144510877, + "ewc_loss": 0.006338849198073149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.338849198073149e-05, + "grad_norm": 3.336794137954712, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8573762774467468, + "num_tokens": 133120845.0, + "step": 3487 + }, + { + "epoch": 0.4437094517236993, + "ewc_loss": 0.006366437301039696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.366437446558848e-05, + "grad_norm": 3.347508192062378, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8578076362609863, + "num_tokens": 133166011.0, + "step": 3488 + }, + { + "epoch": 0.4438366620022898, + "ewc_loss": 0.006357923150062561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.35792312095873e-05, + "grad_norm": 3.371297836303711, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8588031530380249, + "num_tokens": 133203507.0, + "step": 3489 + }, + { + "epoch": 0.4439638722808803, + "ewc_loss": 0.0063479188829660416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.347918679239228e-05, + "grad_norm": 3.432924747467041, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8354935646057129, + "num_tokens": 133240554.0, + "step": 3490 + }, + { + "epoch": 0.44409108255947083, + "ewc_loss": 0.006388282869011164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.388282781699672e-05, + "grad_norm": 3.4358551502227783, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8434706926345825, + "num_tokens": 133274987.0, + "step": 3491 + }, + { + "epoch": 0.4442182928380613, + "ewc_loss": 0.006371754687279463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.371754716383293e-05, + "grad_norm": 3.395961046218872, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8325704336166382, + "num_tokens": 133319287.0, + "step": 3492 + }, + { + "epoch": 0.44434550311665183, + "ewc_loss": 0.0063571622595191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.357162055792287e-05, + "grad_norm": 3.364478826522827, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8654630780220032, + "num_tokens": 133353670.0, + "step": 3493 + }, + { + "epoch": 0.44447271339524236, + "ewc_loss": 0.006344711408019066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.344711437122896e-05, + "grad_norm": 3.4058871269226074, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8593659400939941, + "num_tokens": 133391950.0, + "step": 3494 + }, + { + "epoch": 0.44459992367383283, + "ewc_loss": 0.006387769710272551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.387769826687872e-05, + "grad_norm": 3.373404026031494, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.865057110786438, + "num_tokens": 133424758.0, + "step": 3495 + }, + { + "epoch": 0.44472713395242336, + "ewc_loss": 0.006354862824082375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.354862853186205e-05, + "grad_norm": 3.337644338607788, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.888103723526001, + "num_tokens": 133459869.0, + "step": 3496 + }, + { + "epoch": 0.4448543442310139, + "ewc_loss": 0.006351489573717117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.35148971923627e-05, + "grad_norm": 3.46688175201416, + "learning_rate": 1e-06, + "loss": 0.428, + 
"mean_token_accuracy": 0.8571902513504028, + "num_tokens": 133494598.0, + "step": 3497 + }, + { + "epoch": 0.44498155450960436, + "ewc_loss": 0.006441589444875717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.441589357564226e-05, + "grad_norm": 3.3625741004943848, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8675305843353271, + "num_tokens": 133536403.0, + "step": 3498 + }, + { + "epoch": 0.4451087647881949, + "ewc_loss": 0.006341543514281511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.341543485177681e-05, + "grad_norm": 3.449380397796631, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8389971256256104, + "num_tokens": 133567319.0, + "step": 3499 + }, + { + "epoch": 0.4452359750667854, + "ewc_loss": 0.006427979562431574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.427979678846896e-05, + "grad_norm": 3.37894344329834, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.852629542350769, + "num_tokens": 133604829.0, + "step": 3500 + }, + { + "epoch": 0.4453631853453759, + "ewc_loss": 0.006375560071319342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.375560042215511e-05, + "grad_norm": 3.3808016777038574, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8591303825378418, + "num_tokens": 133638373.0, + "step": 3501 + }, + { + "epoch": 0.4454903956239664, + "ewc_loss": 0.006391141563653946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.391141505446285e-05, + "grad_norm": 3.3416717052459717, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8705835938453674, + "num_tokens": 133674110.0, + "step": 3502 + }, + { + "epoch": 0.44561760590255695, + "ewc_loss": 0.00636658538132906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.366585148498416e-05, + "grad_norm": 3.3600704669952393, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.861640214920044, + "num_tokens": 133711005.0, + "step": 3503 + }, + { + "epoch": 
0.4457448161811474, + "ewc_loss": 0.006406111642718315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.406111788237467e-05, + "grad_norm": 3.3505537509918213, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8730477690696716, + "num_tokens": 133746258.0, + "step": 3504 + }, + { + "epoch": 0.44587202645973795, + "ewc_loss": 0.006386409047991037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.38640922261402e-05, + "grad_norm": 3.274543046951294, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8682209253311157, + "num_tokens": 133790695.0, + "step": 3505 + }, + { + "epoch": 0.4459992367383285, + "ewc_loss": 0.006357686594128609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.35768665233627e-05, + "grad_norm": 3.3691091537475586, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8578603267669678, + "num_tokens": 133830181.0, + "step": 3506 + }, + { + "epoch": 0.44612644701691895, + "ewc_loss": 0.006421008612960577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.421008583856747e-05, + "grad_norm": 3.3527331352233887, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8648501634597778, + "num_tokens": 133866849.0, + "step": 3507 + }, + { + "epoch": 0.4462536572955095, + "ewc_loss": 0.006380466744303703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.380466948030517e-05, + "grad_norm": 3.3465561866760254, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8591651916503906, + "num_tokens": 133906812.0, + "step": 3508 + }, + { + "epoch": 0.4463808675741, + "ewc_loss": 0.006395702715963125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.395702803274617e-05, + "grad_norm": 3.3466835021972656, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8535096049308777, + "num_tokens": 133948956.0, + "step": 3509 + }, + { + "epoch": 0.4465080778526905, + "ewc_loss": 0.006392119452357292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.392119394149631e-05, + "grad_norm": 3.3641180992126465, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8431721329689026, + "num_tokens": 133988764.0, + "step": 3510 + }, + { + "epoch": 0.446635288131281, + "ewc_loss": 0.006406265310943127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.406265310943127e-05, + "grad_norm": 3.3430023193359375, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8656307458877563, + "num_tokens": 134028148.0, + "step": 3511 + }, + { + "epoch": 0.44676249840987153, + "ewc_loss": 0.006382330320775509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.382330320775509e-05, + "grad_norm": 3.4352946281433105, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8352213501930237, + "num_tokens": 134065909.0, + "step": 3512 + }, + { + "epoch": 0.446889708688462, + "ewc_loss": 0.006436917465180159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.436917465180159e-05, + "grad_norm": 3.303755521774292, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8661038875579834, + "num_tokens": 134108342.0, + "step": 3513 + }, + { + "epoch": 0.44701691896705253, + "ewc_loss": 0.006331146229058504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.331146141747013e-05, + "grad_norm": 3.3764023780822754, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8598616123199463, + "num_tokens": 134145928.0, + "step": 3514 + }, + { + "epoch": 0.44714412924564306, + "ewc_loss": 0.006422126200050116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.422126170946285e-05, + "grad_norm": 3.3111891746520996, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8566633462905884, + "num_tokens": 134185955.0, + "step": 3515 + }, + { + "epoch": 0.44727133952423354, + "ewc_loss": 0.0063463058322668076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.346305599436164e-05, + "grad_norm": 3.33105206489563, + "learning_rate": 1e-06, + "loss": 0.4085, + 
"mean_token_accuracy": 0.8652167320251465, + "num_tokens": 134225829.0, + "step": 3516 + }, + { + "epoch": 0.44739854980282406, + "ewc_loss": 0.006388514302670956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.388514157151803e-05, + "grad_norm": 3.364596366882324, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8508851528167725, + "num_tokens": 134261381.0, + "step": 3517 + }, + { + "epoch": 0.4475257600814146, + "ewc_loss": 0.0063975416123867035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.397541437763721e-05, + "grad_norm": 3.4004533290863037, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8670225143432617, + "num_tokens": 134294697.0, + "step": 3518 + }, + { + "epoch": 0.44765297036000506, + "ewc_loss": 0.006409747060388327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.409746856661513e-05, + "grad_norm": 3.2764008045196533, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8807005882263184, + "num_tokens": 134334349.0, + "step": 3519 + }, + { + "epoch": 0.4477801806385956, + "ewc_loss": 0.006331675220280886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.331675103865564e-05, + "grad_norm": 3.3720343112945557, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8487037420272827, + "num_tokens": 134373381.0, + "step": 3520 + }, + { + "epoch": 0.4479073909171861, + "ewc_loss": 0.006427941843867302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.427941843867302e-05, + "grad_norm": 3.384026288986206, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8649919033050537, + "num_tokens": 134407952.0, + "step": 3521 + }, + { + "epoch": 0.4480346011957766, + "ewc_loss": 0.006397465709596872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.397465767804533e-05, + "grad_norm": 3.281287670135498, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8496570587158203, + "num_tokens": 134458862.0, + "step": 3522 + }, + { + "epoch": 
0.4481618114743671, + "ewc_loss": 0.006336132530122995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.336132355500013e-05, + "grad_norm": 3.451833724975586, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8579835891723633, + "num_tokens": 134494007.0, + "step": 3523 + }, + { + "epoch": 0.44828902175295765, + "ewc_loss": 0.006459915079176426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.45991531200707e-05, + "grad_norm": 3.3929426670074463, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8463364839553833, + "num_tokens": 134534195.0, + "step": 3524 + }, + { + "epoch": 0.4484162320315481, + "ewc_loss": 0.0063783214427530766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.378321268130094e-05, + "grad_norm": 3.3956031799316406, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8683826923370361, + "num_tokens": 134570241.0, + "step": 3525 + }, + { + "epoch": 0.44854344231013865, + "ewc_loss": 0.006375462748110294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.375462544383481e-05, + "grad_norm": 3.3048975467681885, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8585941791534424, + "num_tokens": 134611904.0, + "step": 3526 + }, + { + "epoch": 0.4486706525887292, + "ewc_loss": 0.006342479959130287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.342479900922626e-05, + "grad_norm": 3.4718661308288574, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8507922291755676, + "num_tokens": 134644426.0, + "step": 3527 + }, + { + "epoch": 0.44879786286731965, + "ewc_loss": 0.00645506102591753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.455060793086886e-05, + "grad_norm": 3.4022645950317383, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8677763938903809, + "num_tokens": 134678979.0, + "step": 3528 + }, + { + "epoch": 0.4489250731459102, + "ewc_loss": 0.006367055233567953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.367055175360292e-05, + "grad_norm": 3.371979236602783, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8639824390411377, + "num_tokens": 134717270.0, + "step": 3529 + }, + { + "epoch": 0.4490522834245007, + "ewc_loss": 0.0063852183520793915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.385218148352578e-05, + "grad_norm": 3.4038164615631104, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8604497909545898, + "num_tokens": 134755464.0, + "step": 3530 + }, + { + "epoch": 0.44917949370309124, + "ewc_loss": 0.006427524145692587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.427524203900248e-05, + "grad_norm": 3.340257167816162, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8524285554885864, + "num_tokens": 134793737.0, + "step": 3531 + }, + { + "epoch": 0.4493067039816817, + "ewc_loss": 0.006359701976180077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.359702092595398e-05, + "grad_norm": 3.3419911861419678, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8532665967941284, + "num_tokens": 134834332.0, + "step": 3532 + }, + { + "epoch": 0.44943391426027224, + "ewc_loss": 0.006396202836185694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.396202661562711e-05, + "grad_norm": 3.357379674911499, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8712484836578369, + "num_tokens": 134871027.0, + "step": 3533 + }, + { + "epoch": 0.44956112453886277, + "ewc_loss": 0.006392769981175661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.392769864760339e-05, + "grad_norm": 3.385800361633301, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.860706090927124, + "num_tokens": 134908529.0, + "step": 3534 + }, + { + "epoch": 0.44968833481745324, + "ewc_loss": 0.006401048507541418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.401048449333757e-05, + "grad_norm": 3.355725049972534, + "learning_rate": 1e-06, + "loss": 0.3799, + 
"mean_token_accuracy": 0.8685240745544434, + "num_tokens": 134944177.0, + "step": 3535 + }, + { + "epoch": 0.44981554509604377, + "ewc_loss": 0.006376881152391434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.376881356118247e-05, + "grad_norm": 3.4127275943756104, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8575536012649536, + "num_tokens": 134977928.0, + "step": 3536 + }, + { + "epoch": 0.4499427553746343, + "ewc_loss": 0.006425352301448584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.425352330552414e-05, + "grad_norm": 3.3434410095214844, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.860865592956543, + "num_tokens": 135017554.0, + "step": 3537 + }, + { + "epoch": 0.45006996565322477, + "ewc_loss": 0.006358087062835693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.35808683000505e-05, + "grad_norm": 3.342902183532715, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8665972948074341, + "num_tokens": 135058694.0, + "step": 3538 + }, + { + "epoch": 0.4501971759318153, + "ewc_loss": 0.006379405502229929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.379405385814607e-05, + "grad_norm": 3.3895301818847656, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8620154857635498, + "num_tokens": 135094037.0, + "step": 3539 + }, + { + "epoch": 0.4503243862104058, + "ewc_loss": 0.006407200824469328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.40720099909231e-05, + "grad_norm": 3.388667583465576, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8539787530899048, + "num_tokens": 135132758.0, + "step": 3540 + }, + { + "epoch": 0.4504515964889963, + "ewc_loss": 0.006395940203219652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.395939999492839e-05, + "grad_norm": 3.3765060901641846, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8723869323730469, + "num_tokens": 135169398.0, + "step": 3541 + }, + { + "epoch": 
0.4505788067675868, + "ewc_loss": 0.006386653520166874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.386653694789857e-05, + "grad_norm": 3.4235804080963135, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8465378284454346, + "num_tokens": 135204312.0, + "step": 3542 + }, + { + "epoch": 0.45070601704617735, + "ewc_loss": 0.00642411969602108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.424119783332571e-05, + "grad_norm": 3.3440911769866943, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8576493263244629, + "num_tokens": 135246287.0, + "step": 3543 + }, + { + "epoch": 0.4508332273247678, + "ewc_loss": 0.006365813780575991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.365813896991313e-05, + "grad_norm": 3.373028039932251, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8628010749816895, + "num_tokens": 135280970.0, + "step": 3544 + }, + { + "epoch": 0.45096043760335836, + "ewc_loss": 0.00640812236815691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.408122135326266e-05, + "grad_norm": 3.338139295578003, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8821364641189575, + "num_tokens": 135317873.0, + "step": 3545 + }, + { + "epoch": 0.4510876478819489, + "ewc_loss": 0.006373195443302393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.373195355990902e-05, + "grad_norm": 3.374250650405884, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8660992383956909, + "num_tokens": 135354994.0, + "step": 3546 + }, + { + "epoch": 0.45121485816053936, + "ewc_loss": 0.006412410642951727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.412410584744066e-05, + "grad_norm": 3.4602458477020264, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8548015356063843, + "num_tokens": 135393050.0, + "step": 3547 + }, + { + "epoch": 0.4513420684391299, + "ewc_loss": 0.0064467196352779865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.446719635277987e-05, + "grad_norm": 3.337886333465576, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8651713132858276, + "num_tokens": 135431704.0, + "step": 3548 + }, + { + "epoch": 0.4514692787177204, + "ewc_loss": 0.006340993568301201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.340993422782049e-05, + "grad_norm": 3.4000041484832764, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8386999368667603, + "num_tokens": 135471306.0, + "step": 3549 + }, + { + "epoch": 0.4515964889963109, + "ewc_loss": 0.00642395531758666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.423955346690491e-05, + "grad_norm": 3.3898463249206543, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8479532599449158, + "num_tokens": 135509647.0, + "step": 3550 + }, + { + "epoch": 0.4517236992749014, + "ewc_loss": 0.0063780322670936584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.378032412612811e-05, + "grad_norm": 3.3159902095794678, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.867464005947113, + "num_tokens": 135550469.0, + "step": 3551 + }, + { + "epoch": 0.45185090955349194, + "ewc_loss": 0.006342414300888777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.342414417304099e-05, + "grad_norm": 3.3478519916534424, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8591437339782715, + "num_tokens": 135589290.0, + "step": 3552 + }, + { + "epoch": 0.4519781198320824, + "ewc_loss": 0.006398514378815889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.398514233296737e-05, + "grad_norm": 3.439309597015381, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8542150259017944, + "num_tokens": 135624416.0, + "step": 3553 + }, + { + "epoch": 0.45210533011067294, + "ewc_loss": 0.006413806229829788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.413806113414466e-05, + "grad_norm": 3.3837339878082275, + "learning_rate": 1e-06, + "loss": 0.4144, + 
"mean_token_accuracy": 0.8624560236930847, + "num_tokens": 135662316.0, + "step": 3554 + }, + { + "epoch": 0.45223254038926347, + "ewc_loss": 0.006364359986037016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.364360160659999e-05, + "grad_norm": 3.334522247314453, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8617481589317322, + "num_tokens": 135701739.0, + "step": 3555 + }, + { + "epoch": 0.45235975066785394, + "ewc_loss": 0.006357078440487385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.357078382279724e-05, + "grad_norm": 3.3739726543426514, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8562101125717163, + "num_tokens": 135741372.0, + "step": 3556 + }, + { + "epoch": 0.45248696094644447, + "ewc_loss": 0.006401747465133667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.40174766886048e-05, + "grad_norm": 3.3926475048065186, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8817576169967651, + "num_tokens": 135779945.0, + "step": 3557 + }, + { + "epoch": 0.452614171225035, + "ewc_loss": 0.00639368686825037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.393686635419726e-05, + "grad_norm": 3.4388175010681152, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8545218110084534, + "num_tokens": 135813179.0, + "step": 3558 + }, + { + "epoch": 0.4527413815036255, + "ewc_loss": 0.006408922374248505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.408922490663826e-05, + "grad_norm": 3.322108030319214, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8634045720100403, + "num_tokens": 135855147.0, + "step": 3559 + }, + { + "epoch": 0.452868591782216, + "ewc_loss": 0.00633642915636301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.336429214570671e-05, + "grad_norm": 3.3550002574920654, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8678895235061646, + "num_tokens": 135892990.0, + "step": 3560 + }, + { + "epoch": 
0.45299580206080653, + "ewc_loss": 0.006390625610947609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.39062564005144e-05, + "grad_norm": 3.3737542629241943, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8792905807495117, + "num_tokens": 135926911.0, + "step": 3561 + }, + { + "epoch": 0.453123012339397, + "ewc_loss": 0.0063834479078650475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.383447907865047e-05, + "grad_norm": 3.3609797954559326, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8692017793655396, + "num_tokens": 135963530.0, + "step": 3562 + }, + { + "epoch": 0.45325022261798753, + "ewc_loss": 0.006382114253938198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.382114224834368e-05, + "grad_norm": 3.4164047241210938, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8510890007019043, + "num_tokens": 136000677.0, + "step": 3563 + }, + { + "epoch": 0.45337743289657806, + "ewc_loss": 0.006418335251510143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.418335397029296e-05, + "grad_norm": 3.4182844161987305, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.850928544998169, + "num_tokens": 136037283.0, + "step": 3564 + }, + { + "epoch": 0.45350464317516853, + "ewc_loss": 0.00639493390917778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.394933734554797e-05, + "grad_norm": 3.4431748390197754, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8438022136688232, + "num_tokens": 136071454.0, + "step": 3565 + }, + { + "epoch": 0.45363185345375906, + "ewc_loss": 0.00642597209662199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.425972242141142e-05, + "grad_norm": 3.394439458847046, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8503382205963135, + "num_tokens": 136105638.0, + "step": 3566 + }, + { + "epoch": 0.4537590637323496, + "ewc_loss": 0.006394147872924805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.394147931132466e-05, + "grad_norm": 3.2888994216918945, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.86385178565979, + "num_tokens": 136147968.0, + "step": 3567 + }, + { + "epoch": 0.45388627401094006, + "ewc_loss": 0.006353878416121006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.353878416121006e-05, + "grad_norm": 3.354069471359253, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8617075681686401, + "num_tokens": 136190468.0, + "step": 3568 + }, + { + "epoch": 0.4540134842895306, + "ewc_loss": 0.006437215954065323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.43721577944234e-05, + "grad_norm": 3.312443733215332, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8644741773605347, + "num_tokens": 136229363.0, + "step": 3569 + }, + { + "epoch": 0.4541406945681211, + "ewc_loss": 0.006391255650669336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.391255737980828e-05, + "grad_norm": 3.3830032348632812, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8551433086395264, + "num_tokens": 136268834.0, + "step": 3570 + }, + { + "epoch": 0.4542679048467116, + "ewc_loss": 0.006437758449465036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.437758565880358e-05, + "grad_norm": 3.3781163692474365, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.837810754776001, + "num_tokens": 136306761.0, + "step": 3571 + }, + { + "epoch": 0.4543951151253021, + "ewc_loss": 0.006425388157367706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.425387982744724e-05, + "grad_norm": 3.4027504920959473, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8590114712715149, + "num_tokens": 136347736.0, + "step": 3572 + }, + { + "epoch": 0.45452232540389265, + "ewc_loss": 0.006421515718102455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.421515718102455e-05, + "grad_norm": 3.4678831100463867, + "learning_rate": 1e-06, + "loss": 0.4163, + 
"mean_token_accuracy": 0.8635443449020386, + "num_tokens": 136381009.0, + "step": 3573 + }, + { + "epoch": 0.4546495356824831, + "ewc_loss": 0.006474535446614027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.47453562123701e-05, + "grad_norm": 3.3508241176605225, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8717226982116699, + "num_tokens": 136418050.0, + "step": 3574 + }, + { + "epoch": 0.45477674596107365, + "ewc_loss": 0.0063808877021074295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.380887498380616e-05, + "grad_norm": 3.5144472122192383, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8423836827278137, + "num_tokens": 136459109.0, + "step": 3575 + }, + { + "epoch": 0.4549039562396642, + "ewc_loss": 0.006518777925521135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.518777809105814e-05, + "grad_norm": 3.3582844734191895, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8558159470558167, + "num_tokens": 136497993.0, + "step": 3576 + }, + { + "epoch": 0.45503116651825465, + "ewc_loss": 0.006370085757225752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.3700856117066e-05, + "grad_norm": 3.368499279022217, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8634926080703735, + "num_tokens": 136534446.0, + "step": 3577 + }, + { + "epoch": 0.4551583767968452, + "ewc_loss": 0.006427550222724676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.427550397347659e-05, + "grad_norm": 3.3948678970336914, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8447511792182922, + "num_tokens": 136571117.0, + "step": 3578 + }, + { + "epoch": 0.4552855870754357, + "ewc_loss": 0.00645512854680419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.455128459492698e-05, + "grad_norm": 3.4187347888946533, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.867304801940918, + "num_tokens": 136601706.0, + "step": 3579 + }, + { + "epoch": 
0.4554127973540262, + "ewc_loss": 0.006452809553593397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.452809611801058e-05, + "grad_norm": 3.381068706512451, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8607757091522217, + "num_tokens": 136639735.0, + "step": 3580 + }, + { + "epoch": 0.4555400076326167, + "ewc_loss": 0.006431444548070431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.43144448986277e-05, + "grad_norm": 3.395986795425415, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8537691831588745, + "num_tokens": 136675647.0, + "step": 3581 + }, + { + "epoch": 0.45566721791120723, + "ewc_loss": 0.00647507980465889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.475079862866551e-05, + "grad_norm": 3.3441877365112305, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8686302900314331, + "num_tokens": 136713361.0, + "step": 3582 + }, + { + "epoch": 0.45579442818979776, + "ewc_loss": 0.006428529042750597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.428529013646767e-05, + "grad_norm": 3.369013786315918, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8602551221847534, + "num_tokens": 136751440.0, + "step": 3583 + }, + { + "epoch": 0.45592163846838824, + "ewc_loss": 0.006449245847761631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.449245847761631e-05, + "grad_norm": 3.423555612564087, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.855813205242157, + "num_tokens": 136786078.0, + "step": 3584 + }, + { + "epoch": 0.45604884874697876, + "ewc_loss": 0.006478874944150448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.478875002358109e-05, + "grad_norm": 3.331421136856079, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8679998517036438, + "num_tokens": 136828313.0, + "step": 3585 + }, + { + "epoch": 0.4561760590255693, + "ewc_loss": 0.006416119635105133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.416119867935777e-05, + "grad_norm": 3.3815276622772217, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8593963384628296, + "num_tokens": 136862549.0, + "step": 3586 + }, + { + "epoch": 0.45630326930415976, + "ewc_loss": 0.006486015394330025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.486015627160668e-05, + "grad_norm": 3.3310422897338867, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8640080690383911, + "num_tokens": 136906423.0, + "step": 3587 + }, + { + "epoch": 0.4564304795827503, + "ewc_loss": 0.006440095603466034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.440095603466034e-05, + "grad_norm": 3.4141077995300293, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8493129014968872, + "num_tokens": 136941550.0, + "step": 3588 + }, + { + "epoch": 0.4565576898613408, + "ewc_loss": 0.006489424966275692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.489425140898675e-05, + "grad_norm": 3.344937562942505, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8624438047409058, + "num_tokens": 136980234.0, + "step": 3589 + }, + { + "epoch": 0.4566849001399313, + "ewc_loss": 0.00642794743180275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.427947664633393e-05, + "grad_norm": 3.325024366378784, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8637350797653198, + "num_tokens": 137023991.0, + "step": 3590 + }, + { + "epoch": 0.4568121104185218, + "ewc_loss": 0.0064280773513019085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.428077176678926e-05, + "grad_norm": 3.389401435852051, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8519047498703003, + "num_tokens": 137063261.0, + "step": 3591 + }, + { + "epoch": 0.45693932069711235, + "ewc_loss": 0.006474402267485857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.47440247121267e-05, + "grad_norm": 3.32623028755188, + "learning_rate": 1e-06, + "loss": 0.3838, + 
"mean_token_accuracy": 0.8704613447189331, + "num_tokens": 137103935.0, + "step": 3592 + }, + { + "epoch": 0.4570665309757028, + "ewc_loss": 0.006403258070349693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.403258157661185e-05, + "grad_norm": 3.399528741836548, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8593807220458984, + "num_tokens": 137140194.0, + "step": 3593 + }, + { + "epoch": 0.45719374125429335, + "ewc_loss": 0.006470805965363979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.47080596536398e-05, + "grad_norm": 3.4100112915039062, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8512133359909058, + "num_tokens": 137179086.0, + "step": 3594 + }, + { + "epoch": 0.4573209515328839, + "ewc_loss": 0.006438014563173056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.438014679588377e-05, + "grad_norm": 3.364539384841919, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8657429218292236, + "num_tokens": 137216887.0, + "step": 3595 + }, + { + "epoch": 0.45744816181147435, + "ewc_loss": 0.006414136383682489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.41413644189015e-05, + "grad_norm": 3.344870090484619, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8764152526855469, + "num_tokens": 137255029.0, + "step": 3596 + }, + { + "epoch": 0.4575753720900649, + "ewc_loss": 0.006412258371710777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.41225851722993e-05, + "grad_norm": 3.3677608966827393, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8632997274398804, + "num_tokens": 137291399.0, + "step": 3597 + }, + { + "epoch": 0.4577025823686554, + "ewc_loss": 0.006424409337341785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.424409366445616e-05, + "grad_norm": 3.4107284545898438, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8644462823867798, + "num_tokens": 137326569.0, + "step": 3598 + }, + { + "epoch": 
0.4578297926472459, + "ewc_loss": 0.006435591261833906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.435591058107093e-05, + "grad_norm": 3.336862564086914, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8555663228034973, + "num_tokens": 137370035.0, + "step": 3599 + }, + { + "epoch": 0.4579570029258364, + "ewc_loss": 0.006359385792165995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.359385588439181e-05, + "grad_norm": 3.370521068572998, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8565075397491455, + "num_tokens": 137407969.0, + "step": 3600 + }, + { + "epoch": 0.45808421320442694, + "ewc_loss": 0.006417475175112486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.417475378839299e-05, + "grad_norm": 3.3542239665985107, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8580091595649719, + "num_tokens": 137447256.0, + "step": 3601 + }, + { + "epoch": 0.4582114234830174, + "ewc_loss": 0.006395587231963873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.395587115548551e-05, + "grad_norm": 3.4858365058898926, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8463132381439209, + "num_tokens": 137479762.0, + "step": 3602 + }, + { + "epoch": 0.45833863376160794, + "ewc_loss": 0.006472667213529348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.472667155321687e-05, + "grad_norm": 3.3612093925476074, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8526127934455872, + "num_tokens": 137521823.0, + "step": 3603 + }, + { + "epoch": 0.45846584404019847, + "ewc_loss": 0.0063707237131893635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.370723713189363e-05, + "grad_norm": 3.415205955505371, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8587812185287476, + "num_tokens": 137558174.0, + "step": 3604 + }, + { + "epoch": 0.45859305431878894, + "ewc_loss": 0.006440417375415564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.440417200792581e-05, + "grad_norm": 3.414198398590088, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8366552591323853, + "num_tokens": 137597265.0, + "step": 3605 + }, + { + "epoch": 0.45872026459737947, + "ewc_loss": 0.006400817073881626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.400817073881626e-05, + "grad_norm": 3.324918031692505, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8692138195037842, + "num_tokens": 137635954.0, + "step": 3606 + }, + { + "epoch": 0.45884747487597, + "ewc_loss": 0.00637039402499795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.370394112309441e-05, + "grad_norm": 3.334009885787964, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8624353408813477, + "num_tokens": 137676137.0, + "step": 3607 + }, + { + "epoch": 0.45897468515456047, + "ewc_loss": 0.006391935516148806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.391935312421992e-05, + "grad_norm": 3.340184450149536, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8645955324172974, + "num_tokens": 137716723.0, + "step": 3608 + }, + { + "epoch": 0.459101895433151, + "ewc_loss": 0.006380550563335419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.38055062154308e-05, + "grad_norm": 3.4048025608062744, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8440600633621216, + "num_tokens": 137752435.0, + "step": 3609 + }, + { + "epoch": 0.4592291057117415, + "ewc_loss": 0.006420725490897894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.420725549105555e-05, + "grad_norm": 3.4156503677368164, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8519114255905151, + "num_tokens": 137786684.0, + "step": 3610 + }, + { + "epoch": 0.459356315990332, + "ewc_loss": 0.006410619243979454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.410619243979454e-05, + "grad_norm": 3.330878257751465, + "learning_rate": 1e-06, + "loss": 0.4424, + 
"mean_token_accuracy": 0.8510967493057251, + "num_tokens": 137830668.0, + "step": 3611 + }, + { + "epoch": 0.4594835262689225, + "ewc_loss": 0.006358182989060879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.358182872645557e-05, + "grad_norm": 3.3511438369750977, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8674643635749817, + "num_tokens": 137865262.0, + "step": 3612 + }, + { + "epoch": 0.45961073654751305, + "ewc_loss": 0.0064003970474004745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.400397251127288e-05, + "grad_norm": 3.385267972946167, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8682961463928223, + "num_tokens": 137898867.0, + "step": 3613 + }, + { + "epoch": 0.4597379468261035, + "ewc_loss": 0.006410001311451197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.41000151517801e-05, + "grad_norm": 3.4736111164093018, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8626636266708374, + "num_tokens": 137935108.0, + "step": 3614 + }, + { + "epoch": 0.45986515710469406, + "ewc_loss": 0.006433534901589155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.433534872485325e-05, + "grad_norm": 3.3162808418273926, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8708199262619019, + "num_tokens": 137977714.0, + "step": 3615 + }, + { + "epoch": 0.4599923673832846, + "ewc_loss": 0.006314173340797424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.314173515420407e-05, + "grad_norm": 3.5190258026123047, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8726943731307983, + "num_tokens": 138006345.0, + "step": 3616 + }, + { + "epoch": 0.46011957766187506, + "ewc_loss": 0.006511957384645939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.511957326438278e-05, + "grad_norm": 3.3692445755004883, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8613870739936829, + "num_tokens": 138045106.0, + "step": 3617 + }, + { + "epoch": 
0.4602467879404656, + "ewc_loss": 0.006344133522361517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.34413372608833e-05, + "grad_norm": 3.405583381652832, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8536162376403809, + "num_tokens": 138079993.0, + "step": 3618 + }, + { + "epoch": 0.4603739982190561, + "ewc_loss": 0.006428235210478306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.428235064959154e-05, + "grad_norm": 3.359673261642456, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8649070262908936, + "num_tokens": 138118555.0, + "step": 3619 + }, + { + "epoch": 0.4605012084976466, + "ewc_loss": 0.006388664711266756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.388664769474417e-05, + "grad_norm": 3.4219155311584473, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.869057297706604, + "num_tokens": 138155096.0, + "step": 3620 + }, + { + "epoch": 0.4606284187762371, + "ewc_loss": 0.00643646577373147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.436465628212318e-05, + "grad_norm": 3.3656764030456543, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.863228976726532, + "num_tokens": 138194529.0, + "step": 3621 + }, + { + "epoch": 0.46075562905482764, + "ewc_loss": 0.006386938039213419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.386938184732571e-05, + "grad_norm": 3.4589555263519287, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8352916240692139, + "num_tokens": 138235520.0, + "step": 3622 + }, + { + "epoch": 0.4608828393334181, + "ewc_loss": 0.006474957801401615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.474957626778632e-05, + "grad_norm": 3.3758981227874756, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8703700304031372, + "num_tokens": 138271391.0, + "step": 3623 + }, + { + "epoch": 0.46101004961200864, + "ewc_loss": 0.006393190938979387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.3931911427062e-05, + "grad_norm": 3.3583931922912598, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8552792072296143, + "num_tokens": 138313923.0, + "step": 3624 + }, + { + "epoch": 0.46113725989059917, + "ewc_loss": 0.006411133799701929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.411133654182777e-05, + "grad_norm": 3.327373504638672, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8537164330482483, + "num_tokens": 138355992.0, + "step": 3625 + }, + { + "epoch": 0.46126447016918964, + "ewc_loss": 0.006403000559657812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.403000588761643e-05, + "grad_norm": 3.4612841606140137, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8802415132522583, + "num_tokens": 138393074.0, + "step": 3626 + }, + { + "epoch": 0.4613916804477802, + "ewc_loss": 0.006492186337709427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.492186366813257e-05, + "grad_norm": 3.507871150970459, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8674100041389465, + "num_tokens": 138427854.0, + "step": 3627 + }, + { + "epoch": 0.4615188907263707, + "ewc_loss": 0.0064590503461658955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.459050200646743e-05, + "grad_norm": 3.347952127456665, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8692095875740051, + "num_tokens": 138467359.0, + "step": 3628 + }, + { + "epoch": 0.4616461010049612, + "ewc_loss": 0.0063604325987398624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.360432598739862e-05, + "grad_norm": 3.4480268955230713, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8486166000366211, + "num_tokens": 138505355.0, + "step": 3629 + }, + { + "epoch": 0.4617733112835517, + "ewc_loss": 0.006473333574831486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.473333633039147e-05, + "grad_norm": 3.357267141342163, + "learning_rate": 1e-06, + "loss": 0.4035, + 
"mean_token_accuracy": 0.8645932674407959, + "num_tokens": 138543703.0, + "step": 3630 + }, + { + "epoch": 0.46190052156214223, + "ewc_loss": 0.006377519574016333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.377519457601011e-05, + "grad_norm": 3.392162322998047, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8530659079551697, + "num_tokens": 138579860.0, + "step": 3631 + }, + { + "epoch": 0.46202773184073276, + "ewc_loss": 0.00643053138628602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.43053135718219e-05, + "grad_norm": 3.3808395862579346, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8708617687225342, + "num_tokens": 138615755.0, + "step": 3632 + }, + { + "epoch": 0.46215494211932323, + "ewc_loss": 0.0064134723506867886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.413472146959975e-05, + "grad_norm": 3.4778904914855957, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8545724153518677, + "num_tokens": 138647607.0, + "step": 3633 + }, + { + "epoch": 0.46228215239791376, + "ewc_loss": 0.0064798383973538876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.479838339146227e-05, + "grad_norm": 3.4695918560028076, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8665672540664673, + "num_tokens": 138679284.0, + "step": 3634 + }, + { + "epoch": 0.4624093626765043, + "ewc_loss": 0.006438873242586851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.438873242586851e-05, + "grad_norm": 3.335070848464966, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8713628649711609, + "num_tokens": 138719690.0, + "step": 3635 + }, + { + "epoch": 0.46253657295509476, + "ewc_loss": 0.00637404527515173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.374045187840238e-05, + "grad_norm": 3.398312568664551, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8494181036949158, + "num_tokens": 138758031.0, + "step": 3636 + }, + { + "epoch": 
0.4626637832336853, + "ewc_loss": 0.006465445272624493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.465445039793849e-05, + "grad_norm": 3.405644655227661, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8616270422935486, + "num_tokens": 138797283.0, + "step": 3637 + }, + { + "epoch": 0.4627909935122758, + "ewc_loss": 0.006433103233575821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.433103408198804e-05, + "grad_norm": 3.376405954360962, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8589291572570801, + "num_tokens": 138834672.0, + "step": 3638 + }, + { + "epoch": 0.4629182037908663, + "ewc_loss": 0.006432943511754274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.432943337131292e-05, + "grad_norm": 3.3479323387145996, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8716900944709778, + "num_tokens": 138871105.0, + "step": 3639 + }, + { + "epoch": 0.4630454140694568, + "ewc_loss": 0.006411823444068432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.411823414964601e-05, + "grad_norm": 3.461516857147217, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8621416091918945, + "num_tokens": 138903516.0, + "step": 3640 + }, + { + "epoch": 0.46317262434804735, + "ewc_loss": 0.006497689988464117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.497689901152626e-05, + "grad_norm": 3.394235134124756, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8594539761543274, + "num_tokens": 138942021.0, + "step": 3641 + }, + { + "epoch": 0.4632998346266378, + "ewc_loss": 0.006422535050660372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.422535079764202e-05, + "grad_norm": 3.364304780960083, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8690277338027954, + "num_tokens": 138978210.0, + "step": 3642 + }, + { + "epoch": 0.46342704490522835, + "ewc_loss": 0.006422663107514381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.422663136618212e-05, + "grad_norm": 3.3245904445648193, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8642566204071045, + "num_tokens": 139019122.0, + "step": 3643 + }, + { + "epoch": 0.4635542551838189, + "ewc_loss": 0.0064148977398872375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.414897507056594e-05, + "grad_norm": 3.347883701324463, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8705050349235535, + "num_tokens": 139062187.0, + "step": 3644 + }, + { + "epoch": 0.46368146546240935, + "ewc_loss": 0.00642735417932272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.427353946492076e-05, + "grad_norm": 3.332833766937256, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8802372217178345, + "num_tokens": 139102960.0, + "step": 3645 + }, + { + "epoch": 0.4638086757409999, + "ewc_loss": 0.006409600377082825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.409600609913468e-05, + "grad_norm": 3.388159990310669, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8636570572853088, + "num_tokens": 139137302.0, + "step": 3646 + }, + { + "epoch": 0.4639358860195904, + "ewc_loss": 0.006445758976042271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.445759208872914e-05, + "grad_norm": 3.417004108428955, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8423063158988953, + "num_tokens": 139177195.0, + "step": 3647 + }, + { + "epoch": 0.4640630962981809, + "ewc_loss": 0.006444426253437996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.444426253437996e-05, + "grad_norm": 3.4156322479248047, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8570454120635986, + "num_tokens": 139214722.0, + "step": 3648 + }, + { + "epoch": 0.4641903065767714, + "ewc_loss": 0.00644519180059433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.445191684179008e-05, + "grad_norm": 3.423916816711426, + "learning_rate": 1e-06, + "loss": 0.4274, + 
"mean_token_accuracy": 0.8603638410568237, + "num_tokens": 139248569.0, + "step": 3649 + }, + { + "epoch": 0.46431751685536193, + "ewc_loss": 0.006437940523028374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.437940464820713e-05, + "grad_norm": 3.338878870010376, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8678191900253296, + "num_tokens": 139289141.0, + "step": 3650 + }, + { + "epoch": 0.4644447271339524, + "ewc_loss": 0.006392213981598616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.392213981598616e-05, + "grad_norm": 3.3399457931518555, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.879641056060791, + "num_tokens": 139330791.0, + "step": 3651 + }, + { + "epoch": 0.46457193741254293, + "ewc_loss": 0.006412714719772339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.412714719772339e-05, + "grad_norm": 3.4276487827301025, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8300914764404297, + "num_tokens": 139373009.0, + "step": 3652 + }, + { + "epoch": 0.46469914769113346, + "ewc_loss": 0.0064497049897909164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.449704960687086e-05, + "grad_norm": 3.368727922439575, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8596993088722229, + "num_tokens": 139408894.0, + "step": 3653 + }, + { + "epoch": 0.46482635796972394, + "ewc_loss": 0.006390594411641359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.390594353433698e-05, + "grad_norm": 3.382732391357422, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.860173225402832, + "num_tokens": 139444040.0, + "step": 3654 + }, + { + "epoch": 0.46495356824831446, + "ewc_loss": 0.0064207264222204685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.420726276701316e-05, + "grad_norm": 3.380054235458374, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8666603565216064, + "num_tokens": 139479675.0, + "step": 3655 + }, + { + "epoch": 
0.465080778526905, + "ewc_loss": 0.006405862979590893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.405862950487062e-05, + "grad_norm": 3.3294999599456787, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.867702841758728, + "num_tokens": 139521792.0, + "step": 3656 + }, + { + "epoch": 0.46520798880549546, + "ewc_loss": 0.0063845086842775345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.384508742485195e-05, + "grad_norm": 3.3503689765930176, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8679533004760742, + "num_tokens": 139563471.0, + "step": 3657 + }, + { + "epoch": 0.465335199084086, + "ewc_loss": 0.00642399163916707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.423991726478562e-05, + "grad_norm": 3.371931552886963, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8615615367889404, + "num_tokens": 139604702.0, + "step": 3658 + }, + { + "epoch": 0.4654624093626765, + "ewc_loss": 0.006410406902432442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.41040678601712e-05, + "grad_norm": 3.373505115509033, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8587008714675903, + "num_tokens": 139644573.0, + "step": 3659 + }, + { + "epoch": 0.465589619641267, + "ewc_loss": 0.006398960016667843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.398960249498487e-05, + "grad_norm": 3.425635814666748, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8481331467628479, + "num_tokens": 139680924.0, + "step": 3660 + }, + { + "epoch": 0.4657168299198575, + "ewc_loss": 0.006421297322958708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.42129743937403e-05, + "grad_norm": 3.361968517303467, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8531477451324463, + "num_tokens": 139724369.0, + "step": 3661 + }, + { + "epoch": 0.46584404019844805, + "ewc_loss": 0.006375976372510195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.375976226991042e-05, + "grad_norm": 3.3301146030426025, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8615835905075073, + "num_tokens": 139769412.0, + "step": 3662 + }, + { + "epoch": 0.4659712504770385, + "ewc_loss": 0.006361974403262138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.361974374158308e-05, + "grad_norm": 3.4138801097869873, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8367775678634644, + "num_tokens": 139808066.0, + "step": 3663 + }, + { + "epoch": 0.46609846075562905, + "ewc_loss": 0.006432684138417244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.432684313040227e-05, + "grad_norm": 3.375441312789917, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8663098216056824, + "num_tokens": 139851169.0, + "step": 3664 + }, + { + "epoch": 0.4662256710342196, + "ewc_loss": 0.006376578472554684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.376578676281497e-05, + "grad_norm": 3.378821611404419, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.870259165763855, + "num_tokens": 139890550.0, + "step": 3665 + }, + { + "epoch": 0.46635288131281005, + "ewc_loss": 0.006398397032171488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.398397090379149e-05, + "grad_norm": 3.377040147781372, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8340932130813599, + "num_tokens": 139931496.0, + "step": 3666 + }, + { + "epoch": 0.4664800915914006, + "ewc_loss": 0.006392665673047304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.392665818566456e-05, + "grad_norm": 3.4587740898132324, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8507451415061951, + "num_tokens": 139965872.0, + "step": 3667 + }, + { + "epoch": 0.4666073018699911, + "ewc_loss": 0.006435032933950424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.435032992158085e-05, + "grad_norm": 3.4713969230651855, + "learning_rate": 1e-06, + "loss": 0.4852, + 
"mean_token_accuracy": 0.8470151424407959, + "num_tokens": 140000394.0, + "step": 3668 + }, + { + "epoch": 0.4667345121485816, + "ewc_loss": 0.006426086649298668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.426086474675685e-05, + "grad_norm": 3.367884397506714, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8448038101196289, + "num_tokens": 140039967.0, + "step": 3669 + }, + { + "epoch": 0.4668617224271721, + "ewc_loss": 0.006377969868481159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.377969839377329e-05, + "grad_norm": 3.3913180828094482, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8613809943199158, + "num_tokens": 140075739.0, + "step": 3670 + }, + { + "epoch": 0.46698893270576264, + "ewc_loss": 0.006436915136873722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.436915282392874e-05, + "grad_norm": 3.413254737854004, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8654121160507202, + "num_tokens": 140110016.0, + "step": 3671 + }, + { + "epoch": 0.4671161429843531, + "ewc_loss": 0.0064420076087117195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.442007725127041e-05, + "grad_norm": 3.3859362602233887, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8517540097236633, + "num_tokens": 140147905.0, + "step": 3672 + }, + { + "epoch": 0.46724335326294364, + "ewc_loss": 0.006422120146453381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.422120350180194e-05, + "grad_norm": 3.3989827632904053, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8566432595252991, + "num_tokens": 140182284.0, + "step": 3673 + }, + { + "epoch": 0.46737056354153417, + "ewc_loss": 0.006458327639847994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.458327698055655e-05, + "grad_norm": 3.3355891704559326, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8555775880813599, + "num_tokens": 140220669.0, + "step": 3674 + }, + { + "epoch": 
0.46749777382012464, + "ewc_loss": 0.0064150262624025345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.415026291506365e-05, + "grad_norm": 3.415722131729126, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.846595048904419, + "num_tokens": 140258399.0, + "step": 3675 + }, + { + "epoch": 0.46762498409871517, + "ewc_loss": 0.006494889501482248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.494889385066926e-05, + "grad_norm": 3.3891947269439697, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.875123143196106, + "num_tokens": 140296015.0, + "step": 3676 + }, + { + "epoch": 0.4677521943773057, + "ewc_loss": 0.006455094087868929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.455094262491912e-05, + "grad_norm": 3.379300355911255, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8501695394515991, + "num_tokens": 140338618.0, + "step": 3677 + }, + { + "epoch": 0.46787940465589617, + "ewc_loss": 0.006457239389419556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.457239214796573e-05, + "grad_norm": 3.383296012878418, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8652892112731934, + "num_tokens": 140379151.0, + "step": 3678 + }, + { + "epoch": 0.4680066149344867, + "ewc_loss": 0.006483479868620634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.483479955932125e-05, + "grad_norm": 3.4025416374206543, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8462512493133545, + "num_tokens": 140417740.0, + "step": 3679 + }, + { + "epoch": 0.4681338252130772, + "ewc_loss": 0.006484629586338997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.484629557235166e-05, + "grad_norm": 3.422477960586548, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8587852716445923, + "num_tokens": 140452454.0, + "step": 3680 + }, + { + "epoch": 0.46826103549166775, + "ewc_loss": 0.006494783330708742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.49478315608576e-05, + "grad_norm": 3.4512975215911865, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8384908437728882, + "num_tokens": 140492981.0, + "step": 3681 + }, + { + "epoch": 0.4683882457702582, + "ewc_loss": 0.006507363170385361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.507363286800683e-05, + "grad_norm": 3.341615676879883, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8498514294624329, + "num_tokens": 140534131.0, + "step": 3682 + }, + { + "epoch": 0.46851545604884876, + "ewc_loss": 0.006435672752559185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.435672548832372e-05, + "grad_norm": 3.4133431911468506, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8513109087944031, + "num_tokens": 140576761.0, + "step": 3683 + }, + { + "epoch": 0.4686426663274393, + "ewc_loss": 0.006527087651193142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.527087680296972e-05, + "grad_norm": 3.510962963104248, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8411104679107666, + "num_tokens": 140609538.0, + "step": 3684 + }, + { + "epoch": 0.46876987660602976, + "ewc_loss": 0.006549024023115635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.549023964907974e-05, + "grad_norm": 3.4583609104156494, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8563966751098633, + "num_tokens": 140641802.0, + "step": 3685 + }, + { + "epoch": 0.4688970868846203, + "ewc_loss": 0.006491668988019228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.491669046226889e-05, + "grad_norm": 3.3444528579711914, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8483673334121704, + "num_tokens": 140685832.0, + "step": 3686 + }, + { + "epoch": 0.4690242971632108, + "ewc_loss": 0.006454689893871546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.454689719248563e-05, + "grad_norm": 3.419321060180664, + "learning_rate": 1e-06, + "loss": 0.4549, + 
"mean_token_accuracy": 0.8485087156295776, + "num_tokens": 140725333.0, + "step": 3687 + }, + { + "epoch": 0.4691515074418013, + "ewc_loss": 0.00653524324297905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.53524330118671e-05, + "grad_norm": 3.428333044052124, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8578044772148132, + "num_tokens": 140760838.0, + "step": 3688 + }, + { + "epoch": 0.4692787177203918, + "ewc_loss": 0.006510762497782707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.510762614198029e-05, + "grad_norm": 3.440364360809326, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8495473861694336, + "num_tokens": 140797367.0, + "step": 3689 + }, + { + "epoch": 0.46940592799898234, + "ewc_loss": 0.006513063330203295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.513063271995634e-05, + "grad_norm": 3.3715903759002686, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8581631183624268, + "num_tokens": 140839778.0, + "step": 3690 + }, + { + "epoch": 0.4695331382775728, + "ewc_loss": 0.006474471651017666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.474471592810005e-05, + "grad_norm": 3.3780341148376465, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8552054166793823, + "num_tokens": 140880340.0, + "step": 3691 + }, + { + "epoch": 0.46966034855616334, + "ewc_loss": 0.006501178722828627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.501178722828627e-05, + "grad_norm": 3.425391435623169, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8527729511260986, + "num_tokens": 140915451.0, + "step": 3692 + }, + { + "epoch": 0.46978755883475387, + "ewc_loss": 0.0065257060341537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.525705975946039e-05, + "grad_norm": 3.3689076900482178, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8566097021102905, + "num_tokens": 140956967.0, + "step": 3693 + }, + { + "epoch": 
0.46991476911334434, + "ewc_loss": 0.006483667064458132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.48366694804281e-05, + "grad_norm": 3.3416872024536133, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8744176030158997, + "num_tokens": 140996640.0, + "step": 3694 + }, + { + "epoch": 0.47004197939193487, + "ewc_loss": 0.006485919002443552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.4859188569244e-05, + "grad_norm": 3.3794422149658203, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8579065203666687, + "num_tokens": 141036094.0, + "step": 3695 + }, + { + "epoch": 0.4701691896705254, + "ewc_loss": 0.006516619585454464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.516619760077447e-05, + "grad_norm": 3.5043444633483887, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8494329452514648, + "num_tokens": 141069033.0, + "step": 3696 + }, + { + "epoch": 0.4702963999491159, + "ewc_loss": 0.006572242826223373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.572243000846356e-05, + "grad_norm": 3.3165977001190186, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8762607574462891, + "num_tokens": 141111075.0, + "step": 3697 + }, + { + "epoch": 0.4704236102277064, + "ewc_loss": 0.006422396283596754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.422396108973771e-05, + "grad_norm": 3.404510259628296, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8751420974731445, + "num_tokens": 141145270.0, + "step": 3698 + }, + { + "epoch": 0.47055082050629693, + "ewc_loss": 0.006545120384544134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.545120413647965e-05, + "grad_norm": 3.371204137802124, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8751739263534546, + "num_tokens": 141182132.0, + "step": 3699 + }, + { + "epoch": 0.4706780307848874, + "ewc_loss": 0.0064658611081540585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.46586122456938e-05, + "grad_norm": 3.428901433944702, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8594590425491333, + "num_tokens": 141217455.0, + "step": 3700 + }, + { + "epoch": 0.47080524106347793, + "ewc_loss": 0.00651150057092309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.511500396300107e-05, + "grad_norm": 3.4379966259002686, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8599445223808289, + "num_tokens": 141249680.0, + "step": 3701 + }, + { + "epoch": 0.47093245134206846, + "ewc_loss": 0.006492366082966328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.492366082966328e-05, + "grad_norm": 3.350586414337158, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8582684397697449, + "num_tokens": 141292085.0, + "step": 3702 + }, + { + "epoch": 0.47105966162065893, + "ewc_loss": 0.00644250912591815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.442509038606659e-05, + "grad_norm": 3.3449134826660156, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8683387041091919, + "num_tokens": 141331455.0, + "step": 3703 + }, + { + "epoch": 0.47118687189924946, + "ewc_loss": 0.006487484090030193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.48748391540721e-05, + "grad_norm": 3.422137975692749, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8598166704177856, + "num_tokens": 141365623.0, + "step": 3704 + }, + { + "epoch": 0.47131408217784, + "ewc_loss": 0.0065125999972224236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.51259979349561e-05, + "grad_norm": 3.4322640895843506, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8509302139282227, + "num_tokens": 141399033.0, + "step": 3705 + }, + { + "epoch": 0.47144129245643046, + "ewc_loss": 0.006493558641523123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.493558612419292e-05, + "grad_norm": 3.442910671234131, + "learning_rate": 1e-06, + "loss": 0.3963, + 
"mean_token_accuracy": 0.8664391040802002, + "num_tokens": 141430436.0, + "step": 3706 + }, + { + "epoch": 0.471568502735021, + "ewc_loss": 0.006507026497274637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.507026409963146e-05, + "grad_norm": 3.436769485473633, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8767803311347961, + "num_tokens": 141463169.0, + "step": 3707 + }, + { + "epoch": 0.4716957130136115, + "ewc_loss": 0.006497567053884268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.497566937468946e-05, + "grad_norm": 3.396183967590332, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8790297508239746, + "num_tokens": 141495524.0, + "step": 3708 + }, + { + "epoch": 0.471822923292202, + "ewc_loss": 0.00649560010060668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.495600246125832e-05, + "grad_norm": 3.3384640216827393, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8715964555740356, + "num_tokens": 141533099.0, + "step": 3709 + }, + { + "epoch": 0.4719501335707925, + "ewc_loss": 0.006482897792011499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.482897879322991e-05, + "grad_norm": 3.344900131225586, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8652923703193665, + "num_tokens": 141576389.0, + "step": 3710 + }, + { + "epoch": 0.47207734384938305, + "ewc_loss": 0.006497183348983526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.497183494502679e-05, + "grad_norm": 3.4494788646698, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8621193766593933, + "num_tokens": 141611870.0, + "step": 3711 + }, + { + "epoch": 0.4722045541279735, + "ewc_loss": 0.006551993079483509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.551993283210322e-05, + "grad_norm": 3.427827835083008, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8719795942306519, + "num_tokens": 141648539.0, + "step": 3712 + }, + { + "epoch": 
0.47233176440656405, + "ewc_loss": 0.0065087792463600636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.508779188152403e-05, + "grad_norm": 3.391223669052124, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8639212250709534, + "num_tokens": 141688549.0, + "step": 3713 + }, + { + "epoch": 0.4724589746851546, + "ewc_loss": 0.006486453115940094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.486452912213281e-05, + "grad_norm": 3.3828125, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8685335516929626, + "num_tokens": 141725294.0, + "step": 3714 + }, + { + "epoch": 0.47258618496374505, + "ewc_loss": 0.006500142626464367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.500142626464367e-05, + "grad_norm": 3.5067138671875, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8438906073570251, + "num_tokens": 141756761.0, + "step": 3715 + }, + { + "epoch": 0.4727133952423356, + "ewc_loss": 0.006573758088052273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.573757855221629e-05, + "grad_norm": 3.442728042602539, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.846621036529541, + "num_tokens": 141790417.0, + "step": 3716 + }, + { + "epoch": 0.4728406055209261, + "ewc_loss": 0.0065093752928078175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.509375089081004e-05, + "grad_norm": 3.342804193496704, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8710527420043945, + "num_tokens": 141830749.0, + "step": 3717 + }, + { + "epoch": 0.4729678157995166, + "ewc_loss": 0.0064760237000882626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.47602355456911e-05, + "grad_norm": 3.3543941974639893, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8650137186050415, + "num_tokens": 141868433.0, + "step": 3718 + }, + { + "epoch": 0.4730950260781071, + "ewc_loss": 0.006517324596643448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.517324800370261e-05, + 
"grad_norm": 3.356546401977539, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8783560395240784, + "num_tokens": 141907140.0, + "step": 3719 + }, + { + "epoch": 0.47322223635669763, + "ewc_loss": 0.00649365596473217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.493656110251322e-05, + "grad_norm": 3.3530924320220947, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8751531839370728, + "num_tokens": 141947436.0, + "step": 3720 + }, + { + "epoch": 0.4733494466352881, + "ewc_loss": 0.006482923869043589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.482924072770402e-05, + "grad_norm": 3.3757944107055664, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8541712164878845, + "num_tokens": 141986876.0, + "step": 3721 + }, + { + "epoch": 0.47347665691387864, + "ewc_loss": 0.006526064593344927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.526064680656418e-05, + "grad_norm": 3.408996105194092, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8632721304893494, + "num_tokens": 142025806.0, + "step": 3722 + }, + { + "epoch": 0.47360386719246916, + "ewc_loss": 0.006502395961433649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.50239599053748e-05, + "grad_norm": 3.4306044578552246, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8604467511177063, + "num_tokens": 142057932.0, + "step": 3723 + }, + { + "epoch": 0.47373107747105964, + "ewc_loss": 0.006496572867035866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.496573041658849e-05, + "grad_norm": 3.4631152153015137, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8649579882621765, + "num_tokens": 142088880.0, + "step": 3724 + }, + { + "epoch": 0.47385828774965016, + "ewc_loss": 0.006508592050522566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.508592196041718e-05, + "grad_norm": 3.410099983215332, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 
0.8747373819351196, + "num_tokens": 142124959.0, + "step": 3725 + }, + { + "epoch": 0.4739854980282407, + "ewc_loss": 0.006463910453021526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.463910540333018e-05, + "grad_norm": 3.4397428035736084, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8485792875289917, + "num_tokens": 142160667.0, + "step": 3726 + }, + { + "epoch": 0.47411270830683117, + "ewc_loss": 0.00650579622015357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.505796045530587e-05, + "grad_norm": 3.3294715881347656, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8611367344856262, + "num_tokens": 142205017.0, + "step": 3727 + }, + { + "epoch": 0.4742399185854217, + "ewc_loss": 0.006441125646233559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.441125879064202e-05, + "grad_norm": 3.418160915374756, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8674039840698242, + "num_tokens": 142241479.0, + "step": 3728 + }, + { + "epoch": 0.4743671288640122, + "ewc_loss": 0.006518855690956116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.518855661852285e-05, + "grad_norm": 3.4215309619903564, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8607156276702881, + "num_tokens": 142280881.0, + "step": 3729 + }, + { + "epoch": 0.4744943391426027, + "ewc_loss": 0.0064935809932649136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.493581167887896e-05, + "grad_norm": 3.3105430603027344, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8680863380432129, + "num_tokens": 142328248.0, + "step": 3730 + }, + { + "epoch": 0.4746215494211932, + "ewc_loss": 0.006425297819077969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.425297760870308e-05, + "grad_norm": 3.4608142375946045, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8453059196472168, + "num_tokens": 142364872.0, + "step": 3731 + }, + { + "epoch": 0.47474875969978375, + 
"ewc_loss": 0.006547221913933754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.547221710206941e-05, + "grad_norm": 3.455022096633911, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.844569206237793, + "num_tokens": 142400494.0, + "step": 3732 + }, + { + "epoch": 0.4748759699783743, + "ewc_loss": 0.006491947919130325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.491947715403512e-05, + "grad_norm": 3.4462716579437256, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8593100309371948, + "num_tokens": 142433739.0, + "step": 3733 + }, + { + "epoch": 0.47500318025696475, + "ewc_loss": 0.006490309257060289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.490309169748798e-05, + "grad_norm": 3.335909366607666, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8694249987602234, + "num_tokens": 142469777.0, + "step": 3734 + }, + { + "epoch": 0.4751303905355553, + "ewc_loss": 0.0064411768689751625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.441176810767502e-05, + "grad_norm": 3.421083450317383, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8511243462562561, + "num_tokens": 142510052.0, + "step": 3735 + }, + { + "epoch": 0.4752576008141458, + "ewc_loss": 0.006506145931780338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.50614601909183e-05, + "grad_norm": 3.4183361530303955, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8494311571121216, + "num_tokens": 142545689.0, + "step": 3736 + }, + { + "epoch": 0.4753848110927363, + "ewc_loss": 0.006484828889369965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.484828918473795e-05, + "grad_norm": 3.401906728744507, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8589292168617249, + "num_tokens": 142585916.0, + "step": 3737 + }, + { + "epoch": 0.4755120213713268, + "ewc_loss": 0.006468497216701508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.468497304012999e-05, + "grad_norm": 
3.4079997539520264, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8681513071060181, + "num_tokens": 142623019.0, + "step": 3738 + }, + { + "epoch": 0.47563923164991734, + "ewc_loss": 0.006502925883978605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.502925680251792e-05, + "grad_norm": 3.396796941757202, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8580144643783569, + "num_tokens": 142660986.0, + "step": 3739 + }, + { + "epoch": 0.4757664419285078, + "ewc_loss": 0.006472807377576828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.472807581303641e-05, + "grad_norm": 3.354428768157959, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8644441366195679, + "num_tokens": 142702876.0, + "step": 3740 + }, + { + "epoch": 0.47589365220709834, + "ewc_loss": 0.006454004906117916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.454005051637068e-05, + "grad_norm": 3.3572919368743896, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8497098088264465, + "num_tokens": 142744082.0, + "step": 3741 + }, + { + "epoch": 0.47602086248568887, + "ewc_loss": 0.006471634842455387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.471634696936235e-05, + "grad_norm": 3.4591119289398193, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8429107069969177, + "num_tokens": 142777511.0, + "step": 3742 + }, + { + "epoch": 0.47614807276427934, + "ewc_loss": 0.006522269453853369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.52226954116486e-05, + "grad_norm": 3.4049925804138184, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8553339242935181, + "num_tokens": 142818882.0, + "step": 3743 + }, + { + "epoch": 0.47627528304286987, + "ewc_loss": 0.006470236461609602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.470236257882789e-05, + "grad_norm": 3.4567742347717285, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8659071922302246, + 
"num_tokens": 142852771.0, + "step": 3744 + }, + { + "epoch": 0.4764024933214604, + "ewc_loss": 0.0065137725323438644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.513772677863017e-05, + "grad_norm": 3.4142184257507324, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8617595434188843, + "num_tokens": 142888398.0, + "step": 3745 + }, + { + "epoch": 0.47652970360005087, + "ewc_loss": 0.006480284966528416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.480285082943738e-05, + "grad_norm": 3.4009764194488525, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8769099116325378, + "num_tokens": 142924789.0, + "step": 3746 + }, + { + "epoch": 0.4766569138786414, + "ewc_loss": 0.006492229178547859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.492229294963181e-05, + "grad_norm": 3.3949108123779297, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8688802123069763, + "num_tokens": 142958843.0, + "step": 3747 + }, + { + "epoch": 0.4767841241572319, + "ewc_loss": 0.0064965467900037766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.496546848211437e-05, + "grad_norm": 3.417140483856201, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8720262050628662, + "num_tokens": 142996408.0, + "step": 3748 + }, + { + "epoch": 0.4769113344358224, + "ewc_loss": 0.006506381556391716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.506381760118529e-05, + "grad_norm": 3.3528528213500977, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.853618860244751, + "num_tokens": 143040031.0, + "step": 3749 + }, + { + "epoch": 0.4770385447144129, + "ewc_loss": 0.006473910063505173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.47390988888219e-05, + "grad_norm": 3.4022228717803955, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.837862491607666, + "num_tokens": 143081648.0, + "step": 3750 + }, + { + "epoch": 0.47716575499300345, + "ewc_loss": 0.006502525415271521, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.502525502583012e-05, + "grad_norm": 3.42972993850708, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8398492336273193, + "num_tokens": 143118913.0, + "step": 3751 + }, + { + "epoch": 0.4772929652715939, + "ewc_loss": 0.006507235113531351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.507235229946673e-05, + "grad_norm": 3.3963894844055176, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.869899332523346, + "num_tokens": 143153858.0, + "step": 3752 + }, + { + "epoch": 0.47742017555018446, + "ewc_loss": 0.006478746887296438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.478746945504099e-05, + "grad_norm": 3.3766872882843018, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8730427026748657, + "num_tokens": 143190977.0, + "step": 3753 + }, + { + "epoch": 0.477547385828775, + "ewc_loss": 0.006494523957371712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.494524131994694e-05, + "grad_norm": 3.4479620456695557, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8534023761749268, + "num_tokens": 143228009.0, + "step": 3754 + }, + { + "epoch": 0.47767459610736546, + "ewc_loss": 0.006533169653266668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.533169653266668e-05, + "grad_norm": 3.4150214195251465, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8512462973594666, + "num_tokens": 143269820.0, + "step": 3755 + }, + { + "epoch": 0.477801806385956, + "ewc_loss": 0.006469780579209328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.469780782936141e-05, + "grad_norm": 3.4203040599823, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8649721145629883, + "num_tokens": 143303441.0, + "step": 3756 + }, + { + "epoch": 0.4779290166645465, + "ewc_loss": 0.0065019563771784306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.501956522697583e-05, + "grad_norm": 3.4055590629577637, + 
"learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8543117642402649, + "num_tokens": 143338748.0, + "step": 3757 + }, + { + "epoch": 0.478056226943137, + "ewc_loss": 0.006491275504231453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.491275416919962e-05, + "grad_norm": 3.399322509765625, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.86210036277771, + "num_tokens": 143373900.0, + "step": 3758 + }, + { + "epoch": 0.4781834372217275, + "ewc_loss": 0.006496110465377569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.496110290754586e-05, + "grad_norm": 3.4346015453338623, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8556610941886902, + "num_tokens": 143411190.0, + "step": 3759 + }, + { + "epoch": 0.47831064750031804, + "ewc_loss": 0.006522697396576405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.522697367472574e-05, + "grad_norm": 3.3861472606658936, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8560794591903687, + "num_tokens": 143449470.0, + "step": 3760 + }, + { + "epoch": 0.4784378577789085, + "ewc_loss": 0.006480883341282606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.480883166659623e-05, + "grad_norm": 3.4408562183380127, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8657430410385132, + "num_tokens": 143485868.0, + "step": 3761 + }, + { + "epoch": 0.47856506805749904, + "ewc_loss": 0.006541525945067406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.541526090586558e-05, + "grad_norm": 3.4052436351776123, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.869804859161377, + "num_tokens": 143520334.0, + "step": 3762 + }, + { + "epoch": 0.47869227833608957, + "ewc_loss": 0.006487515289336443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.487515202024952e-05, + "grad_norm": 3.379432439804077, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8546899557113647, + "num_tokens": 143557752.0, + 
"step": 3763 + }, + { + "epoch": 0.47881948861468004, + "ewc_loss": 0.006496875546872616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.496875721495599e-05, + "grad_norm": 3.4074368476867676, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8596476316452026, + "num_tokens": 143593385.0, + "step": 3764 + }, + { + "epoch": 0.4789466988932706, + "ewc_loss": 0.006519089452922344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.5190892200917e-05, + "grad_norm": 3.435683488845825, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.840362548828125, + "num_tokens": 143631975.0, + "step": 3765 + }, + { + "epoch": 0.4790739091718611, + "ewc_loss": 0.006523539777845144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.523539923364297e-05, + "grad_norm": 3.4728171825408936, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.843461275100708, + "num_tokens": 143668382.0, + "step": 3766 + }, + { + "epoch": 0.4792011194504516, + "ewc_loss": 0.0065359920263290405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.53599199722521e-05, + "grad_norm": 3.414299726486206, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8458968997001648, + "num_tokens": 143706623.0, + "step": 3767 + }, + { + "epoch": 0.4793283297290421, + "ewc_loss": 0.006502353120595217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.502353062387556e-05, + "grad_norm": 3.430379867553711, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8653771877288818, + "num_tokens": 143745664.0, + "step": 3768 + }, + { + "epoch": 0.47945554000763263, + "ewc_loss": 0.006512900348752737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.512900290545076e-05, + "grad_norm": 3.4455173015594482, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8521688580513, + "num_tokens": 143780761.0, + "step": 3769 + }, + { + "epoch": 0.4795827502862231, + "ewc_loss": 0.006539611611515284, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 6.539611786138266e-05, + "grad_norm": 3.382112503051758, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.865704357624054, + "num_tokens": 143825298.0, + "step": 3770 + }, + { + "epoch": 0.47970996056481363, + "ewc_loss": 0.006483679171651602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.483679317170754e-05, + "grad_norm": 3.389329433441162, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8814018368721008, + "num_tokens": 143859855.0, + "step": 3771 + }, + { + "epoch": 0.47983717084340416, + "ewc_loss": 0.006510578561574221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.51057853247039e-05, + "grad_norm": 3.371166944503784, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8742122650146484, + "num_tokens": 143898553.0, + "step": 3772 + }, + { + "epoch": 0.47996438112199463, + "ewc_loss": 0.006499040871858597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.49904104648158e-05, + "grad_norm": 3.4349000453948975, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.857472836971283, + "num_tokens": 143935362.0, + "step": 3773 + }, + { + "epoch": 0.48009159140058516, + "ewc_loss": 0.00654505705460906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.545057112816721e-05, + "grad_norm": 3.3525733947753906, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.861067533493042, + "num_tokens": 143978692.0, + "step": 3774 + }, + { + "epoch": 0.4802188016791757, + "ewc_loss": 0.006478752940893173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.47875276627019e-05, + "grad_norm": 3.378948450088501, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8680779933929443, + "num_tokens": 144020951.0, + "step": 3775 + }, + { + "epoch": 0.48034601195776616, + "ewc_loss": 0.006504478864371777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.50447909720242e-05, + "grad_norm": 3.381821393966675, + "learning_rate": 1e-06, + "loss": 0.3972, + 
"mean_token_accuracy": 0.8703638315200806, + "num_tokens": 144058341.0, + "step": 3776 + }, + { + "epoch": 0.4804732222363567, + "ewc_loss": 0.006507632322609425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.507632497232407e-05, + "grad_norm": 3.394284248352051, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8541054725646973, + "num_tokens": 144098232.0, + "step": 3777 + }, + { + "epoch": 0.4806004325149472, + "ewc_loss": 0.006512643303722143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.512643449241295e-05, + "grad_norm": 3.399618625640869, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8557167053222656, + "num_tokens": 144137537.0, + "step": 3778 + }, + { + "epoch": 0.4807276427935377, + "ewc_loss": 0.006518797483295202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.518797454191372e-05, + "grad_norm": 3.3953423500061035, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8668868541717529, + "num_tokens": 144178459.0, + "step": 3779 + }, + { + "epoch": 0.4808548530721282, + "ewc_loss": 0.006508807651698589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.508807564387098e-05, + "grad_norm": 3.4137985706329346, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.856474757194519, + "num_tokens": 144219257.0, + "step": 3780 + }, + { + "epoch": 0.48098206335071875, + "ewc_loss": 0.006520130205899477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.52013040962629e-05, + "grad_norm": 3.3574554920196533, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8667067885398865, + "num_tokens": 144262232.0, + "step": 3781 + }, + { + "epoch": 0.4811092736293093, + "ewc_loss": 0.006470848806202412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.470848893513903e-05, + "grad_norm": 3.4779999256134033, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8385912775993347, + "num_tokens": 144297144.0, + "step": 3782 + }, + { + "epoch": 
0.48123648390789975, + "ewc_loss": 0.0065481639467179775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.548163946717978e-05, + "grad_norm": 3.362433671951294, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8617857694625854, + "num_tokens": 144336078.0, + "step": 3783 + }, + { + "epoch": 0.4813636941864903, + "ewc_loss": 0.006447523832321167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.447523628594354e-05, + "grad_norm": 3.390838861465454, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8518744707107544, + "num_tokens": 144380162.0, + "step": 3784 + }, + { + "epoch": 0.4814909044650808, + "ewc_loss": 0.0064819976687431335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.481997843366116e-05, + "grad_norm": 3.4435653686523438, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8478707075119019, + "num_tokens": 144416735.0, + "step": 3785 + }, + { + "epoch": 0.4816181147436713, + "ewc_loss": 0.0065294299274683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.529429811052978e-05, + "grad_norm": 3.436561107635498, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8500240445137024, + "num_tokens": 144452780.0, + "step": 3786 + }, + { + "epoch": 0.4817453250222618, + "ewc_loss": 0.00648808479309082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.488084909506142e-05, + "grad_norm": 3.3217661380767822, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.866307258605957, + "num_tokens": 144494155.0, + "step": 3787 + }, + { + "epoch": 0.48187253530085233, + "ewc_loss": 0.006426702719181776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.426702748285607e-05, + "grad_norm": 3.34199595451355, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8702328205108643, + "num_tokens": 144533989.0, + "step": 3788 + }, + { + "epoch": 0.4819997455794428, + "ewc_loss": 0.006480865646153688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.48086570436135e-05, + "grad_norm": 3.3813602924346924, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8565747737884521, + "num_tokens": 144577409.0, + "step": 3789 + }, + { + "epoch": 0.48212695585803333, + "ewc_loss": 0.0064810048788785934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.48100467515178e-05, + "grad_norm": 3.393892765045166, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.853237509727478, + "num_tokens": 144618501.0, + "step": 3790 + }, + { + "epoch": 0.48225416613662386, + "ewc_loss": 0.006476276088505983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.476276030298322e-05, + "grad_norm": 3.371460437774658, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8664611577987671, + "num_tokens": 144659454.0, + "step": 3791 + }, + { + "epoch": 0.48238137641521434, + "ewc_loss": 0.006440575700253248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.44057581666857e-05, + "grad_norm": 3.3918747901916504, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8664609789848328, + "num_tokens": 144695580.0, + "step": 3792 + }, + { + "epoch": 0.48250858669380486, + "ewc_loss": 0.006466521881520748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.466521881520748e-05, + "grad_norm": 3.4423251152038574, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8430013060569763, + "num_tokens": 144735586.0, + "step": 3793 + }, + { + "epoch": 0.4826357969723954, + "ewc_loss": 0.006493501830846071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.493501859949902e-05, + "grad_norm": 3.3404624462127686, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8620132207870483, + "num_tokens": 144776371.0, + "step": 3794 + }, + { + "epoch": 0.48276300725098586, + "ewc_loss": 0.006405318155884743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.40531798126176e-05, + "grad_norm": 3.3973608016967773, + "learning_rate": 1e-06, + "loss": 0.4455, + 
"mean_token_accuracy": 0.8562999963760376, + "num_tokens": 144816724.0, + "step": 3795 + }, + { + "epoch": 0.4828902175295764, + "ewc_loss": 0.006491436157375574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.491436215583235e-05, + "grad_norm": 3.3970234394073486, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8671416640281677, + "num_tokens": 144854591.0, + "step": 3796 + }, + { + "epoch": 0.4830174278081669, + "ewc_loss": 0.00645692041143775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.456920527853072e-05, + "grad_norm": 3.3553683757781982, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8610265851020813, + "num_tokens": 144893694.0, + "step": 3797 + }, + { + "epoch": 0.4831446380867574, + "ewc_loss": 0.006442897021770477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.442896847147495e-05, + "grad_norm": 3.3723671436309814, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8570560216903687, + "num_tokens": 144932985.0, + "step": 3798 + }, + { + "epoch": 0.4832718483653479, + "ewc_loss": 0.006485870108008385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.485870108008385e-05, + "grad_norm": 3.4275095462799072, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8627867698669434, + "num_tokens": 144967823.0, + "step": 3799 + }, + { + "epoch": 0.48339905864393845, + "ewc_loss": 0.006504361983388662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.504361954284832e-05, + "grad_norm": 3.317875623703003, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8804567456245422, + "num_tokens": 145007750.0, + "step": 3800 + }, + { + "epoch": 0.4835262689225289, + "ewc_loss": 0.006445774342864752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.445774488383904e-05, + "grad_norm": 3.414640188217163, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8543834090232849, + "num_tokens": 145048596.0, + "step": 3801 + }, + { + "epoch": 
0.48365347920111945, + "ewc_loss": 0.006533460691571236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.533460691571236e-05, + "grad_norm": 3.4983952045440674, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8502516746520996, + "num_tokens": 145080868.0, + "step": 3802 + }, + { + "epoch": 0.48378068947971, + "ewc_loss": 0.0065526715479791164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.552671402459964e-05, + "grad_norm": 3.4139435291290283, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8696724772453308, + "num_tokens": 145115865.0, + "step": 3803 + }, + { + "epoch": 0.48390789975830045, + "ewc_loss": 0.0064672306180000305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.46723055979237e-05, + "grad_norm": 3.483339548110962, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8730220794677734, + "num_tokens": 145151748.0, + "step": 3804 + }, + { + "epoch": 0.484035110036891, + "ewc_loss": 0.006551520898938179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.551521073561162e-05, + "grad_norm": 3.418332815170288, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8610385060310364, + "num_tokens": 145187074.0, + "step": 3805 + }, + { + "epoch": 0.4841623203154815, + "ewc_loss": 0.0064881229773163795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.488122744485736e-05, + "grad_norm": 3.3577136993408203, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8787888288497925, + "num_tokens": 145224855.0, + "step": 3806 + }, + { + "epoch": 0.484289530594072, + "ewc_loss": 0.0064776004292070866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.477600254584104e-05, + "grad_norm": 3.3610148429870605, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.868817925453186, + "num_tokens": 145266538.0, + "step": 3807 + }, + { + "epoch": 0.4844167408726625, + "ewc_loss": 0.0064904955215752125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.490495434263721e-05, + "grad_norm": 3.394775152206421, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8662447929382324, + "num_tokens": 145303560.0, + "step": 3808 + }, + { + "epoch": 0.48454395115125304, + "ewc_loss": 0.006517258007079363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.51725786156021e-05, + "grad_norm": 3.3736236095428467, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8630152940750122, + "num_tokens": 145345985.0, + "step": 3809 + }, + { + "epoch": 0.4846711614298435, + "ewc_loss": 0.006482937838882208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.482937897089869e-05, + "grad_norm": 3.4387381076812744, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8498513102531433, + "num_tokens": 145382132.0, + "step": 3810 + }, + { + "epoch": 0.48479837170843404, + "ewc_loss": 0.006547124125063419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.547124212374911e-05, + "grad_norm": 3.440847158432007, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8515888452529907, + "num_tokens": 145418992.0, + "step": 3811 + }, + { + "epoch": 0.48492558198702457, + "ewc_loss": 0.006520438008010387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.52043818263337e-05, + "grad_norm": 3.40938401222229, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8593606352806091, + "num_tokens": 145455892.0, + "step": 3812 + }, + { + "epoch": 0.48505279226561504, + "ewc_loss": 0.006505451165139675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.505451165139675e-05, + "grad_norm": 3.37349534034729, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8593518137931824, + "num_tokens": 145497217.0, + "step": 3813 + }, + { + "epoch": 0.48518000254420557, + "ewc_loss": 0.006495959125459194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.495958950836211e-05, + "grad_norm": 3.36985182762146, + "learning_rate": 1e-06, + "loss": 0.4, + 
"mean_token_accuracy": 0.8668087720870972, + "num_tokens": 145533822.0, + "step": 3814 + }, + { + "epoch": 0.4853072128227961, + "ewc_loss": 0.006514670327305794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.514670531032607e-05, + "grad_norm": 3.339108467102051, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8806160688400269, + "num_tokens": 145572967.0, + "step": 3815 + }, + { + "epoch": 0.48543442310138657, + "ewc_loss": 0.006496942136436701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.496941932709888e-05, + "grad_norm": 3.367988348007202, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8751349449157715, + "num_tokens": 145609436.0, + "step": 3816 + }, + { + "epoch": 0.4855616333799771, + "ewc_loss": 0.0065240501426160336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.524049967993051e-05, + "grad_norm": 3.430978536605835, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8738704919815063, + "num_tokens": 145645083.0, + "step": 3817 + }, + { + "epoch": 0.4856888436585676, + "ewc_loss": 0.006565830670297146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.565830699400976e-05, + "grad_norm": 3.499699831008911, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8425027132034302, + "num_tokens": 145680249.0, + "step": 3818 + }, + { + "epoch": 0.4858160539371581, + "ewc_loss": 0.006585362832993269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.585363007616252e-05, + "grad_norm": 3.3930633068084717, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8657903671264648, + "num_tokens": 145718569.0, + "step": 3819 + }, + { + "epoch": 0.4859432642157486, + "ewc_loss": 0.0064958506263792515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.49585053906776e-05, + "grad_norm": 3.508483648300171, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.846253514289856, + "num_tokens": 145751868.0, + "step": 3820 + }, + { + "epoch": 
0.48607047449433916, + "ewc_loss": 0.0066071683540940285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.607168324990198e-05, + "grad_norm": 3.4763989448547363, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8665328621864319, + "num_tokens": 145787881.0, + "step": 3821 + }, + { + "epoch": 0.48619768477292963, + "ewc_loss": 0.006543750874698162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.543751078424975e-05, + "grad_norm": 3.387885570526123, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8677206039428711, + "num_tokens": 145824010.0, + "step": 3822 + }, + { + "epoch": 0.48632489505152016, + "ewc_loss": 0.006501118186861277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.501118332380429e-05, + "grad_norm": 3.4115383625030518, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8688800930976868, + "num_tokens": 145860981.0, + "step": 3823 + }, + { + "epoch": 0.4864521053301107, + "ewc_loss": 0.006558859720826149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.558859604410827e-05, + "grad_norm": 3.4013850688934326, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8654669523239136, + "num_tokens": 145900687.0, + "step": 3824 + }, + { + "epoch": 0.48657931560870116, + "ewc_loss": 0.006534911692142487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.534911517519504e-05, + "grad_norm": 3.418980360031128, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8532888293266296, + "num_tokens": 145938888.0, + "step": 3825 + }, + { + "epoch": 0.4867065258872917, + "ewc_loss": 0.006556849926710129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.55684998491779e-05, + "grad_norm": 3.3281490802764893, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8581105470657349, + "num_tokens": 145987060.0, + "step": 3826 + }, + { + "epoch": 0.4868337361658822, + "ewc_loss": 0.00650421716272831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.50421716272831e-05, + "grad_norm": 3.433231830596924, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8566132187843323, + "num_tokens": 146026447.0, + "step": 3827 + }, + { + "epoch": 0.4869609464444727, + "ewc_loss": 0.006593808066099882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.593808211619034e-05, + "grad_norm": 3.3590922355651855, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8637662529945374, + "num_tokens": 146067099.0, + "step": 3828 + }, + { + "epoch": 0.4870881567230632, + "ewc_loss": 0.006506460253149271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.506460340460762e-05, + "grad_norm": 3.3776698112487793, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8497406244277954, + "num_tokens": 146110767.0, + "step": 3829 + }, + { + "epoch": 0.48721536700165374, + "ewc_loss": 0.006547300145030022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.547300290549174e-05, + "grad_norm": 3.471602201461792, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8546894788742065, + "num_tokens": 146147603.0, + "step": 3830 + }, + { + "epoch": 0.48734257728024427, + "ewc_loss": 0.006578458938747644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.578458851436153e-05, + "grad_norm": 3.392894744873047, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8717052936553955, + "num_tokens": 146188821.0, + "step": 3831 + }, + { + "epoch": 0.48746978755883474, + "ewc_loss": 0.006499607115983963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.499607115983963e-05, + "grad_norm": 3.380812168121338, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.85857093334198, + "num_tokens": 146231886.0, + "step": 3832 + }, + { + "epoch": 0.48759699783742527, + "ewc_loss": 0.006504159886389971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.504159682663158e-05, + "grad_norm": 3.4006428718566895, + "learning_rate": 1e-06, + "loss": 0.478, + 
"mean_token_accuracy": 0.8390316367149353, + "num_tokens": 146271965.0, + "step": 3833 + }, + { + "epoch": 0.4877242081160158, + "ewc_loss": 0.006532260216772556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.532260158564895e-05, + "grad_norm": 3.3854384422302246, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.847740113735199, + "num_tokens": 146316686.0, + "step": 3834 + }, + { + "epoch": 0.4878514183946063, + "ewc_loss": 0.0065033878199756145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.503387703560293e-05, + "grad_norm": 3.396021604537964, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8509156703948975, + "num_tokens": 146355098.0, + "step": 3835 + }, + { + "epoch": 0.4879786286731968, + "ewc_loss": 0.006512762978672981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.512762774946168e-05, + "grad_norm": 3.3961830139160156, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8659749627113342, + "num_tokens": 146390666.0, + "step": 3836 + }, + { + "epoch": 0.48810583895178733, + "ewc_loss": 0.0065062581561505795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.506258068839088e-05, + "grad_norm": 3.4192283153533936, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8689056634902954, + "num_tokens": 146426740.0, + "step": 3837 + }, + { + "epoch": 0.4882330492303778, + "ewc_loss": 0.0065024616196751595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.502461474156007e-05, + "grad_norm": 3.428340435028076, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8521729707717896, + "num_tokens": 146462668.0, + "step": 3838 + }, + { + "epoch": 0.48836025950896833, + "ewc_loss": 0.006520455703139305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.520455644931644e-05, + "grad_norm": 3.4572219848632812, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8538670539855957, + "num_tokens": 146500160.0, + "step": 3839 + }, + { + "epoch": 
0.48848746978755886, + "ewc_loss": 0.006514087785035372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.514087726827711e-05, + "grad_norm": 3.349119186401367, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8564454317092896, + "num_tokens": 146542963.0, + "step": 3840 + }, + { + "epoch": 0.48861468006614933, + "ewc_loss": 0.006457656621932983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.457656854763627e-05, + "grad_norm": 3.3939764499664307, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8675776720046997, + "num_tokens": 146580649.0, + "step": 3841 + }, + { + "epoch": 0.48874189034473986, + "ewc_loss": 0.006506835110485554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.506835052277893e-05, + "grad_norm": 3.3947033882141113, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8673577904701233, + "num_tokens": 146620823.0, + "step": 3842 + }, + { + "epoch": 0.4888691006233304, + "ewc_loss": 0.0064789121970534325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.478912109741941e-05, + "grad_norm": 3.358147382736206, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.87138831615448, + "num_tokens": 146663700.0, + "step": 3843 + }, + { + "epoch": 0.48899631090192086, + "ewc_loss": 0.006448867730796337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.448867497965693e-05, + "grad_norm": 3.417712926864624, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8561890125274658, + "num_tokens": 146698211.0, + "step": 3844 + }, + { + "epoch": 0.4891235211805114, + "ewc_loss": 0.006498410832136869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.498410948552191e-05, + "grad_norm": 3.4006786346435547, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8604193329811096, + "num_tokens": 146738380.0, + "step": 3845 + }, + { + "epoch": 0.4892507314591019, + "ewc_loss": 0.006442204117774963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.442204175982624e-05, + "grad_norm": 3.418877363204956, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8641453981399536, + "num_tokens": 146775413.0, + "step": 3846 + }, + { + "epoch": 0.4893779417376924, + "ewc_loss": 0.00646464666351676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.464646867243573e-05, + "grad_norm": 3.3953053951263428, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8635953664779663, + "num_tokens": 146814774.0, + "step": 3847 + }, + { + "epoch": 0.4895051520162829, + "ewc_loss": 0.006447001360356808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.447001214837655e-05, + "grad_norm": 3.3877713680267334, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8717129230499268, + "num_tokens": 146853957.0, + "step": 3848 + }, + { + "epoch": 0.48963236229487345, + "ewc_loss": 0.006436489522457123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.436489638872445e-05, + "grad_norm": 3.4390761852264404, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8539209961891174, + "num_tokens": 146889060.0, + "step": 3849 + }, + { + "epoch": 0.4897595725734639, + "ewc_loss": 0.006477411836385727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.477411807281896e-05, + "grad_norm": 3.3639988899230957, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8709856271743774, + "num_tokens": 146929849.0, + "step": 3850 + }, + { + "epoch": 0.48988678285205445, + "ewc_loss": 0.0064211091957986355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.421108992071822e-05, + "grad_norm": 3.407311201095581, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8621506690979004, + "num_tokens": 146968376.0, + "step": 3851 + }, + { + "epoch": 0.490013993130645, + "ewc_loss": 0.006470054388046265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.470054358942434e-05, + "grad_norm": 3.493288278579712, + "learning_rate": 1e-06, + "loss": 0.473, + 
"mean_token_accuracy": 0.8463449478149414, + "num_tokens": 147002871.0, + "step": 3852 + }, + { + "epoch": 0.49014120340923545, + "ewc_loss": 0.006498631555587053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.498631410067901e-05, + "grad_norm": 3.4016573429107666, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8569083213806152, + "num_tokens": 147039412.0, + "step": 3853 + }, + { + "epoch": 0.490268413687826, + "ewc_loss": 0.006416775286197662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.416775431716815e-05, + "grad_norm": 3.3373358249664307, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.86541748046875, + "num_tokens": 147080307.0, + "step": 3854 + }, + { + "epoch": 0.4903956239664165, + "ewc_loss": 0.006446592975407839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.4465930336155e-05, + "grad_norm": 3.364837884902954, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8695051074028015, + "num_tokens": 147120131.0, + "step": 3855 + }, + { + "epoch": 0.490522834245007, + "ewc_loss": 0.006459008902311325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.459008727688342e-05, + "grad_norm": 3.434183120727539, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8546173572540283, + "num_tokens": 147155760.0, + "step": 3856 + }, + { + "epoch": 0.4906500445235975, + "ewc_loss": 0.006495649926364422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.495649722637609e-05, + "grad_norm": 3.3725028038024902, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8672664165496826, + "num_tokens": 147195406.0, + "step": 3857 + }, + { + "epoch": 0.49077725480218803, + "ewc_loss": 0.006457215175032616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.457215204136446e-05, + "grad_norm": 3.3736889362335205, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.873222827911377, + "num_tokens": 147235688.0, + "step": 3858 + }, + { + "epoch": 
0.4909044650807785, + "ewc_loss": 0.006487308535724878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.487308564828709e-05, + "grad_norm": 3.425920248031616, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8692294359207153, + "num_tokens": 147271575.0, + "step": 3859 + }, + { + "epoch": 0.49103167535936904, + "ewc_loss": 0.006519051268696785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.519051385112107e-05, + "grad_norm": 3.3488929271698, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8582139015197754, + "num_tokens": 147315473.0, + "step": 3860 + }, + { + "epoch": 0.49115888563795956, + "ewc_loss": 0.006446864455938339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.446864426834509e-05, + "grad_norm": 3.3412234783172607, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8648494482040405, + "num_tokens": 147359058.0, + "step": 3861 + }, + { + "epoch": 0.49128609591655004, + "ewc_loss": 0.006478552706539631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.4785526774358e-05, + "grad_norm": 3.456282377243042, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8669610023498535, + "num_tokens": 147394443.0, + "step": 3862 + }, + { + "epoch": 0.49141330619514056, + "ewc_loss": 0.006528142839670181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.528142694151029e-05, + "grad_norm": 3.3584697246551514, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8564107418060303, + "num_tokens": 147436225.0, + "step": 3863 + }, + { + "epoch": 0.4915405164737311, + "ewc_loss": 0.006435583811253309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.435583782149479e-05, + "grad_norm": 3.38192081451416, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8719140291213989, + "num_tokens": 147473449.0, + "step": 3864 + }, + { + "epoch": 0.49166772675232157, + "ewc_loss": 0.006486978847533464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.486978963948786e-05, + "grad_norm": 3.4007980823516846, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8596014976501465, + "num_tokens": 147510330.0, + "step": 3865 + }, + { + "epoch": 0.4917949370309121, + "ewc_loss": 0.00647393986582756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.473939720308408e-05, + "grad_norm": 3.429967164993286, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8727781772613525, + "num_tokens": 147544683.0, + "step": 3866 + }, + { + "epoch": 0.4919221473095026, + "ewc_loss": 0.006489466410130262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.489466613857076e-05, + "grad_norm": 3.314530849456787, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8795562386512756, + "num_tokens": 147589330.0, + "step": 3867 + }, + { + "epoch": 0.4920493575880931, + "ewc_loss": 0.006403776817023754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.403776933439076e-05, + "grad_norm": 3.446120500564575, + "learning_rate": 1e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8311346173286438, + "num_tokens": 147626864.0, + "step": 3868 + }, + { + "epoch": 0.4921765678666836, + "ewc_loss": 0.006532920058816671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.532920087920502e-05, + "grad_norm": 3.369964838027954, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8437447547912598, + "num_tokens": 147672006.0, + "step": 3869 + }, + { + "epoch": 0.49230377814527415, + "ewc_loss": 0.006420972291380167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.420972204068676e-05, + "grad_norm": 3.3607356548309326, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8519585728645325, + "num_tokens": 147717388.0, + "step": 3870 + }, + { + "epoch": 0.4924309884238646, + "ewc_loss": 0.006444960366934538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.444960308726877e-05, + "grad_norm": 3.378095865249634, + "learning_rate": 1e-06, + "loss": 0.4453, + 
"mean_token_accuracy": 0.8514078259468079, + "num_tokens": 147759827.0, + "step": 3871 + }, + { + "epoch": 0.49255819870245515, + "ewc_loss": 0.006464456208050251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.464456237154081e-05, + "grad_norm": 3.4694015979766846, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8433337211608887, + "num_tokens": 147793742.0, + "step": 3872 + }, + { + "epoch": 0.4926854089810457, + "ewc_loss": 0.006509061437100172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.509061495307833e-05, + "grad_norm": 3.404986619949341, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8718970417976379, + "num_tokens": 147829881.0, + "step": 3873 + }, + { + "epoch": 0.49281261925963615, + "ewc_loss": 0.006446565501391888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.446565384976566e-05, + "grad_norm": 3.3878369331359863, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8468193411827087, + "num_tokens": 147875051.0, + "step": 3874 + }, + { + "epoch": 0.4929398295382267, + "ewc_loss": 0.006459198426455259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.459198630182073e-05, + "grad_norm": 3.4136035442352295, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8553006649017334, + "num_tokens": 147912786.0, + "step": 3875 + }, + { + "epoch": 0.4930670398168172, + "ewc_loss": 0.006487905979156494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.487905920948833e-05, + "grad_norm": 3.4148077964782715, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8413571119308472, + "num_tokens": 147954981.0, + "step": 3876 + }, + { + "epoch": 0.4931942500954077, + "ewc_loss": 0.006459010764956474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.459010910475627e-05, + "grad_norm": 3.4227209091186523, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8539202213287354, + "num_tokens": 147992179.0, + "step": 3877 + }, + { + "epoch": 
0.4933214603739982, + "ewc_loss": 0.006491506937891245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.491506792372093e-05, + "grad_norm": 3.4000587463378906, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8661526441574097, + "num_tokens": 148029866.0, + "step": 3878 + }, + { + "epoch": 0.49344867065258874, + "ewc_loss": 0.006465130019932985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.465129990829155e-05, + "grad_norm": 3.3751988410949707, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8608517050743103, + "num_tokens": 148070057.0, + "step": 3879 + }, + { + "epoch": 0.4935758809311792, + "ewc_loss": 0.006469331681728363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.469331856351346e-05, + "grad_norm": 3.4008305072784424, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8652729392051697, + "num_tokens": 148108461.0, + "step": 3880 + }, + { + "epoch": 0.49370309120976974, + "ewc_loss": 0.006500499323010445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.500499148387462e-05, + "grad_norm": 3.3806049823760986, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8547030687332153, + "num_tokens": 148147609.0, + "step": 3881 + }, + { + "epoch": 0.49383030148836027, + "ewc_loss": 0.006492462940514088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.492462853202596e-05, + "grad_norm": 3.3543827533721924, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8555737733840942, + "num_tokens": 148190475.0, + "step": 3882 + }, + { + "epoch": 0.4939575117669508, + "ewc_loss": 0.006491458974778652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.491458771051839e-05, + "grad_norm": 3.464096784591675, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8741618990898132, + "num_tokens": 148221187.0, + "step": 3883 + }, + { + "epoch": 0.49408472204554127, + "ewc_loss": 0.006556966342031956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.556966400239617e-05, + "grad_norm": 3.350463390350342, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8695619106292725, + "num_tokens": 148260916.0, + "step": 3884 + }, + { + "epoch": 0.4942119323241318, + "ewc_loss": 0.006463289726525545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.463289901148528e-05, + "grad_norm": 3.4224929809570312, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8665000200271606, + "num_tokens": 148295591.0, + "step": 3885 + }, + { + "epoch": 0.4943391426027223, + "ewc_loss": 0.006551350932568312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.55135081615299e-05, + "grad_norm": 3.3896749019622803, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8567948341369629, + "num_tokens": 148333445.0, + "step": 3886 + }, + { + "epoch": 0.4944663528813128, + "ewc_loss": 0.006494868081063032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.494868284789845e-05, + "grad_norm": 3.3971846103668213, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8695023059844971, + "num_tokens": 148367951.0, + "step": 3887 + }, + { + "epoch": 0.4945935631599033, + "ewc_loss": 0.006503821350634098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.503821350634098e-05, + "grad_norm": 3.38173508644104, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.852519154548645, + "num_tokens": 148408491.0, + "step": 3888 + }, + { + "epoch": 0.49472077343849385, + "ewc_loss": 0.006503437180072069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.503437180072069e-05, + "grad_norm": 3.4462313652038574, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8602944612503052, + "num_tokens": 148443883.0, + "step": 3889 + }, + { + "epoch": 0.4948479837170843, + "ewc_loss": 0.006539422087371349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.539421883644536e-05, + "grad_norm": 3.4396791458129883, + "learning_rate": 1e-06, + "loss": 0.4094, + 
"mean_token_accuracy": 0.8618932962417603, + "num_tokens": 148480511.0, + "step": 3890 + }, + { + "epoch": 0.49497519399567486, + "ewc_loss": 0.006501997821033001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.501997995655984e-05, + "grad_norm": 3.401306629180908, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8567670583724976, + "num_tokens": 148514759.0, + "step": 3891 + }, + { + "epoch": 0.4951024042742654, + "ewc_loss": 0.0065047722309827805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.504772318294272e-05, + "grad_norm": 3.371554136276245, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8626816272735596, + "num_tokens": 148553209.0, + "step": 3892 + }, + { + "epoch": 0.49522961455285586, + "ewc_loss": 0.006493939086794853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.493939145002514e-05, + "grad_norm": 3.4004251956939697, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8524288535118103, + "num_tokens": 148590387.0, + "step": 3893 + }, + { + "epoch": 0.4953568248314464, + "ewc_loss": 0.006539664696902037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.53966490062885e-05, + "grad_norm": 3.4206702709198, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.859074056148529, + "num_tokens": 148625201.0, + "step": 3894 + }, + { + "epoch": 0.4954840351100369, + "ewc_loss": 0.006528123747557402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.528123776661232e-05, + "grad_norm": 3.511549234390259, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8641088008880615, + "num_tokens": 148665989.0, + "step": 3895 + }, + { + "epoch": 0.4956112453886274, + "ewc_loss": 0.006602576468139887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.602576468139887e-05, + "grad_norm": 3.396449327468872, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8642011880874634, + "num_tokens": 148704839.0, + "step": 3896 + }, + { + "epoch": 
0.4957384556672179, + "ewc_loss": 0.006514835637062788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.514835695270449e-05, + "grad_norm": 3.3884642124176025, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8700456619262695, + "num_tokens": 148740523.0, + "step": 3897 + }, + { + "epoch": 0.49586566594580844, + "ewc_loss": 0.006559476722031832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.55947660561651e-05, + "grad_norm": 3.3733744621276855, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8767595887184143, + "num_tokens": 148779103.0, + "step": 3898 + }, + { + "epoch": 0.4959928762243989, + "ewc_loss": 0.006545696873217821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.545696669491008e-05, + "grad_norm": 3.3944830894470215, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.871719479560852, + "num_tokens": 148814659.0, + "step": 3899 + }, + { + "epoch": 0.49612008650298944, + "ewc_loss": 0.006565287243574858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.565287185367197e-05, + "grad_norm": 3.3564205169677734, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8813955187797546, + "num_tokens": 148856864.0, + "step": 3900 + }, + { + "epoch": 0.49624729678157997, + "ewc_loss": 0.0065431310795247555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.543131166836247e-05, + "grad_norm": 3.411428213119507, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8679196834564209, + "num_tokens": 148890309.0, + "step": 3901 + }, + { + "epoch": 0.49637450706017044, + "ewc_loss": 0.006580907851457596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.580907938769087e-05, + "grad_norm": 3.436258316040039, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8473348021507263, + "num_tokens": 148926662.0, + "step": 3902 + }, + { + "epoch": 0.496501717338761, + "ewc_loss": 0.00656850403174758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.568503886228427e-05, + "grad_norm": 3.4365921020507812, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8488726615905762, + "num_tokens": 148963032.0, + "step": 3903 + }, + { + "epoch": 0.4966289276173515, + "ewc_loss": 0.00657933484762907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.579334876732901e-05, + "grad_norm": 3.4703989028930664, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8671141862869263, + "num_tokens": 148996684.0, + "step": 3904 + }, + { + "epoch": 0.496756137895942, + "ewc_loss": 0.006585860159248114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.5858599555213e-05, + "grad_norm": 3.3882369995117188, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8609421849250793, + "num_tokens": 149032735.0, + "step": 3905 + }, + { + "epoch": 0.4968833481745325, + "ewc_loss": 0.0065368725918233395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.536872388096526e-05, + "grad_norm": 3.4203789234161377, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8638390898704529, + "num_tokens": 149069415.0, + "step": 3906 + }, + { + "epoch": 0.49701055845312303, + "ewc_loss": 0.006591484881937504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.591484998352826e-05, + "grad_norm": 3.3631300926208496, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8472135066986084, + "num_tokens": 149115042.0, + "step": 3907 + }, + { + "epoch": 0.4971377687317135, + "ewc_loss": 0.006555537227541208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.555537402164191e-05, + "grad_norm": 3.477088212966919, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8701728582382202, + "num_tokens": 149149767.0, + "step": 3908 + }, + { + "epoch": 0.49726497901030403, + "ewc_loss": 0.006631549447774887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.631549331359565e-05, + "grad_norm": 3.372525930404663, + "learning_rate": 1e-06, + "loss": 0.4422, + 
"mean_token_accuracy": 0.8540668487548828, + "num_tokens": 149193000.0, + "step": 3909 + }, + { + "epoch": 0.49739218928889456, + "ewc_loss": 0.006544685922563076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.544686038978398e-05, + "grad_norm": 3.433993101119995, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8582415580749512, + "num_tokens": 149226963.0, + "step": 3910 + }, + { + "epoch": 0.49751939956748503, + "ewc_loss": 0.006610839162021875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.610839045606554e-05, + "grad_norm": 3.3879904747009277, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8564857244491577, + "num_tokens": 149268139.0, + "step": 3911 + }, + { + "epoch": 0.49764660984607556, + "ewc_loss": 0.006576111540198326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.576111627509817e-05, + "grad_norm": 3.4350168704986572, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8694124817848206, + "num_tokens": 149305882.0, + "step": 3912 + }, + { + "epoch": 0.4977738201246661, + "ewc_loss": 0.0065986355766654015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.598635809496045e-05, + "grad_norm": 3.3810229301452637, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.858608067035675, + "num_tokens": 149344119.0, + "step": 3913 + }, + { + "epoch": 0.49790103040325656, + "ewc_loss": 0.00655895983800292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.558960012625903e-05, + "grad_norm": 3.4352760314941406, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8519009947776794, + "num_tokens": 149384102.0, + "step": 3914 + }, + { + "epoch": 0.4980282406818471, + "ewc_loss": 0.006610125303268433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.610125274164602e-05, + "grad_norm": 3.359790086746216, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8583929538726807, + "num_tokens": 149425677.0, + "step": 3915 + }, + { + "epoch": 
0.4981554509604376, + "ewc_loss": 0.006540660746395588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.540660979226232e-05, + "grad_norm": 3.422062397003174, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8573294878005981, + "num_tokens": 149463275.0, + "step": 3916 + }, + { + "epoch": 0.4982826612390281, + "ewc_loss": 0.006605133879929781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.605133967241272e-05, + "grad_norm": 3.3740429878234863, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8765884637832642, + "num_tokens": 149503950.0, + "step": 3917 + }, + { + "epoch": 0.4984098715176186, + "ewc_loss": 0.006555473431944847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.555473373737186e-05, + "grad_norm": 3.392507314682007, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8520303964614868, + "num_tokens": 149541192.0, + "step": 3918 + }, + { + "epoch": 0.49853708179620915, + "ewc_loss": 0.0065828002989292145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.582800415344536e-05, + "grad_norm": 3.417879104614258, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8577688336372375, + "num_tokens": 149581784.0, + "step": 3919 + }, + { + "epoch": 0.4986642920747996, + "ewc_loss": 0.006574220024049282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.57421987853013e-05, + "grad_norm": 3.3865299224853516, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8677574992179871, + "num_tokens": 149619312.0, + "step": 3920 + }, + { + "epoch": 0.49879150235339015, + "ewc_loss": 0.006558681838214397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.558682071045041e-05, + "grad_norm": 3.4049065113067627, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8584386110305786, + "num_tokens": 149657443.0, + "step": 3921 + }, + { + "epoch": 0.4989187126319807, + "ewc_loss": 0.006576592568308115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.576592568308115e-05, + "grad_norm": 3.3993632793426514, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8733682632446289, + "num_tokens": 149694992.0, + "step": 3922 + }, + { + "epoch": 0.49904592291057115, + "ewc_loss": 0.006550797261297703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.55079711577855e-05, + "grad_norm": 3.453608989715576, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.857388436794281, + "num_tokens": 149729554.0, + "step": 3923 + }, + { + "epoch": 0.4991731331891617, + "ewc_loss": 0.006594791077077389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.59479119349271e-05, + "grad_norm": 3.415466547012329, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8673223853111267, + "num_tokens": 149770051.0, + "step": 3924 + }, + { + "epoch": 0.4993003434677522, + "ewc_loss": 0.006547178607434034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.547178782057017e-05, + "grad_norm": 3.4548263549804688, + "learning_rate": 1e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8304739594459534, + "num_tokens": 149806724.0, + "step": 3925 + }, + { + "epoch": 0.4994275537463427, + "ewc_loss": 0.006562675349414349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.562675116583705e-05, + "grad_norm": 3.400096893310547, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8531792163848877, + "num_tokens": 149849874.0, + "step": 3926 + }, + { + "epoch": 0.4995547640249332, + "ewc_loss": 0.00652433093637228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.524330819956958e-05, + "grad_norm": 3.3807942867279053, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8602228164672852, + "num_tokens": 149888076.0, + "step": 3927 + }, + { + "epoch": 0.49968197430352373, + "ewc_loss": 0.006522986106574535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.522986222989857e-05, + "grad_norm": 3.392866373062134, + "learning_rate": 1e-06, + "loss": 0.4118, + 
"mean_token_accuracy": 0.8614543676376343, + "num_tokens": 149923750.0, + "step": 3928 + }, + { + "epoch": 0.4998091845821142, + "ewc_loss": 0.006549562327563763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.549562385771424e-05, + "grad_norm": 3.440838575363159, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8483873605728149, + "num_tokens": 149960657.0, + "step": 3929 + }, + { + "epoch": 0.49993639486070474, + "ewc_loss": 0.006556186359375715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.556186417583376e-05, + "grad_norm": 3.449491262435913, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.877120852470398, + "num_tokens": 149995983.0, + "step": 3930 + }, + { + "epoch": 0.5000636051392953, + "ewc_loss": 0.006557684391736984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.557684537256137e-05, + "grad_norm": 3.4031763076782227, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8395357131958008, + "num_tokens": 150037508.0, + "step": 3931 + }, + { + "epoch": 0.5001908154178858, + "ewc_loss": 0.00652194581925869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.521945761051029e-05, + "grad_norm": 3.3756821155548096, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8565027713775635, + "num_tokens": 150077806.0, + "step": 3932 + }, + { + "epoch": 0.5003180256964763, + "ewc_loss": 0.006524828262627125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.524828495457768e-05, + "grad_norm": 3.418015241622925, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8586646914482117, + "num_tokens": 150117589.0, + "step": 3933 + }, + { + "epoch": 0.5004452359750667, + "ewc_loss": 0.006545843556523323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.545843643834814e-05, + "grad_norm": 3.3499395847320557, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8540688753128052, + "num_tokens": 150161579.0, + "step": 3934 + }, + { + "epoch": 
0.5005724462536573, + "ewc_loss": 0.006483844481408596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.483844481408596e-05, + "grad_norm": 3.3801021575927734, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8480523824691772, + "num_tokens": 150208018.0, + "step": 3935 + }, + { + "epoch": 0.5006996565322478, + "ewc_loss": 0.006534081883728504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.534082058351487e-05, + "grad_norm": 3.3910303115844727, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8614145517349243, + "num_tokens": 150250164.0, + "step": 3936 + }, + { + "epoch": 0.5008268668108383, + "ewc_loss": 0.006506915669888258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.50691581540741e-05, + "grad_norm": 3.357046604156494, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8796980977058411, + "num_tokens": 150289972.0, + "step": 3937 + }, + { + "epoch": 0.5009540770894289, + "ewc_loss": 0.006483996752649546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.483996548922732e-05, + "grad_norm": 3.3827216625213623, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8584950566291809, + "num_tokens": 150333656.0, + "step": 3938 + }, + { + "epoch": 0.5010812873680194, + "ewc_loss": 0.006494241766631603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.494241824839264e-05, + "grad_norm": 3.3772008419036865, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8737552762031555, + "num_tokens": 150374086.0, + "step": 3939 + }, + { + "epoch": 0.5012084976466098, + "ewc_loss": 0.006469571031630039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.469571235356852e-05, + "grad_norm": 3.455193519592285, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.86411052942276, + "num_tokens": 150407472.0, + "step": 3940 + }, + { + "epoch": 0.5013357079252003, + "ewc_loss": 0.00651036761701107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.510367529699579e-05, + "grad_norm": 3.363507032394409, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8666098117828369, + "num_tokens": 150444560.0, + "step": 3941 + }, + { + "epoch": 0.5014629182037909, + "ewc_loss": 0.006448180414736271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.448180647566915e-05, + "grad_norm": 3.4263648986816406, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.865314245223999, + "num_tokens": 150483621.0, + "step": 3942 + }, + { + "epoch": 0.5015901284823814, + "ewc_loss": 0.0065108370035886765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.510836828965694e-05, + "grad_norm": 3.4184460639953613, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.86052405834198, + "num_tokens": 150520414.0, + "step": 3943 + }, + { + "epoch": 0.5017173387609719, + "ewc_loss": 0.006484766956418753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.484767072834074e-05, + "grad_norm": 3.487513542175293, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.852726936340332, + "num_tokens": 150556589.0, + "step": 3944 + }, + { + "epoch": 0.5018445490395624, + "ewc_loss": 0.00651301397010684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.513013795483857e-05, + "grad_norm": 3.3927953243255615, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8598899245262146, + "num_tokens": 150592393.0, + "step": 3945 + }, + { + "epoch": 0.5019717593181529, + "ewc_loss": 0.006453301291912794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.453301466535777e-05, + "grad_norm": 3.4361467361450195, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8483326435089111, + "num_tokens": 150630457.0, + "step": 3946 + }, + { + "epoch": 0.5020989695967434, + "ewc_loss": 0.006503867916762829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.503867916762829e-05, + "grad_norm": 3.474576234817505, + "learning_rate": 1e-06, + "loss": 0.3777, + 
"mean_token_accuracy": 0.8691616058349609, + "num_tokens": 150659482.0, + "step": 3947 + }, + { + "epoch": 0.5022261798753339, + "ewc_loss": 0.006532945670187473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.532945553772151e-05, + "grad_norm": 3.400912284851074, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8606199622154236, + "num_tokens": 150700920.0, + "step": 3948 + }, + { + "epoch": 0.5023533901539244, + "ewc_loss": 0.006476884242147207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.476884300354868e-05, + "grad_norm": 3.4321835041046143, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8708838820457458, + "num_tokens": 150736393.0, + "step": 3949 + }, + { + "epoch": 0.502480600432515, + "ewc_loss": 0.006541624199599028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.54162431601435e-05, + "grad_norm": 3.3689775466918945, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.858702540397644, + "num_tokens": 150783848.0, + "step": 3950 + }, + { + "epoch": 0.5026078107111055, + "ewc_loss": 0.006474413443356752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.474413385149091e-05, + "grad_norm": 3.387347459793091, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8578600287437439, + "num_tokens": 150824179.0, + "step": 3951 + }, + { + "epoch": 0.5027350209896959, + "ewc_loss": 0.006512777879834175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.512778054457158e-05, + "grad_norm": 3.416501045227051, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8615509271621704, + "num_tokens": 150862775.0, + "step": 3952 + }, + { + "epoch": 0.5028622312682864, + "ewc_loss": 0.00653793616220355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.53793613309972e-05, + "grad_norm": 3.4116697311401367, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8613721132278442, + "num_tokens": 150900005.0, + "step": 3953 + }, + { + "epoch": 
0.502989441546877, + "ewc_loss": 0.006524847354739904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.524847412947565e-05, + "grad_norm": 3.439497947692871, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8568933010101318, + "num_tokens": 150936148.0, + "step": 3954 + }, + { + "epoch": 0.5031166518254675, + "ewc_loss": 0.006534808315336704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.534808198921382e-05, + "grad_norm": 3.4201488494873047, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8580888509750366, + "num_tokens": 150971661.0, + "step": 3955 + }, + { + "epoch": 0.503243862104058, + "ewc_loss": 0.006524963304400444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.524963100673631e-05, + "grad_norm": 3.3702242374420166, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.871013879776001, + "num_tokens": 151009521.0, + "step": 3956 + }, + { + "epoch": 0.5033710723826486, + "ewc_loss": 0.006506460253149271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.506460340460762e-05, + "grad_norm": 3.3897933959960938, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.858322262763977, + "num_tokens": 151053641.0, + "step": 3957 + }, + { + "epoch": 0.5034982826612391, + "ewc_loss": 0.006523438263684511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.523438059957698e-05, + "grad_norm": 3.3525030612945557, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8650168776512146, + "num_tokens": 151094476.0, + "step": 3958 + }, + { + "epoch": 0.5036254929398295, + "ewc_loss": 0.006504612043499947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.50461224722676e-05, + "grad_norm": 3.49932599067688, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8559750318527222, + "num_tokens": 151126381.0, + "step": 3959 + }, + { + "epoch": 0.50375270321842, + "ewc_loss": 0.006603136193007231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.603135989280418e-05, 
+ "grad_norm": 3.4730145931243896, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8655251264572144, + "num_tokens": 151156875.0, + "step": 3960 + }, + { + "epoch": 0.5038799134970106, + "ewc_loss": 0.0065329899080097675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.532989937113598e-05, + "grad_norm": 3.3789758682250977, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8573806285858154, + "num_tokens": 151194394.0, + "step": 3961 + }, + { + "epoch": 0.5040071237756011, + "ewc_loss": 0.006510041654109955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.510041566798463e-05, + "grad_norm": 3.428723096847534, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8472886085510254, + "num_tokens": 151234472.0, + "step": 3962 + }, + { + "epoch": 0.5041343340541916, + "ewc_loss": 0.006577495485544205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.577495514648035e-05, + "grad_norm": 3.466697931289673, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8564454317092896, + "num_tokens": 151270426.0, + "step": 3963 + }, + { + "epoch": 0.5042615443327821, + "ewc_loss": 0.006593137513846159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.593137368327007e-05, + "grad_norm": 3.462836265563965, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8587064146995544, + "num_tokens": 151305793.0, + "step": 3964 + }, + { + "epoch": 0.5043887546113726, + "ewc_loss": 0.006570842582732439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.570842379005626e-05, + "grad_norm": 3.398857355117798, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8487266302108765, + "num_tokens": 151345116.0, + "step": 3965 + }, + { + "epoch": 0.5045159648899631, + "ewc_loss": 0.006547058001160622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.547058001160622e-05, + "grad_norm": 3.3997294902801514, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 
0.8659864068031311, + "num_tokens": 151381901.0, + "step": 3966 + }, + { + "epoch": 0.5046431751685536, + "ewc_loss": 0.0065857283771038055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.585728260688484e-05, + "grad_norm": 3.370800495147705, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8623908758163452, + "num_tokens": 151426884.0, + "step": 3967 + }, + { + "epoch": 0.5047703854471441, + "ewc_loss": 0.0065662250854074955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.566225056303665e-05, + "grad_norm": 3.380858898162842, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8551879525184631, + "num_tokens": 151467596.0, + "step": 3968 + }, + { + "epoch": 0.5048975957257347, + "ewc_loss": 0.006582430098205805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.582430069101974e-05, + "grad_norm": 3.4063498973846436, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8813615441322327, + "num_tokens": 151502131.0, + "step": 3969 + }, + { + "epoch": 0.5050248060043252, + "ewc_loss": 0.006598379462957382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.598379695788026e-05, + "grad_norm": 3.4015161991119385, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8577399253845215, + "num_tokens": 151538932.0, + "step": 3970 + }, + { + "epoch": 0.5051520162829156, + "ewc_loss": 0.006576323416084051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.57632335787639e-05, + "grad_norm": 3.369964122772217, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.862122118473053, + "num_tokens": 151577839.0, + "step": 3971 + }, + { + "epoch": 0.5052792265615061, + "ewc_loss": 0.006576030980795622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.5760308643803e-05, + "grad_norm": 3.3628501892089844, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8611365556716919, + "num_tokens": 151618984.0, + "step": 3972 + }, + { + "epoch": 0.5054064368400967, + "ewc_loss": 
0.006577302701771259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.577302701771259e-05, + "grad_norm": 3.414884090423584, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8609167337417603, + "num_tokens": 151658392.0, + "step": 3973 + }, + { + "epoch": 0.5055336471186872, + "ewc_loss": 0.006595891900360584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.595892045879737e-05, + "grad_norm": 3.4712555408477783, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8704050183296204, + "num_tokens": 151689781.0, + "step": 3974 + }, + { + "epoch": 0.5056608573972777, + "ewc_loss": 0.006614550482481718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.614550511585549e-05, + "grad_norm": 3.413682222366333, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8464789986610413, + "num_tokens": 151729612.0, + "step": 3975 + }, + { + "epoch": 0.5057880676758683, + "ewc_loss": 0.006559757515788078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.559757457580417e-05, + "grad_norm": 3.4218051433563232, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8481463193893433, + "num_tokens": 151769270.0, + "step": 3976 + }, + { + "epoch": 0.5059152779544587, + "ewc_loss": 0.006584916263818741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.584916263818741e-05, + "grad_norm": 3.452185869216919, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8424496650695801, + "num_tokens": 151803936.0, + "step": 3977 + }, + { + "epoch": 0.5060424882330492, + "ewc_loss": 0.006597150582820177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.59715078654699e-05, + "grad_norm": 3.462059259414673, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8499906063079834, + "num_tokens": 151840429.0, + "step": 3978 + }, + { + "epoch": 0.5061696985116397, + "ewc_loss": 0.006616143975406885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.616143946303055e-05, + "grad_norm": 
3.3544394969940186, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8622127771377563, + "num_tokens": 151883296.0, + "step": 3979 + }, + { + "epoch": 0.5062969087902303, + "ewc_loss": 0.006542722228914499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.54272225801833e-05, + "grad_norm": 3.392861843109131, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8770356774330139, + "num_tokens": 151917658.0, + "step": 3980 + }, + { + "epoch": 0.5064241190688208, + "ewc_loss": 0.006612391676753759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.61239173496142e-05, + "grad_norm": 3.3931233882904053, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8608633875846863, + "num_tokens": 151958822.0, + "step": 3981 + }, + { + "epoch": 0.5065513293474113, + "ewc_loss": 0.006587745156139135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.587745156139135e-05, + "grad_norm": 3.4477145671844482, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8562411665916443, + "num_tokens": 151994964.0, + "step": 3982 + }, + { + "epoch": 0.5066785396260017, + "ewc_loss": 0.00662170210853219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.62170205032453e-05, + "grad_norm": 3.3591701984405518, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8560118079185486, + "num_tokens": 152037013.0, + "step": 3983 + }, + { + "epoch": 0.5068057499045923, + "ewc_loss": 0.0065556117333471775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.555611616931856e-05, + "grad_norm": 3.3936517238616943, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8576396703720093, + "num_tokens": 152074821.0, + "step": 3984 + }, + { + "epoch": 0.5069329601831828, + "ewc_loss": 0.006610572803765535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.610572745557874e-05, + "grad_norm": 3.3972690105438232, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8526301383972168, + 
"num_tokens": 152116667.0, + "step": 3985 + }, + { + "epoch": 0.5070601704617733, + "ewc_loss": 0.0065911938436329365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.591193960048258e-05, + "grad_norm": 3.4282100200653076, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8648836016654968, + "num_tokens": 152152714.0, + "step": 3986 + }, + { + "epoch": 0.5071873807403638, + "ewc_loss": 0.006601118482649326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.601118366234004e-05, + "grad_norm": 3.4132020473480225, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8472424149513245, + "num_tokens": 152190230.0, + "step": 3987 + }, + { + "epoch": 0.5073145910189544, + "ewc_loss": 0.006582936272025108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.582936475751922e-05, + "grad_norm": 3.3973047733306885, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8604733943939209, + "num_tokens": 152228887.0, + "step": 3988 + }, + { + "epoch": 0.5074418012975448, + "ewc_loss": 0.006583179347217083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.583179492736235e-05, + "grad_norm": 3.349802017211914, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8700518012046814, + "num_tokens": 152271863.0, + "step": 3989 + }, + { + "epoch": 0.5075690115761353, + "ewc_loss": 0.0065510086715221405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.551008846145123e-05, + "grad_norm": 3.3987643718719482, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8554681539535522, + "num_tokens": 152310765.0, + "step": 3990 + }, + { + "epoch": 0.5076962218547258, + "ewc_loss": 0.006582977250218391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.582977221114561e-05, + "grad_norm": 3.4708633422851562, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8838543891906738, + "num_tokens": 152342545.0, + "step": 3991 + }, + { + "epoch": 0.5078234321333164, + "ewc_loss": 
0.0066023473627865314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.60234727547504e-05, + "grad_norm": 3.407822370529175, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8438633680343628, + "num_tokens": 152386223.0, + "step": 3992 + }, + { + "epoch": 0.5079506424119069, + "ewc_loss": 0.006546161603182554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.546161603182554e-05, + "grad_norm": 3.4512548446655273, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8675529956817627, + "num_tokens": 152421811.0, + "step": 3993 + }, + { + "epoch": 0.5080778526904974, + "ewc_loss": 0.006593567319214344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.593567377422005e-05, + "grad_norm": 3.4429588317871094, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8498561978340149, + "num_tokens": 152454581.0, + "step": 3994 + }, + { + "epoch": 0.5082050629690879, + "ewc_loss": 0.006578915752470493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.578915781574324e-05, + "grad_norm": 3.43829345703125, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8502613306045532, + "num_tokens": 152490591.0, + "step": 3995 + }, + { + "epoch": 0.5083322732476784, + "ewc_loss": 0.006578498985618353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.578498869203031e-05, + "grad_norm": 3.3336353302001953, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8825877904891968, + "num_tokens": 152532290.0, + "step": 3996 + }, + { + "epoch": 0.5084594835262689, + "ewc_loss": 0.006518796551972628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.51879672659561e-05, + "grad_norm": 3.395474433898926, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8652607798576355, + "num_tokens": 152571984.0, + "step": 3997 + }, + { + "epoch": 0.5085866938048594, + "ewc_loss": 0.006602959241718054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.602959183510393e-05, + "grad_norm": 
3.42677903175354, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.861467719078064, + "num_tokens": 152611082.0, + "step": 3998 + }, + { + "epoch": 0.50871390408345, + "ewc_loss": 0.0065750377252697945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.575037696165964e-05, + "grad_norm": 3.3334360122680664, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8689011335372925, + "num_tokens": 152655303.0, + "step": 3999 + }, + { + "epoch": 0.5088411143620405, + "ewc_loss": 0.006522726267576218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.522726471303031e-05, + "grad_norm": 3.4754586219787598, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8606145977973938, + "num_tokens": 152688432.0, + "step": 4000 + }, + { + "epoch": 0.5089683246406309, + "ewc_loss": 0.006648653652518988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.648653652518988e-05, + "grad_norm": 3.4138832092285156, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8564008474349976, + "num_tokens": 152725637.0, + "step": 4001 + }, + { + "epoch": 0.5090955349192214, + "ewc_loss": 0.006536323111504316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.536323053296655e-05, + "grad_norm": 3.4375133514404297, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8576235771179199, + "num_tokens": 152763108.0, + "step": 4002 + }, + { + "epoch": 0.509222745197812, + "ewc_loss": 0.006583211477845907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.583211506949738e-05, + "grad_norm": 3.4088664054870605, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8673891425132751, + "num_tokens": 152800659.0, + "step": 4003 + }, + { + "epoch": 0.5093499554764025, + "ewc_loss": 0.006549422163516283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.54942195978947e-05, + "grad_norm": 3.370144844055176, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8510627746582031, + 
"num_tokens": 152842797.0, + "step": 4004 + }, + { + "epoch": 0.509477165754993, + "ewc_loss": 0.006544970441609621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.544970528921112e-05, + "grad_norm": 3.3646624088287354, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8520558476448059, + "num_tokens": 152888257.0, + "step": 4005 + }, + { + "epoch": 0.5096043760335836, + "ewc_loss": 0.006555515341460705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.555515574291348e-05, + "grad_norm": 3.402219772338867, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8588627576828003, + "num_tokens": 152929089.0, + "step": 4006 + }, + { + "epoch": 0.5097315863121741, + "ewc_loss": 0.006577251013368368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.577251042472199e-05, + "grad_norm": 3.3909151554107666, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8549211621284485, + "num_tokens": 152972710.0, + "step": 4007 + }, + { + "epoch": 0.5098587965907645, + "ewc_loss": 0.0065551334992051125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.555133586516604e-05, + "grad_norm": 3.4109957218170166, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8610776662826538, + "num_tokens": 153008867.0, + "step": 4008 + }, + { + "epoch": 0.509986006869355, + "ewc_loss": 0.006565793417394161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.565793592017144e-05, + "grad_norm": 3.3670575618743896, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8554868698120117, + "num_tokens": 153051497.0, + "step": 4009 + }, + { + "epoch": 0.5101132171479456, + "ewc_loss": 0.006530898157507181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.53089809929952e-05, + "grad_norm": 3.380337953567505, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.876610279083252, + "num_tokens": 153089295.0, + "step": 4010 + }, + { + "epoch": 0.5102404274265361, + "ewc_loss": 0.0065484317019581795, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.54843170195818e-05, + "grad_norm": 3.42388653755188, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8663367629051208, + "num_tokens": 153123957.0, + "step": 4011 + }, + { + "epoch": 0.5103676377051266, + "ewc_loss": 0.006576771382242441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.576771556865424e-05, + "grad_norm": 3.4065487384796143, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8585048913955688, + "num_tokens": 153165248.0, + "step": 4012 + }, + { + "epoch": 0.5104948479837171, + "ewc_loss": 0.0065483166836202145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.548316741827875e-05, + "grad_norm": 3.382694959640503, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8745720386505127, + "num_tokens": 153206163.0, + "step": 4013 + }, + { + "epoch": 0.5106220582623076, + "ewc_loss": 0.006535633467137814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.535633292514831e-05, + "grad_norm": 3.390991687774658, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8525373935699463, + "num_tokens": 153248666.0, + "step": 4014 + }, + { + "epoch": 0.5107492685408981, + "ewc_loss": 0.006543290335685015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.543290510307997e-05, + "grad_norm": 3.3549468517303467, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8826565742492676, + "num_tokens": 153289009.0, + "step": 4015 + }, + { + "epoch": 0.5108764788194886, + "ewc_loss": 0.006516012828797102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.516012945212424e-05, + "grad_norm": 3.4422667026519775, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8653807640075684, + "num_tokens": 153321354.0, + "step": 4016 + }, + { + "epoch": 0.5110036890980791, + "ewc_loss": 0.006578040309250355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.578040483873338e-05, + "grad_norm": 3.47239351272583, + "learning_rate": 
1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8603354096412659, + "num_tokens": 153361061.0, + "step": 4017 + }, + { + "epoch": 0.5111308993766697, + "ewc_loss": 0.0065612876787781715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.56128759146668e-05, + "grad_norm": 3.4149630069732666, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8538100719451904, + "num_tokens": 153397002.0, + "step": 4018 + }, + { + "epoch": 0.5112581096552602, + "ewc_loss": 0.006502662319689989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.502662290586159e-05, + "grad_norm": 3.412299871444702, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8485647439956665, + "num_tokens": 153435264.0, + "step": 4019 + }, + { + "epoch": 0.5113853199338506, + "ewc_loss": 0.006543169729411602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.543169729411602e-05, + "grad_norm": 3.383476972579956, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8591967821121216, + "num_tokens": 153474751.0, + "step": 4020 + }, + { + "epoch": 0.5115125302124411, + "ewc_loss": 0.0065173679031431675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.517367728520185e-05, + "grad_norm": 3.402377128601074, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8697351813316345, + "num_tokens": 153510084.0, + "step": 4021 + }, + { + "epoch": 0.5116397404910317, + "ewc_loss": 0.006542833521962166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.542833580169827e-05, + "grad_norm": 3.394670009613037, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8476747274398804, + "num_tokens": 153550387.0, + "step": 4022 + }, + { + "epoch": 0.5117669507696222, + "ewc_loss": 0.006551271304488182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.551271508214995e-05, + "grad_norm": 3.3138587474823, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8821795582771301, + "num_tokens": 153592441.0, + "step": 4023 + }, + { 
+ "epoch": 0.5118941610482127, + "ewc_loss": 0.006501801311969757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.501801544800401e-05, + "grad_norm": 3.3999545574188232, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8675066828727722, + "num_tokens": 153629202.0, + "step": 4024 + }, + { + "epoch": 0.5120213713268033, + "ewc_loss": 0.0065894839353859425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.589484110008925e-05, + "grad_norm": 3.417196750640869, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8557182550430298, + "num_tokens": 153669243.0, + "step": 4025 + }, + { + "epoch": 0.5121485816053937, + "ewc_loss": 0.0065602269023656845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.560226756846532e-05, + "grad_norm": 3.4163589477539062, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.85662442445755, + "num_tokens": 153709206.0, + "step": 4026 + }, + { + "epoch": 0.5122757918839842, + "ewc_loss": 0.006560186855494976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.560186739079654e-05, + "grad_norm": 3.4620003700256348, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8692086935043335, + "num_tokens": 153741251.0, + "step": 4027 + }, + { + "epoch": 0.5124030021625747, + "ewc_loss": 0.006598704028874636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.598704203497618e-05, + "grad_norm": 3.44571852684021, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8468037843704224, + "num_tokens": 153775559.0, + "step": 4028 + }, + { + "epoch": 0.5125302124411653, + "ewc_loss": 0.006571794860064983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.571794801857322e-05, + "grad_norm": 3.346696376800537, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8655598163604736, + "num_tokens": 153819158.0, + "step": 4029 + }, + { + "epoch": 0.5126574227197558, + "ewc_loss": 0.006541037932038307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.541037873830646e-05, + "grad_norm": 3.38468074798584, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8551433086395264, + "num_tokens": 153856231.0, + "step": 4030 + }, + { + "epoch": 0.5127846329983463, + "ewc_loss": 0.006584459450095892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.58445933368057e-05, + "grad_norm": 3.3409407138824463, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8713620901107788, + "num_tokens": 153897125.0, + "step": 4031 + }, + { + "epoch": 0.5129118432769367, + "ewc_loss": 0.00654454855248332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.54454852337949e-05, + "grad_norm": 3.4127583503723145, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8688554763793945, + "num_tokens": 153933552.0, + "step": 4032 + }, + { + "epoch": 0.5130390535555273, + "ewc_loss": 0.006599445827305317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.599445623578504e-05, + "grad_norm": 3.42266845703125, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8456458449363708, + "num_tokens": 153974337.0, + "step": 4033 + }, + { + "epoch": 0.5131662638341178, + "ewc_loss": 0.006594855338335037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.594855221919715e-05, + "grad_norm": 3.387481927871704, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8556303381919861, + "num_tokens": 154018911.0, + "step": 4034 + }, + { + "epoch": 0.5132934741127083, + "ewc_loss": 0.006566291209310293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.566291267517954e-05, + "grad_norm": 3.367184638977051, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8715848922729492, + "num_tokens": 154056441.0, + "step": 4035 + }, + { + "epoch": 0.5134206843912988, + "ewc_loss": 0.006574735045433044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.574735016329214e-05, + "grad_norm": 3.432164192199707, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 
0.8516213297843933, + "num_tokens": 154096326.0, + "step": 4036 + }, + { + "epoch": 0.5135478946698894, + "ewc_loss": 0.006615308113396168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.615307938773185e-05, + "grad_norm": 3.3734867572784424, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8649311065673828, + "num_tokens": 154137223.0, + "step": 4037 + }, + { + "epoch": 0.5136751049484798, + "ewc_loss": 0.006555219646543264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.555219442816451e-05, + "grad_norm": 3.549363851547241, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8581315279006958, + "num_tokens": 154168312.0, + "step": 4038 + }, + { + "epoch": 0.5138023152270703, + "ewc_loss": 0.006683178246021271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.683178071398288e-05, + "grad_norm": 3.39947772026062, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.873681366443634, + "num_tokens": 154203125.0, + "step": 4039 + }, + { + "epoch": 0.5139295255056608, + "ewc_loss": 0.006538905203342438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.538905290653929e-05, + "grad_norm": 3.3533148765563965, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8549810647964478, + "num_tokens": 154248437.0, + "step": 4040 + }, + { + "epoch": 0.5140567357842514, + "ewc_loss": 0.00655911723151803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.559117173310369e-05, + "grad_norm": 3.40102219581604, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8650363683700562, + "num_tokens": 154286303.0, + "step": 4041 + }, + { + "epoch": 0.5141839460628419, + "ewc_loss": 0.006601545959711075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.601546192541718e-05, + "grad_norm": 3.3879053592681885, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8521613478660583, + "num_tokens": 154328638.0, + "step": 4042 + }, + { + "epoch": 0.5143111563414324, + "ewc_loss": 
0.006581432651728392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.58143253531307e-05, + "grad_norm": 3.4388043880462646, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8601042032241821, + "num_tokens": 154362953.0, + "step": 4043 + }, + { + "epoch": 0.5144383666200228, + "ewc_loss": 0.006609829142689705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.609829142689705e-05, + "grad_norm": 3.4352304935455322, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.849521815776825, + "num_tokens": 154401289.0, + "step": 4044 + }, + { + "epoch": 0.5145655768986134, + "ewc_loss": 0.006593312136828899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.593311991309747e-05, + "grad_norm": 3.4140753746032715, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8649132251739502, + "num_tokens": 154438779.0, + "step": 4045 + }, + { + "epoch": 0.5146927871772039, + "ewc_loss": 0.006582892034202814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.582892092410475e-05, + "grad_norm": 3.517441987991333, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8552217483520508, + "num_tokens": 154467032.0, + "step": 4046 + }, + { + "epoch": 0.5148199974557944, + "ewc_loss": 0.006661465857177973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.661465886281803e-05, + "grad_norm": 3.400890350341797, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8700892925262451, + "num_tokens": 154505222.0, + "step": 4047 + }, + { + "epoch": 0.514947207734385, + "ewc_loss": 0.006577720865607262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.577721069334075e-05, + "grad_norm": 3.442934036254883, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8483253717422485, + "num_tokens": 154544953.0, + "step": 4048 + }, + { + "epoch": 0.5150744180129755, + "ewc_loss": 0.006625770125538111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.62577003822662e-05, + "grad_norm": 
3.4071109294891357, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8723639249801636, + "num_tokens": 154586193.0, + "step": 4049 + }, + { + "epoch": 0.5152016282915659, + "ewc_loss": 0.006604986265301704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.604986265301704e-05, + "grad_norm": 3.4140586853027344, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8640375137329102, + "num_tokens": 154629531.0, + "step": 4050 + }, + { + "epoch": 0.5153288385701564, + "ewc_loss": 0.006614530924707651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.61453086649999e-05, + "grad_norm": 3.4329326152801514, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8769519925117493, + "num_tokens": 154659482.0, + "step": 4051 + }, + { + "epoch": 0.515456048848747, + "ewc_loss": 0.006626133807003498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.626133836107329e-05, + "grad_norm": 3.409719467163086, + "learning_rate": 1e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8272830247879028, + "num_tokens": 154702645.0, + "step": 4052 + }, + { + "epoch": 0.5155832591273375, + "ewc_loss": 0.006612502504140139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.612502329517156e-05, + "grad_norm": 3.3638269901275635, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8690359592437744, + "num_tokens": 154743329.0, + "step": 4053 + }, + { + "epoch": 0.515710469405928, + "ewc_loss": 0.00659739226102829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.597392348339781e-05, + "grad_norm": 3.38154935836792, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8586518168449402, + "num_tokens": 154783460.0, + "step": 4054 + }, + { + "epoch": 0.5158376796845185, + "ewc_loss": 0.0066290260292589664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.629026029258966e-05, + "grad_norm": 3.3747575283050537, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8542704582214355, + 
"num_tokens": 154827209.0, + "step": 4055 + }, + { + "epoch": 0.5159648899631091, + "ewc_loss": 0.006615761201828718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.615761230932549e-05, + "grad_norm": 3.4420719146728516, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.836144208908081, + "num_tokens": 154866577.0, + "step": 4056 + }, + { + "epoch": 0.5160921002416995, + "ewc_loss": 0.006652680225670338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.652680167462677e-05, + "grad_norm": 3.338716983795166, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8634032011032104, + "num_tokens": 154912191.0, + "step": 4057 + }, + { + "epoch": 0.51621931052029, + "ewc_loss": 0.006561765447258949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.561765621881932e-05, + "grad_norm": 3.435718297958374, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.848362386226654, + "num_tokens": 154953412.0, + "step": 4058 + }, + { + "epoch": 0.5163465207988805, + "ewc_loss": 0.006659870967268944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.659870996372774e-05, + "grad_norm": 3.366655111312866, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8715183138847351, + "num_tokens": 154989913.0, + "step": 4059 + }, + { + "epoch": 0.5164737310774711, + "ewc_loss": 0.006577733438462019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.577733438462019e-05, + "grad_norm": 3.3677523136138916, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.84658282995224, + "num_tokens": 155038094.0, + "step": 4060 + }, + { + "epoch": 0.5166009413560616, + "ewc_loss": 0.006586081814020872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.586081872228533e-05, + "grad_norm": 3.4269187450408936, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8513832688331604, + "num_tokens": 155075900.0, + "step": 4061 + }, + { + "epoch": 0.5167281516346521, + "ewc_loss": 0.0066230762749910355, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.623076478717849e-05, + "grad_norm": 3.3574411869049072, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8600554466247559, + "num_tokens": 155117038.0, + "step": 4062 + }, + { + "epoch": 0.5168553619132426, + "ewc_loss": 0.006554646883159876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.554646824952215e-05, + "grad_norm": 3.4531712532043457, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8755955696105957, + "num_tokens": 155151442.0, + "step": 4063 + }, + { + "epoch": 0.5169825721918331, + "ewc_loss": 0.006632962264120579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.63296232232824e-05, + "grad_norm": 3.4025518894195557, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8679126501083374, + "num_tokens": 155192288.0, + "step": 4064 + }, + { + "epoch": 0.5171097824704236, + "ewc_loss": 0.0065666004084050655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.566600495716557e-05, + "grad_norm": 3.3409104347229004, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8631998896598816, + "num_tokens": 155238931.0, + "step": 4065 + }, + { + "epoch": 0.5172369927490141, + "ewc_loss": 0.006529894657433033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.529894744744524e-05, + "grad_norm": 3.3964791297912598, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8738508820533752, + "num_tokens": 155276644.0, + "step": 4066 + }, + { + "epoch": 0.5173642030276047, + "ewc_loss": 0.0065901316702365875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.590131670236588e-05, + "grad_norm": 3.4896368980407715, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8735475540161133, + "num_tokens": 155307638.0, + "step": 4067 + }, + { + "epoch": 0.5174914133061952, + "ewc_loss": 0.0066032400354743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.6032400354743e-05, + "grad_norm": 3.4185636043548584, + 
"learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8551580309867859, + "num_tokens": 155343255.0, + "step": 4068 + }, + { + "epoch": 0.5176186235847856, + "ewc_loss": 0.006538409274071455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.538409070344642e-05, + "grad_norm": 3.411717176437378, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8619647026062012, + "num_tokens": 155382283.0, + "step": 4069 + }, + { + "epoch": 0.5177458338633761, + "ewc_loss": 0.006571096833795309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.571097037522122e-05, + "grad_norm": 3.4132003784179688, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8396172523498535, + "num_tokens": 155427096.0, + "step": 4070 + }, + { + "epoch": 0.5178730441419667, + "ewc_loss": 0.006564106326550245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.564106297446415e-05, + "grad_norm": 3.4197998046875, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8486694097518921, + "num_tokens": 155467472.0, + "step": 4071 + }, + { + "epoch": 0.5180002544205572, + "ewc_loss": 0.006558798253536224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.558798486366868e-05, + "grad_norm": 3.3981103897094727, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.856141209602356, + "num_tokens": 155508350.0, + "step": 4072 + }, + { + "epoch": 0.5181274646991477, + "ewc_loss": 0.00654905941337347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.549059617100284e-05, + "grad_norm": 3.3596935272216797, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8838099241256714, + "num_tokens": 155547215.0, + "step": 4073 + }, + { + "epoch": 0.5182546749777382, + "ewc_loss": 0.006524019408971071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.524019408971071e-05, + "grad_norm": 3.4074649810791016, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8648353219032288, + "num_tokens": 155586658.0, + 
"step": 4074 + }, + { + "epoch": 0.5183818852563287, + "ewc_loss": 0.006567669101059437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.56766933389008e-05, + "grad_norm": 3.4660894870758057, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8630676865577698, + "num_tokens": 155620944.0, + "step": 4075 + }, + { + "epoch": 0.5185090955349192, + "ewc_loss": 0.006573062390089035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.573062273673713e-05, + "grad_norm": 3.463752269744873, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8591595888137817, + "num_tokens": 155662699.0, + "step": 4076 + }, + { + "epoch": 0.5186363058135097, + "ewc_loss": 0.006546932738274336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.546932854689658e-05, + "grad_norm": 3.3910739421844482, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8479546308517456, + "num_tokens": 155707783.0, + "step": 4077 + }, + { + "epoch": 0.5187635160921003, + "ewc_loss": 0.0065187718719244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.518771988339722e-05, + "grad_norm": 3.422675609588623, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8663869500160217, + "num_tokens": 155745227.0, + "step": 4078 + }, + { + "epoch": 0.5188907263706908, + "ewc_loss": 0.006552763748914003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.552763807121664e-05, + "grad_norm": 3.383000135421753, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8629201650619507, + "num_tokens": 155786466.0, + "step": 4079 + }, + { + "epoch": 0.5190179366492813, + "ewc_loss": 0.0065168351866304874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.516835128422827e-05, + "grad_norm": 3.422255516052246, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.863207221031189, + "num_tokens": 155833145.0, + "step": 4080 + }, + { + "epoch": 0.5191451469278717, + "ewc_loss": 0.006552584003657103, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 6.552584090968594e-05, + "grad_norm": 3.485546827316284, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.852220892906189, + "num_tokens": 155869533.0, + "step": 4081 + }, + { + "epoch": 0.5192723572064623, + "ewc_loss": 0.006570012774318457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.570012919837609e-05, + "grad_norm": 3.4190711975097656, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8589141964912415, + "num_tokens": 155909329.0, + "step": 4082 + }, + { + "epoch": 0.5193995674850528, + "ewc_loss": 0.006501944735646248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.5019448811654e-05, + "grad_norm": 3.4039952754974365, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8696557283401489, + "num_tokens": 155947848.0, + "step": 4083 + }, + { + "epoch": 0.5195267777636433, + "ewc_loss": 0.00652818288654089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.528182711917907e-05, + "grad_norm": 3.425476551055908, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8668244481086731, + "num_tokens": 155982693.0, + "step": 4084 + }, + { + "epoch": 0.5196539880422338, + "ewc_loss": 0.00654011033475399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.540110189234838e-05, + "grad_norm": 3.4156463146209717, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.860031247138977, + "num_tokens": 156023414.0, + "step": 4085 + }, + { + "epoch": 0.5197811983208244, + "ewc_loss": 0.0065208543092012405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.520854367408901e-05, + "grad_norm": 3.3727383613586426, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8604284524917603, + "num_tokens": 156063488.0, + "step": 4086 + }, + { + "epoch": 0.5199084085994148, + "ewc_loss": 0.006515171844512224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.515171844512224e-05, + "grad_norm": 3.4218969345092773, + "learning_rate": 1e-06, + "loss": 0.4108, 
+ "mean_token_accuracy": 0.8609882593154907, + "num_tokens": 156103081.0, + "step": 4087 + }, + { + "epoch": 0.5200356188780053, + "ewc_loss": 0.0065421112813055515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.542111077578738e-05, + "grad_norm": 3.4795520305633545, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8525468111038208, + "num_tokens": 156139453.0, + "step": 4088 + }, + { + "epoch": 0.5201628291565958, + "ewc_loss": 0.0065803187899291515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.580318586202338e-05, + "grad_norm": 3.421699047088623, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8558580279350281, + "num_tokens": 156176564.0, + "step": 4089 + }, + { + "epoch": 0.5202900394351864, + "ewc_loss": 0.0065187932923436165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.518793088616803e-05, + "grad_norm": 3.3356010913848877, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8639084100723267, + "num_tokens": 156221278.0, + "step": 4090 + }, + { + "epoch": 0.5204172497137769, + "ewc_loss": 0.0065002767369151115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.500276504084468e-05, + "grad_norm": 3.4736902713775635, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8647669553756714, + "num_tokens": 156259124.0, + "step": 4091 + }, + { + "epoch": 0.5205444599923674, + "ewc_loss": 0.006602959707379341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.602959911106154e-05, + "grad_norm": 3.426839590072632, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8507641553878784, + "num_tokens": 156297832.0, + "step": 4092 + }, + { + "epoch": 0.5206716702709578, + "ewc_loss": 0.006516631226986647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.51663140160963e-05, + "grad_norm": 3.488358974456787, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8660472631454468, + "num_tokens": 156329745.0, + "step": 4093 + }, + { + "epoch": 
0.5207988805495484, + "ewc_loss": 0.006586807779967785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.586808012798429e-05, + "grad_norm": 3.509394884109497, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8612807989120483, + "num_tokens": 156360154.0, + "step": 4094 + }, + { + "epoch": 0.5209260908281389, + "ewc_loss": 0.00656876340508461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.568763637915254e-05, + "grad_norm": 3.379154920578003, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8645815849304199, + "num_tokens": 156399360.0, + "step": 4095 + }, + { + "epoch": 0.5210533011067294, + "ewc_loss": 0.006504065822809935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.504065822809935e-05, + "grad_norm": 3.3811631202697754, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8608139753341675, + "num_tokens": 156438782.0, + "step": 4096 + }, + { + "epoch": 0.52118051138532, + "ewc_loss": 0.006556310225278139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.556310108862817e-05, + "grad_norm": 3.475135326385498, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8692805767059326, + "num_tokens": 156470261.0, + "step": 4097 + }, + { + "epoch": 0.5213077216639105, + "ewc_loss": 0.006604304537177086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.604304508073255e-05, + "grad_norm": 3.447415351867676, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8435091376304626, + "num_tokens": 156510493.0, + "step": 4098 + }, + { + "epoch": 0.5214349319425009, + "ewc_loss": 0.006562084425240755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.562084308825433e-05, + "grad_norm": 3.4769949913024902, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8637687563896179, + "num_tokens": 156545040.0, + "step": 4099 + }, + { + "epoch": 0.5215621422210914, + "ewc_loss": 0.0065983557142317295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.598355685127899e-05, + "grad_norm": 3.410853862762451, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8589150309562683, + "num_tokens": 156583474.0, + "step": 4100 + }, + { + "epoch": 0.521689352499682, + "ewc_loss": 0.006563131231814623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.563131319126114e-05, + "grad_norm": 3.347576379776001, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8613157272338867, + "num_tokens": 156627137.0, + "step": 4101 + }, + { + "epoch": 0.5218165627782725, + "ewc_loss": 0.006546901538968086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.546901568071917e-05, + "grad_norm": 3.4357850551605225, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8499153852462769, + "num_tokens": 156664726.0, + "step": 4102 + }, + { + "epoch": 0.521943773056863, + "ewc_loss": 0.006624219007790089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.624218804063275e-05, + "grad_norm": 3.360731363296509, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8782708644866943, + "num_tokens": 156707014.0, + "step": 4103 + }, + { + "epoch": 0.5220709833354535, + "ewc_loss": 0.006548394449055195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.548394594574347e-05, + "grad_norm": 3.399916410446167, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8579854965209961, + "num_tokens": 156747784.0, + "step": 4104 + }, + { + "epoch": 0.522198193614044, + "ewc_loss": 0.006603010930120945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.603010842809454e-05, + "grad_norm": 3.4522783756256104, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8590915203094482, + "num_tokens": 156784760.0, + "step": 4105 + }, + { + "epoch": 0.5223254038926345, + "ewc_loss": 0.006601864006370306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.601864151889458e-05, + "grad_norm": 3.4062912464141846, + "learning_rate": 1e-06, + "loss": 0.4216, + 
"mean_token_accuracy": 0.8595588803291321, + "num_tokens": 156822899.0, + "step": 4106 + }, + { + "epoch": 0.522452614171225, + "ewc_loss": 0.006566131021827459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.566131196450442e-05, + "grad_norm": 3.3868331909179688, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8688037395477295, + "num_tokens": 156861308.0, + "step": 4107 + }, + { + "epoch": 0.5225798244498155, + "ewc_loss": 0.006578674074262381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.578674219781533e-05, + "grad_norm": 3.3872594833374023, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8670622110366821, + "num_tokens": 156901981.0, + "step": 4108 + }, + { + "epoch": 0.5227070347284061, + "ewc_loss": 0.006564744282513857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.564744398929179e-05, + "grad_norm": 3.3610193729400635, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8660402894020081, + "num_tokens": 156942215.0, + "step": 4109 + }, + { + "epoch": 0.5228342450069966, + "ewc_loss": 0.006544272881001234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.544272764585912e-05, + "grad_norm": 3.386488914489746, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8689128160476685, + "num_tokens": 156983820.0, + "step": 4110 + }, + { + "epoch": 0.5229614552855871, + "ewc_loss": 0.006571730598807335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.571730773430318e-05, + "grad_norm": 3.4464385509490967, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8747009038925171, + "num_tokens": 157016215.0, + "step": 4111 + }, + { + "epoch": 0.5230886655641775, + "ewc_loss": 0.0065827807411551476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.582780770258978e-05, + "grad_norm": 3.455207109451294, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8610929846763611, + "num_tokens": 157052071.0, + "step": 4112 + }, + { + "epoch": 
0.5232158758427681, + "ewc_loss": 0.006580851040780544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.580851186299697e-05, + "grad_norm": 3.4153642654418945, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8406094908714294, + "num_tokens": 157097711.0, + "step": 4113 + }, + { + "epoch": 0.5233430861213586, + "ewc_loss": 0.006551144178956747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.551144178956747e-05, + "grad_norm": 3.451960563659668, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8596185445785522, + "num_tokens": 157134801.0, + "step": 4114 + }, + { + "epoch": 0.5234702963999491, + "ewc_loss": 0.006594025529921055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.594025762751698e-05, + "grad_norm": 3.4879519939422607, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8623065948486328, + "num_tokens": 157171885.0, + "step": 4115 + }, + { + "epoch": 0.5235975066785397, + "ewc_loss": 0.0065873172134160995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.587317329831421e-05, + "grad_norm": 3.412505626678467, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8605003952980042, + "num_tokens": 157211915.0, + "step": 4116 + }, + { + "epoch": 0.5237247169571302, + "ewc_loss": 0.006550536025315523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.550535908900201e-05, + "grad_norm": 3.354154109954834, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8548088669776917, + "num_tokens": 157260564.0, + "step": 4117 + }, + { + "epoch": 0.5238519272357206, + "ewc_loss": 0.006527407094836235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.527407094836235e-05, + "grad_norm": 3.42279314994812, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8638898134231567, + "num_tokens": 157294676.0, + "step": 4118 + }, + { + "epoch": 0.5239791375143111, + "ewc_loss": 0.006591103971004486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.591103738173842e-05, + "grad_norm": 3.385612726211548, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8713940978050232, + "num_tokens": 157334607.0, + "step": 4119 + }, + { + "epoch": 0.5241063477929017, + "ewc_loss": 0.006543179042637348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.5431791881565e-05, + "grad_norm": 3.427849292755127, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8636128902435303, + "num_tokens": 157372021.0, + "step": 4120 + }, + { + "epoch": 0.5242335580714922, + "ewc_loss": 0.006571921985596418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.57192213111557e-05, + "grad_norm": 3.378878355026245, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8669325709342957, + "num_tokens": 157410092.0, + "step": 4121 + }, + { + "epoch": 0.5243607683500827, + "ewc_loss": 0.0065412940457463264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.541293987538666e-05, + "grad_norm": 3.42427921295166, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8594223260879517, + "num_tokens": 157447979.0, + "step": 4122 + }, + { + "epoch": 0.5244879786286732, + "ewc_loss": 0.006576702930033207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.57670316286385e-05, + "grad_norm": 3.4385714530944824, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8574240207672119, + "num_tokens": 157481376.0, + "step": 4123 + }, + { + "epoch": 0.5246151889072637, + "ewc_loss": 0.006588159129023552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.588159158127382e-05, + "grad_norm": 3.4272701740264893, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8684579730033875, + "num_tokens": 157519555.0, + "step": 4124 + }, + { + "epoch": 0.5247423991858542, + "ewc_loss": 0.006580346263945103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.580346234841272e-05, + "grad_norm": 3.4560928344726562, + "learning_rate": 1e-06, + "loss": 0.4094, + 
"mean_token_accuracy": 0.8604061603546143, + "num_tokens": 157551849.0, + "step": 4125 + }, + { + "epoch": 0.5248696094644447, + "ewc_loss": 0.006611792836338282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.611792923649773e-05, + "grad_norm": 3.40480637550354, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8625248670578003, + "num_tokens": 157590823.0, + "step": 4126 + }, + { + "epoch": 0.5249968197430352, + "ewc_loss": 0.006571612320840359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.571612175321206e-05, + "grad_norm": 3.3577985763549805, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8712352514266968, + "num_tokens": 157633771.0, + "step": 4127 + }, + { + "epoch": 0.5251240300216258, + "ewc_loss": 0.00656890170648694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.568901881109923e-05, + "grad_norm": 3.5027477741241455, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8748549222946167, + "num_tokens": 157667796.0, + "step": 4128 + }, + { + "epoch": 0.5252512403002163, + "ewc_loss": 0.006671607028692961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.671607116004452e-05, + "grad_norm": 3.384645938873291, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8707970380783081, + "num_tokens": 157709452.0, + "step": 4129 + }, + { + "epoch": 0.5253784505788067, + "ewc_loss": 0.006545838434249163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.545838550664485e-05, + "grad_norm": 3.48927903175354, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8535988330841064, + "num_tokens": 157741407.0, + "step": 4130 + }, + { + "epoch": 0.5255056608573972, + "ewc_loss": 0.006663489155471325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.663489330094308e-05, + "grad_norm": 3.4323530197143555, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8728691935539246, + "num_tokens": 157777693.0, + "step": 4131 + }, + { + "epoch": 
0.5256328711359878, + "ewc_loss": 0.0065949056297540665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.594905426027253e-05, + "grad_norm": 3.4210774898529053, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8600034713745117, + "num_tokens": 157814729.0, + "step": 4132 + }, + { + "epoch": 0.5257600814145783, + "ewc_loss": 0.0066110435873270035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.611043500015512e-05, + "grad_norm": 3.40162992477417, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8714020252227783, + "num_tokens": 157853698.0, + "step": 4133 + }, + { + "epoch": 0.5258872916931688, + "ewc_loss": 0.006601175293326378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.601175118703395e-05, + "grad_norm": 3.430591106414795, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8679484128952026, + "num_tokens": 157888580.0, + "step": 4134 + }, + { + "epoch": 0.5260145019717594, + "ewc_loss": 0.006626310758292675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.626310641877353e-05, + "grad_norm": 3.4086077213287354, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8599789142608643, + "num_tokens": 157928434.0, + "step": 4135 + }, + { + "epoch": 0.5261417122503498, + "ewc_loss": 0.006604552734643221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.604552618227899e-05, + "grad_norm": 3.409593105316162, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8554915189743042, + "num_tokens": 157968936.0, + "step": 4136 + }, + { + "epoch": 0.5262689225289403, + "ewc_loss": 0.006622448563575745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.622448563575745e-05, + "grad_norm": 3.453606367111206, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8493427038192749, + "num_tokens": 158005496.0, + "step": 4137 + }, + { + "epoch": 0.5263961328075308, + "ewc_loss": 0.006632033735513687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.63203391013667e-05, + "grad_norm": 3.3865280151367188, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8694045543670654, + "num_tokens": 158047569.0, + "step": 4138 + }, + { + "epoch": 0.5265233430861214, + "ewc_loss": 0.006584432907402515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.584433140233159e-05, + "grad_norm": 3.4128928184509277, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8588254451751709, + "num_tokens": 158085733.0, + "step": 4139 + }, + { + "epoch": 0.5266505533647119, + "ewc_loss": 0.006609657313674688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.60965743009001e-05, + "grad_norm": 3.4246840476989746, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8508948683738708, + "num_tokens": 158123906.0, + "step": 4140 + }, + { + "epoch": 0.5267777636433024, + "ewc_loss": 0.006634801626205444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.634801684413105e-05, + "grad_norm": 3.382719039916992, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8692978620529175, + "num_tokens": 158163663.0, + "step": 4141 + }, + { + "epoch": 0.5269049739218928, + "ewc_loss": 0.006572581361979246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.572581332875416e-05, + "grad_norm": 3.4422197341918945, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8506268858909607, + "num_tokens": 158205878.0, + "step": 4142 + }, + { + "epoch": 0.5270321842004834, + "ewc_loss": 0.006637202575802803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.637202750425786e-05, + "grad_norm": 3.404156446456909, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8650824427604675, + "num_tokens": 158241608.0, + "step": 4143 + }, + { + "epoch": 0.5271593944790739, + "ewc_loss": 0.006582219619303942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.582219793926924e-05, + "grad_norm": 3.4349348545074463, + "learning_rate": 1e-06, + "loss": 0.3709, + 
"mean_token_accuracy": 0.8741615414619446, + "num_tokens": 158274259.0, + "step": 4144 + }, + { + "epoch": 0.5272866047576644, + "ewc_loss": 0.006621852051466703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.621851935051382e-05, + "grad_norm": 3.512277603149414, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8676888346672058, + "num_tokens": 158304793.0, + "step": 4145 + }, + { + "epoch": 0.527413815036255, + "ewc_loss": 0.006651387084275484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.651387229794636e-05, + "grad_norm": 3.321392059326172, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8736264109611511, + "num_tokens": 158349420.0, + "step": 4146 + }, + { + "epoch": 0.5275410253148455, + "ewc_loss": 0.0065262168645858765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.526216748170555e-05, + "grad_norm": 3.367656707763672, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8610356450080872, + "num_tokens": 158395834.0, + "step": 4147 + }, + { + "epoch": 0.5276682355934359, + "ewc_loss": 0.006616858299821615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.616858445340768e-05, + "grad_norm": 3.434936761856079, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8634853363037109, + "num_tokens": 158435568.0, + "step": 4148 + }, + { + "epoch": 0.5277954458720264, + "ewc_loss": 0.006624511443078518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.624511297559366e-05, + "grad_norm": 3.4127917289733887, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8709299564361572, + "num_tokens": 158473584.0, + "step": 4149 + }, + { + "epoch": 0.527922656150617, + "ewc_loss": 0.0065781050361692905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.578105239896104e-05, + "grad_norm": 3.3860793113708496, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8637563586235046, + "num_tokens": 158513435.0, + "step": 4150 + }, + { + "epoch": 
0.5280498664292075, + "ewc_loss": 0.006599073298275471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.599073094548658e-05, + "grad_norm": 3.418290376663208, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8628429174423218, + "num_tokens": 158549478.0, + "step": 4151 + }, + { + "epoch": 0.528177076707798, + "ewc_loss": 0.006615699268877506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.615699385292828e-05, + "grad_norm": 3.3382322788238525, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8667123317718506, + "num_tokens": 158598742.0, + "step": 4152 + }, + { + "epoch": 0.5283042869863885, + "ewc_loss": 0.006550642196089029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.550642137881368e-05, + "grad_norm": 3.450974941253662, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8501709699630737, + "num_tokens": 158636806.0, + "step": 4153 + }, + { + "epoch": 0.528431497264979, + "ewc_loss": 0.006648806389421225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.648806447628886e-05, + "grad_norm": 3.4239137172698975, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8562752604484558, + "num_tokens": 158676043.0, + "step": 4154 + }, + { + "epoch": 0.5285587075435695, + "ewc_loss": 0.006584466435015202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.584466609638184e-05, + "grad_norm": 3.4515199661254883, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8456175327301025, + "num_tokens": 158711772.0, + "step": 4155 + }, + { + "epoch": 0.52868591782216, + "ewc_loss": 0.006613189820200205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.613189907511696e-05, + "grad_norm": 3.444847822189331, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8613003492355347, + "num_tokens": 158749608.0, + "step": 4156 + }, + { + "epoch": 0.5288131281007505, + "ewc_loss": 0.006597190164029598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.597190076718107e-05, 
+ "grad_norm": 3.398097515106201, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8623114228248596, + "num_tokens": 158790502.0, + "step": 4157 + }, + { + "epoch": 0.5289403383793411, + "ewc_loss": 0.006570351775735617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.57035197946243e-05, + "grad_norm": 3.4465367794036865, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8392772078514099, + "num_tokens": 158828415.0, + "step": 4158 + }, + { + "epoch": 0.5290675486579316, + "ewc_loss": 0.006608845666050911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.608845433220267e-05, + "grad_norm": 3.422025203704834, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8526428937911987, + "num_tokens": 158866972.0, + "step": 4159 + }, + { + "epoch": 0.5291947589365221, + "ewc_loss": 0.006581725552678108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.581725756404921e-05, + "grad_norm": 3.4497909545898438, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8600401878356934, + "num_tokens": 158902991.0, + "step": 4160 + }, + { + "epoch": 0.5293219692151125, + "ewc_loss": 0.006615307182073593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.615307211177424e-05, + "grad_norm": 3.3626391887664795, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8692368268966675, + "num_tokens": 158945963.0, + "step": 4161 + }, + { + "epoch": 0.5294491794937031, + "ewc_loss": 0.006565336138010025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.565335934283212e-05, + "grad_norm": 3.428205728530884, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8620195984840393, + "num_tokens": 158985407.0, + "step": 4162 + }, + { + "epoch": 0.5295763897722936, + "ewc_loss": 0.006629526615142822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.629526615142822e-05, + "grad_norm": 3.384213924407959, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 
0.8539139628410339, + "num_tokens": 159024387.0, + "step": 4163 + }, + { + "epoch": 0.5297036000508841, + "ewc_loss": 0.006580175366252661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.580175249837339e-05, + "grad_norm": 3.428736686706543, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8538367748260498, + "num_tokens": 159061430.0, + "step": 4164 + }, + { + "epoch": 0.5298308103294747, + "ewc_loss": 0.006617036648094654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.617036706302315e-05, + "grad_norm": 3.4329397678375244, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8630954027175903, + "num_tokens": 159099350.0, + "step": 4165 + }, + { + "epoch": 0.5299580206080652, + "ewc_loss": 0.006599653512239456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.59965371596627e-05, + "grad_norm": 3.4666881561279297, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8364052176475525, + "num_tokens": 159136276.0, + "step": 4166 + }, + { + "epoch": 0.5300852308866556, + "ewc_loss": 0.006635581608861685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.635581667069346e-05, + "grad_norm": 3.3309431076049805, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8828833699226379, + "num_tokens": 159177102.0, + "step": 4167 + }, + { + "epoch": 0.5302124411652461, + "ewc_loss": 0.006562968250364065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.562968337675557e-05, + "grad_norm": 3.450273036956787, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8723031878471375, + "num_tokens": 159213582.0, + "step": 4168 + }, + { + "epoch": 0.5303396514438367, + "ewc_loss": 0.006674179341644049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.674179167021066e-05, + "grad_norm": 3.4303810596466064, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8603346347808838, + "num_tokens": 159254283.0, + "step": 4169 + }, + { + "epoch": 0.5304668617224272, + "ewc_loss": 
0.006613558158278465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.613558070966974e-05, + "grad_norm": 3.4483070373535156, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.863121747970581, + "num_tokens": 159288557.0, + "step": 4170 + }, + { + "epoch": 0.5305940720010177, + "ewc_loss": 0.006633923389017582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.633923476329073e-05, + "grad_norm": 3.4121618270874023, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8710253834724426, + "num_tokens": 159325047.0, + "step": 4171 + }, + { + "epoch": 0.5307212822796082, + "ewc_loss": 0.006611007731407881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.611007847823203e-05, + "grad_norm": 3.4116950035095215, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8650327920913696, + "num_tokens": 159366291.0, + "step": 4172 + }, + { + "epoch": 0.5308484925581987, + "ewc_loss": 0.006612797733396292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.612797733396292e-05, + "grad_norm": 3.5047121047973633, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8546056747436523, + "num_tokens": 159401634.0, + "step": 4173 + }, + { + "epoch": 0.5309757028367892, + "ewc_loss": 0.006684995722025633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.684995605610311e-05, + "grad_norm": 3.479783296585083, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8533697128295898, + "num_tokens": 159435316.0, + "step": 4174 + }, + { + "epoch": 0.5311029131153797, + "ewc_loss": 0.006639168597757816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.639168714173138e-05, + "grad_norm": 3.4375510215759277, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8537880778312683, + "num_tokens": 159472192.0, + "step": 4175 + }, + { + "epoch": 0.5312301233939702, + "ewc_loss": 0.006621001288294792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.621001375606284e-05, + "grad_norm": 
3.363002061843872, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8684579133987427, + "num_tokens": 159515342.0, + "step": 4176 + }, + { + "epoch": 0.5313573336725608, + "ewc_loss": 0.006599947810173035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.599947664653882e-05, + "grad_norm": 3.3898205757141113, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8667720556259155, + "num_tokens": 159553932.0, + "step": 4177 + }, + { + "epoch": 0.5314845439511513, + "ewc_loss": 0.006648526526987553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.64852632326074e-05, + "grad_norm": 3.512803077697754, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8619447350502014, + "num_tokens": 159584711.0, + "step": 4178 + }, + { + "epoch": 0.5316117542297417, + "ewc_loss": 0.006698004435747862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.69800429022871e-05, + "grad_norm": 3.4879679679870605, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8483420610427856, + "num_tokens": 159618784.0, + "step": 4179 + }, + { + "epoch": 0.5317389645083322, + "ewc_loss": 0.006651200819760561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.651200965279713e-05, + "grad_norm": 3.449153423309326, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8596148490905762, + "num_tokens": 159661288.0, + "step": 4180 + }, + { + "epoch": 0.5318661747869228, + "ewc_loss": 0.006644777953624725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.644777749897912e-05, + "grad_norm": 3.404289484024048, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8737490177154541, + "num_tokens": 159701286.0, + "step": 4181 + }, + { + "epoch": 0.5319933850655133, + "ewc_loss": 0.006634051445871592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.634051533183083e-05, + "grad_norm": 3.385627031326294, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8502682447433472, + 
"num_tokens": 159746631.0, + "step": 4182 + }, + { + "epoch": 0.5321205953441038, + "ewc_loss": 0.006641465704888105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.641465733991936e-05, + "grad_norm": 3.3837010860443115, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8656045198440552, + "num_tokens": 159787753.0, + "step": 4183 + }, + { + "epoch": 0.5322478056226944, + "ewc_loss": 0.006633048877120018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.633048906223848e-05, + "grad_norm": 3.464075803756714, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8532960414886475, + "num_tokens": 159823092.0, + "step": 4184 + }, + { + "epoch": 0.5323750159012848, + "ewc_loss": 0.0066920858807861805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.692086026305333e-05, + "grad_norm": 3.4733879566192627, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8533982038497925, + "num_tokens": 159858480.0, + "step": 4185 + }, + { + "epoch": 0.5325022261798753, + "ewc_loss": 0.006662578321993351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.662578380201012e-05, + "grad_norm": 3.4489331245422363, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.851638674736023, + "num_tokens": 159896220.0, + "step": 4186 + }, + { + "epoch": 0.5326294364584658, + "ewc_loss": 0.006649923976510763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.649924034718424e-05, + "grad_norm": 3.47784161567688, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8610042929649353, + "num_tokens": 159930995.0, + "step": 4187 + }, + { + "epoch": 0.5327566467370564, + "ewc_loss": 0.006682198494672775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.682198727503419e-05, + "grad_norm": 3.4067647457122803, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8563244342803955, + "num_tokens": 159968013.0, + "step": 4188 + }, + { + "epoch": 0.5328838570156469, + "ewc_loss": 0.006632278207689524, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.632278382312506e-05, + "grad_norm": 3.8408408164978027, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8549598455429077, + "num_tokens": 160004085.0, + "step": 4189 + }, + { + "epoch": 0.5330110672942374, + "ewc_loss": 0.006955927703529596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.955927528906614e-05, + "grad_norm": 3.4200167655944824, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8729279041290283, + "num_tokens": 160040606.0, + "step": 4190 + }, + { + "epoch": 0.5331382775728278, + "ewc_loss": 0.006599459331482649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.59945944789797e-05, + "grad_norm": 3.3779520988464355, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8729677200317383, + "num_tokens": 160080792.0, + "step": 4191 + }, + { + "epoch": 0.5332654878514184, + "ewc_loss": 0.00672426912933588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.724269042024389e-05, + "grad_norm": 3.4389829635620117, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8499734401702881, + "num_tokens": 160117055.0, + "step": 4192 + }, + { + "epoch": 0.5333926981300089, + "ewc_loss": 0.006778400857001543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.778400711482391e-05, + "grad_norm": 3.4154610633850098, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8676559925079346, + "num_tokens": 160153223.0, + "step": 4193 + }, + { + "epoch": 0.5335199084085994, + "ewc_loss": 0.006736489944159985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.736489740433171e-05, + "grad_norm": 3.4750659465789795, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8447303771972656, + "num_tokens": 160190920.0, + "step": 4194 + }, + { + "epoch": 0.53364711868719, + "ewc_loss": 0.006789459381252527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.789459439460188e-05, + "grad_norm": 3.4845705032348633, + 
"learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8592312335968018, + "num_tokens": 160221898.0, + "step": 4195 + }, + { + "epoch": 0.5337743289657805, + "ewc_loss": 0.006791866384446621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.79186632623896e-05, + "grad_norm": 3.392062187194824, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.86326003074646, + "num_tokens": 160267102.0, + "step": 4196 + }, + { + "epoch": 0.5339015392443709, + "ewc_loss": 0.006724464241415262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.724464037688449e-05, + "grad_norm": 3.451097249984741, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8669084310531616, + "num_tokens": 160304422.0, + "step": 4197 + }, + { + "epoch": 0.5340287495229614, + "ewc_loss": 0.006801609415560961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.801609561080113e-05, + "grad_norm": 3.470243215560913, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8586628437042236, + "num_tokens": 160339006.0, + "step": 4198 + }, + { + "epoch": 0.534155959801552, + "ewc_loss": 0.0067700231447815895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.77002317388542e-05, + "grad_norm": 3.352530002593994, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8816112279891968, + "num_tokens": 160377504.0, + "step": 4199 + }, + { + "epoch": 0.5342831700801425, + "ewc_loss": 0.006697577890008688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.697577919112518e-05, + "grad_norm": 3.4877753257751465, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8417367935180664, + "num_tokens": 160413315.0, + "step": 4200 + }, + { + "epoch": 0.534410380358733, + "ewc_loss": 0.00683657405897975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.836574175395072e-05, + "grad_norm": 3.375653028488159, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.87569260597229, + "num_tokens": 160455034.0, + "step": 
4201 + }, + { + "epoch": 0.5345375906373235, + "ewc_loss": 0.0066984547302126884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.698454672005028e-05, + "grad_norm": 3.4620370864868164, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8575090169906616, + "num_tokens": 160490505.0, + "step": 4202 + }, + { + "epoch": 0.534664800915914, + "ewc_loss": 0.006785726174712181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.78572614560835e-05, + "grad_norm": 3.40841007232666, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8709695935249329, + "num_tokens": 160528712.0, + "step": 4203 + }, + { + "epoch": 0.5347920111945045, + "ewc_loss": 0.006715358234941959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.715358176734298e-05, + "grad_norm": 3.4345993995666504, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.854369580745697, + "num_tokens": 160567945.0, + "step": 4204 + }, + { + "epoch": 0.534919221473095, + "ewc_loss": 0.0067378198727965355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.737819785485044e-05, + "grad_norm": 3.532998561859131, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8683605194091797, + "num_tokens": 160596767.0, + "step": 4205 + }, + { + "epoch": 0.5350464317516855, + "ewc_loss": 0.0067865801975131035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.786580343032256e-05, + "grad_norm": 3.387000799179077, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8581477403640747, + "num_tokens": 160636912.0, + "step": 4206 + }, + { + "epoch": 0.5351736420302761, + "ewc_loss": 0.0066621568985283375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.66215710225515e-05, + "grad_norm": 3.469287633895874, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8559526205062866, + "num_tokens": 160674588.0, + "step": 4207 + }, + { + "epoch": 0.5353008523088666, + "ewc_loss": 0.006757587660104036, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 6.757587834727019e-05, + "grad_norm": 3.4337732791900635, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8505642414093018, + "num_tokens": 160713441.0, + "step": 4208 + }, + { + "epoch": 0.5354280625874571, + "ewc_loss": 0.006692579947412014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.692580063827336e-05, + "grad_norm": 3.399510622024536, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8708229064941406, + "num_tokens": 160752716.0, + "step": 4209 + }, + { + "epoch": 0.5355552728660475, + "ewc_loss": 0.006685066036880016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.685066182399169e-05, + "grad_norm": 3.3623104095458984, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8703500628471375, + "num_tokens": 160798771.0, + "step": 4210 + }, + { + "epoch": 0.5356824831446381, + "ewc_loss": 0.0066672698594629765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.667269917670637e-05, + "grad_norm": 3.371753215789795, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8780984878540039, + "num_tokens": 160843085.0, + "step": 4211 + }, + { + "epoch": 0.5358096934232286, + "ewc_loss": 0.006671099457889795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.671099254162982e-05, + "grad_norm": 3.452927827835083, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8552495241165161, + "num_tokens": 160883073.0, + "step": 4212 + }, + { + "epoch": 0.5359369037018191, + "ewc_loss": 0.006708006374537945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.708006549160928e-05, + "grad_norm": 3.4106109142303467, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8688219785690308, + "num_tokens": 160923734.0, + "step": 4213 + }, + { + "epoch": 0.5360641139804097, + "ewc_loss": 0.0066473158076405525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.64731560391374e-05, + "grad_norm": 3.460205316543579, + "learning_rate": 1e-06, + "loss": 
0.4299, + "mean_token_accuracy": 0.8584885597229004, + "num_tokens": 160963460.0, + "step": 4214 + }, + { + "epoch": 0.5361913242590002, + "ewc_loss": 0.006695059593766928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.69505971018225e-05, + "grad_norm": 3.481337070465088, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8591257333755493, + "num_tokens": 161003691.0, + "step": 4215 + }, + { + "epoch": 0.5363185345375906, + "ewc_loss": 0.006673968397080898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.673968164250255e-05, + "grad_norm": 3.411442518234253, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8627430200576782, + "num_tokens": 161042056.0, + "step": 4216 + }, + { + "epoch": 0.5364457448161811, + "ewc_loss": 0.006637691054493189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.637690967181697e-05, + "grad_norm": 3.4120240211486816, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8847945928573608, + "num_tokens": 161078670.0, + "step": 4217 + }, + { + "epoch": 0.5365729550947717, + "ewc_loss": 0.0066521139815449715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.652114097960293e-05, + "grad_norm": 3.4690682888031006, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8750328421592712, + "num_tokens": 161113992.0, + "step": 4218 + }, + { + "epoch": 0.5367001653733622, + "ewc_loss": 0.006688303779810667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.68830398353748e-05, + "grad_norm": 3.4773592948913574, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8674455881118774, + "num_tokens": 161153763.0, + "step": 4219 + }, + { + "epoch": 0.5368273756519527, + "ewc_loss": 0.006659721955657005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.659721839241683e-05, + "grad_norm": 3.510732889175415, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8469377756118774, + "num_tokens": 161184186.0, + "step": 4220 + }, + { + "epoch": 
0.5369545859305432, + "ewc_loss": 0.006676014047116041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.676014163531363e-05, + "grad_norm": 3.4336016178131104, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.859724223613739, + "num_tokens": 161219298.0, + "step": 4221 + }, + { + "epoch": 0.5370817962091337, + "ewc_loss": 0.006633400451391935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.633400334976614e-05, + "grad_norm": 3.3770055770874023, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8770876526832581, + "num_tokens": 161255252.0, + "step": 4222 + }, + { + "epoch": 0.5372090064877242, + "ewc_loss": 0.0066199214197695255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.619921623496339e-05, + "grad_norm": 3.4396204948425293, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8659327030181885, + "num_tokens": 161295890.0, + "step": 4223 + }, + { + "epoch": 0.5373362167663147, + "ewc_loss": 0.0066782208159565926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.678220961475745e-05, + "grad_norm": 3.433192729949951, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8596268892288208, + "num_tokens": 161330808.0, + "step": 4224 + }, + { + "epoch": 0.5374634270449052, + "ewc_loss": 0.006665805354714394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.665805267402902e-05, + "grad_norm": 3.444619655609131, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8690973520278931, + "num_tokens": 161370515.0, + "step": 4225 + }, + { + "epoch": 0.5375906373234958, + "ewc_loss": 0.006677006371319294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.677006604149938e-05, + "grad_norm": 3.4292328357696533, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8742915391921997, + "num_tokens": 161405257.0, + "step": 4226 + }, + { + "epoch": 0.5377178476020863, + "ewc_loss": 0.006656799931079149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.656799814663827e-05, + "grad_norm": 3.410724639892578, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8628948926925659, + "num_tokens": 161443526.0, + "step": 4227 + }, + { + "epoch": 0.5378450578806767, + "ewc_loss": 0.006664111278951168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.66411142447032e-05, + "grad_norm": 3.4777135848999023, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8559147119522095, + "num_tokens": 161481262.0, + "step": 4228 + }, + { + "epoch": 0.5379722681592672, + "ewc_loss": 0.006715661380439997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.71566158416681e-05, + "grad_norm": 3.4798130989074707, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8486019372940063, + "num_tokens": 161517756.0, + "step": 4229 + }, + { + "epoch": 0.5380994784378578, + "ewc_loss": 0.006679132115095854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.67913191136904e-05, + "grad_norm": 3.415548324584961, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8718355894088745, + "num_tokens": 161552680.0, + "step": 4230 + }, + { + "epoch": 0.5382266887164483, + "ewc_loss": 0.006651510018855333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.651510193478316e-05, + "grad_norm": 3.457301378250122, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8594775795936584, + "num_tokens": 161588607.0, + "step": 4231 + }, + { + "epoch": 0.5383538989950388, + "ewc_loss": 0.0067119537852704525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.711953756166622e-05, + "grad_norm": 3.4583890438079834, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8461447954177856, + "num_tokens": 161626427.0, + "step": 4232 + }, + { + "epoch": 0.5384811092736294, + "ewc_loss": 0.006696791388094425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.696791388094425e-05, + "grad_norm": 3.392143964767456, + "learning_rate": 1e-06, + "loss": 0.4019, + 
"mean_token_accuracy": 0.8681768178939819, + "num_tokens": 161665182.0, + "step": 4233 + }, + { + "epoch": 0.5386083195522198, + "ewc_loss": 0.006655842997133732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.655843026237562e-05, + "grad_norm": 3.408175230026245, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8622488379478455, + "num_tokens": 161704685.0, + "step": 4234 + }, + { + "epoch": 0.5387355298308103, + "ewc_loss": 0.006698654964566231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.698654760839418e-05, + "grad_norm": 3.398479700088501, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8678553104400635, + "num_tokens": 161742960.0, + "step": 4235 + }, + { + "epoch": 0.5388627401094008, + "ewc_loss": 0.006672875955700874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.672876043012366e-05, + "grad_norm": 3.4469990730285645, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8626571893692017, + "num_tokens": 161778199.0, + "step": 4236 + }, + { + "epoch": 0.5389899503879914, + "ewc_loss": 0.006714736111462116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.714736082358286e-05, + "grad_norm": 3.4455504417419434, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8589322566986084, + "num_tokens": 161811235.0, + "step": 4237 + }, + { + "epoch": 0.5391171606665819, + "ewc_loss": 0.006696169264614582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.696169293718413e-05, + "grad_norm": 3.4232470989227295, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8636065721511841, + "num_tokens": 161847669.0, + "step": 4238 + }, + { + "epoch": 0.5392443709451724, + "ewc_loss": 0.006675360724329948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.67536078253761e-05, + "grad_norm": 3.415656089782715, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.849236786365509, + "num_tokens": 161888470.0, + "step": 4239 + }, + { + "epoch": 
0.5393715812237628, + "ewc_loss": 0.006684331223368645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.684331310680136e-05, + "grad_norm": 3.504422426223755, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8437528014183044, + "num_tokens": 161924563.0, + "step": 4240 + }, + { + "epoch": 0.5394987915023534, + "ewc_loss": 0.006739435717463493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.739435775671154e-05, + "grad_norm": 3.4724740982055664, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8485720157623291, + "num_tokens": 161959062.0, + "step": 4241 + }, + { + "epoch": 0.5396260017809439, + "ewc_loss": 0.006684174295514822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.68417414999567e-05, + "grad_norm": 3.4125852584838867, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8664219379425049, + "num_tokens": 161994755.0, + "step": 4242 + }, + { + "epoch": 0.5397532120595344, + "ewc_loss": 0.006666274741292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.666274566669017e-05, + "grad_norm": 3.4308700561523438, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8726344108581543, + "num_tokens": 162032812.0, + "step": 4243 + }, + { + "epoch": 0.5398804223381249, + "ewc_loss": 0.006699472665786743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.699472578475252e-05, + "grad_norm": 3.4643571376800537, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8657432794570923, + "num_tokens": 162067130.0, + "step": 4244 + }, + { + "epoch": 0.5400076326167155, + "ewc_loss": 0.006725752726197243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.725752609781921e-05, + "grad_norm": 3.3922009468078613, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8644111752510071, + "num_tokens": 162107452.0, + "step": 4245 + }, + { + "epoch": 0.5401348428953059, + "ewc_loss": 0.006673223804682493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.673223833786324e-05, + "grad_norm": 3.4750399589538574, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8619056940078735, + "num_tokens": 162144220.0, + "step": 4246 + }, + { + "epoch": 0.5402620531738964, + "ewc_loss": 0.0067419386468827724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.741938705090433e-05, + "grad_norm": 3.4152162075042725, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8719475269317627, + "num_tokens": 162181798.0, + "step": 4247 + }, + { + "epoch": 0.540389263452487, + "ewc_loss": 0.0066717397421598434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.67173953843303e-05, + "grad_norm": 3.3822009563446045, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8548711538314819, + "num_tokens": 162222501.0, + "step": 4248 + }, + { + "epoch": 0.5405164737310775, + "ewc_loss": 0.006685998756438494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.685998960165307e-05, + "grad_norm": 3.3871266841888428, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8585568070411682, + "num_tokens": 162265533.0, + "step": 4249 + }, + { + "epoch": 0.540643684009668, + "ewc_loss": 0.006701444275677204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.701444362988696e-05, + "grad_norm": 3.4351911544799805, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8774496912956238, + "num_tokens": 162304984.0, + "step": 4250 + }, + { + "epoch": 0.5407708942882585, + "ewc_loss": 0.00670147268101573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.701472739223391e-05, + "grad_norm": 3.4581263065338135, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8534274697303772, + "num_tokens": 162342358.0, + "step": 4251 + }, + { + "epoch": 0.540898104566849, + "ewc_loss": 0.006701923906803131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.70192384859547e-05, + "grad_norm": 3.462043523788452, + "learning_rate": 1e-06, + "loss": 0.4022, + 
"mean_token_accuracy": 0.864051342010498, + "num_tokens": 162377343.0, + "step": 4252 + }, + { + "epoch": 0.5410253148454395, + "ewc_loss": 0.0066917091608047485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.691709131700918e-05, + "grad_norm": 3.3794312477111816, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8635895252227783, + "num_tokens": 162416416.0, + "step": 4253 + }, + { + "epoch": 0.54115252512403, + "ewc_loss": 0.0066402871161699295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.640287028858438e-05, + "grad_norm": 3.4619691371917725, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8882074356079102, + "num_tokens": 162447108.0, + "step": 4254 + }, + { + "epoch": 0.5412797354026205, + "ewc_loss": 0.006733190733939409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.7331908212509e-05, + "grad_norm": 3.4468774795532227, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8649072647094727, + "num_tokens": 162489659.0, + "step": 4255 + }, + { + "epoch": 0.5414069456812111, + "ewc_loss": 0.006681111175566912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.681110971840099e-05, + "grad_norm": 3.4476304054260254, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.852085530757904, + "num_tokens": 162525816.0, + "step": 4256 + }, + { + "epoch": 0.5415341559598016, + "ewc_loss": 0.006677850615233183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.677850615233183e-05, + "grad_norm": 3.4743547439575195, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8593361377716064, + "num_tokens": 162564154.0, + "step": 4257 + }, + { + "epoch": 0.5416613662383921, + "ewc_loss": 0.0067047616466879845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.704761472065002e-05, + "grad_norm": 3.43935227394104, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8567500114440918, + "num_tokens": 162602728.0, + "step": 4258 + }, + { + "epoch": 
0.5417885765169825, + "ewc_loss": 0.0066790045239031315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.679004582110792e-05, + "grad_norm": 3.4250967502593994, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8668906688690186, + "num_tokens": 162636659.0, + "step": 4259 + }, + { + "epoch": 0.5419157867955731, + "ewc_loss": 0.00668518990278244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.68518987367861e-05, + "grad_norm": 3.4123454093933105, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8787130117416382, + "num_tokens": 162676639.0, + "step": 4260 + }, + { + "epoch": 0.5420429970741636, + "ewc_loss": 0.006678015924990177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.678015779471025e-05, + "grad_norm": 3.4376683235168457, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8444540500640869, + "num_tokens": 162716763.0, + "step": 4261 + }, + { + "epoch": 0.5421702073527541, + "ewc_loss": 0.006704355590045452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.70435547363013e-05, + "grad_norm": 3.45294451713562, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8511596322059631, + "num_tokens": 162753943.0, + "step": 4262 + }, + { + "epoch": 0.5422974176313446, + "ewc_loss": 0.006700118537992239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.700118683511391e-05, + "grad_norm": 3.4507102966308594, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8578005433082581, + "num_tokens": 162792938.0, + "step": 4263 + }, + { + "epoch": 0.5424246279099352, + "ewc_loss": 0.006700837053358555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.700836820527911e-05, + "grad_norm": 3.3863108158111572, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8593251705169678, + "num_tokens": 162838938.0, + "step": 4264 + }, + { + "epoch": 0.5425518381885256, + "ewc_loss": 0.006664510350674391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.664510146947578e-05, + "grad_norm": 3.419619560241699, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8679352402687073, + "num_tokens": 162875026.0, + "step": 4265 + }, + { + "epoch": 0.5426790484671161, + "ewc_loss": 0.006714170798659325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.714170740451664e-05, + "grad_norm": 3.461042642593384, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8656295537948608, + "num_tokens": 162910724.0, + "step": 4266 + }, + { + "epoch": 0.5428062587457066, + "ewc_loss": 0.006710795219987631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.710795423714444e-05, + "grad_norm": 3.5443482398986816, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8655906915664673, + "num_tokens": 162941602.0, + "step": 4267 + }, + { + "epoch": 0.5429334690242972, + "ewc_loss": 0.0067451111972332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.745111022610217e-05, + "grad_norm": 3.4583065509796143, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8488815426826477, + "num_tokens": 162980760.0, + "step": 4268 + }, + { + "epoch": 0.5430606793028877, + "ewc_loss": 0.006679108832031488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.679108628304675e-05, + "grad_norm": 3.4250781536102295, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8721882104873657, + "num_tokens": 163016155.0, + "step": 4269 + }, + { + "epoch": 0.5431878895814782, + "ewc_loss": 0.006690104957669973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.69010478304699e-05, + "grad_norm": 3.504880905151367, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8536891341209412, + "num_tokens": 163052519.0, + "step": 4270 + }, + { + "epoch": 0.5433150998600687, + "ewc_loss": 0.0067451815120875835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.745181599399075e-05, + "grad_norm": 3.422017812728882, + "learning_rate": 1e-06, + "loss": 0.3774, + 
"mean_token_accuracy": 0.8722770810127258, + "num_tokens": 163091806.0, + "step": 4271 + }, + { + "epoch": 0.5434423101386592, + "ewc_loss": 0.006657590623944998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.657590711256489e-05, + "grad_norm": 3.393526077270508, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8780980110168457, + "num_tokens": 163128830.0, + "step": 4272 + }, + { + "epoch": 0.5435695204172497, + "ewc_loss": 0.006670066155493259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.670066068181768e-05, + "grad_norm": 3.414045810699463, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8575028777122498, + "num_tokens": 163169567.0, + "step": 4273 + }, + { + "epoch": 0.5436967306958402, + "ewc_loss": 0.006684209685772657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.684209802187979e-05, + "grad_norm": 3.4916064739227295, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8566991090774536, + "num_tokens": 163202913.0, + "step": 4274 + }, + { + "epoch": 0.5438239409744308, + "ewc_loss": 0.006724528968334198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.724528793711215e-05, + "grad_norm": 3.503767967224121, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8706158399581909, + "num_tokens": 163236741.0, + "step": 4275 + }, + { + "epoch": 0.5439511512530213, + "ewc_loss": 0.006701341830193996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.701341771986336e-05, + "grad_norm": 3.424088478088379, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8559849262237549, + "num_tokens": 163281402.0, + "step": 4276 + }, + { + "epoch": 0.5440783615316117, + "ewc_loss": 0.006654488854110241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.654488970525563e-05, + "grad_norm": 3.3639256954193115, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8704859018325806, + "num_tokens": 163325288.0, + "step": 4277 + }, + { + "epoch": 
0.5442055718102022, + "ewc_loss": 0.006652238313108683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.652238516835496e-05, + "grad_norm": 3.430492401123047, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.852505624294281, + "num_tokens": 163367471.0, + "step": 4278 + }, + { + "epoch": 0.5443327820887928, + "ewc_loss": 0.0066928621381521225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.692862370982766e-05, + "grad_norm": 3.487130880355835, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.864996612071991, + "num_tokens": 163400077.0, + "step": 4279 + }, + { + "epoch": 0.5444599923673833, + "ewc_loss": 0.006698782090097666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.698782090097666e-05, + "grad_norm": 3.429682731628418, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8659600019454956, + "num_tokens": 163433739.0, + "step": 4280 + }, + { + "epoch": 0.5445872026459738, + "ewc_loss": 0.006651729810982943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.651729927398264e-05, + "grad_norm": 3.464063882827759, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8632457256317139, + "num_tokens": 163467447.0, + "step": 4281 + }, + { + "epoch": 0.5447144129245644, + "ewc_loss": 0.006689491216093302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.689491419820115e-05, + "grad_norm": 3.4187324047088623, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8601395487785339, + "num_tokens": 163506633.0, + "step": 4282 + }, + { + "epoch": 0.5448416232031548, + "ewc_loss": 0.006643642671406269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.6436427005101e-05, + "grad_norm": 3.5648274421691895, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8482652902603149, + "num_tokens": 163542773.0, + "step": 4283 + }, + { + "epoch": 0.5449688334817453, + "ewc_loss": 0.006751313805580139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.751313776476309e-05, + "grad_norm": 3.423530101776123, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8302221298217773, + "num_tokens": 163585247.0, + "step": 4284 + }, + { + "epoch": 0.5450960437603358, + "ewc_loss": 0.006615167483687401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.615167512791231e-05, + "grad_norm": 3.458171844482422, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8496479988098145, + "num_tokens": 163622802.0, + "step": 4285 + }, + { + "epoch": 0.5452232540389264, + "ewc_loss": 0.006689286325126886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.689286237815395e-05, + "grad_norm": 3.4426116943359375, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8866391181945801, + "num_tokens": 163655980.0, + "step": 4286 + }, + { + "epoch": 0.5453504643175169, + "ewc_loss": 0.006661746650934219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.661746738245711e-05, + "grad_norm": 3.3992717266082764, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8676323890686035, + "num_tokens": 163691494.0, + "step": 4287 + }, + { + "epoch": 0.5454776745961074, + "ewc_loss": 0.006643656641244888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.643656524829566e-05, + "grad_norm": 3.4643568992614746, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8741135597229004, + "num_tokens": 163725169.0, + "step": 4288 + }, + { + "epoch": 0.5456048848746978, + "ewc_loss": 0.006709975190460682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.709975423291326e-05, + "grad_norm": 3.425142765045166, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.865360677242279, + "num_tokens": 163766050.0, + "step": 4289 + }, + { + "epoch": 0.5457320951532884, + "ewc_loss": 0.00665998226031661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.659982318524271e-05, + "grad_norm": 3.496267080307007, + "learning_rate": 1e-06, + "loss": 0.4361, + 
"mean_token_accuracy": 0.8570738434791565, + "num_tokens": 163799559.0, + "step": 4290 + }, + { + "epoch": 0.5458593054318789, + "ewc_loss": 0.0067191338166594505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.719133671140298e-05, + "grad_norm": 3.4516892433166504, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.856827437877655, + "num_tokens": 163837052.0, + "step": 4291 + }, + { + "epoch": 0.5459865157104694, + "ewc_loss": 0.006685608997941017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.685608968837187e-05, + "grad_norm": 3.479579210281372, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8611394762992859, + "num_tokens": 163872494.0, + "step": 4292 + }, + { + "epoch": 0.5461137259890599, + "ewc_loss": 0.00672631012275815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.726309948135167e-05, + "grad_norm": 3.411097526550293, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8540735840797424, + "num_tokens": 163913499.0, + "step": 4293 + }, + { + "epoch": 0.5462409362676505, + "ewc_loss": 0.0066630966030061245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.663096428383142e-05, + "grad_norm": 3.399367332458496, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8441457748413086, + "num_tokens": 163957965.0, + "step": 4294 + }, + { + "epoch": 0.5463681465462409, + "ewc_loss": 0.006694398820400238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.694399053230882e-05, + "grad_norm": 3.413511037826538, + "learning_rate": 1e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.8219519853591919, + "num_tokens": 164005661.0, + "step": 4295 + }, + { + "epoch": 0.5464953568248314, + "ewc_loss": 0.006685836706310511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.68583670631051e-05, + "grad_norm": 3.439666509628296, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8781996965408325, + "num_tokens": 164044717.0, + "step": 4296 + }, + { + "epoch": 
0.5466225671034219, + "ewc_loss": 0.006695272866636515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.695272895740345e-05, + "grad_norm": 3.5166852474212646, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8622251749038696, + "num_tokens": 164080022.0, + "step": 4297 + }, + { + "epoch": 0.5467497773820125, + "ewc_loss": 0.00673473859205842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.734738417435437e-05, + "grad_norm": 3.5463099479675293, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8689070343971252, + "num_tokens": 164112749.0, + "step": 4298 + }, + { + "epoch": 0.546876987660603, + "ewc_loss": 0.006712995003908873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.712994945701212e-05, + "grad_norm": 3.4288599491119385, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8571053743362427, + "num_tokens": 164152517.0, + "step": 4299 + }, + { + "epoch": 0.5470041979391935, + "ewc_loss": 0.006647412199527025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.647412374150008e-05, + "grad_norm": 3.3858327865600586, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.86885005235672, + "num_tokens": 164195245.0, + "step": 4300 + }, + { + "epoch": 0.5471314082177839, + "ewc_loss": 0.006659475155174732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.659475184278563e-05, + "grad_norm": 3.4354031085968018, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.857631266117096, + "num_tokens": 164230643.0, + "step": 4301 + }, + { + "epoch": 0.5472586184963745, + "ewc_loss": 0.006698726676404476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.698726792819798e-05, + "grad_norm": 3.422306776046753, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8591997623443604, + "num_tokens": 164271089.0, + "step": 4302 + }, + { + "epoch": 0.547385828774965, + "ewc_loss": 0.006662140600383282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.662140367552638e-05, + "grad_norm": 3.4275872707366943, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8600622415542603, + "num_tokens": 164310618.0, + "step": 4303 + }, + { + "epoch": 0.5475130390535555, + "ewc_loss": 0.006670503411442041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.67050335323438e-05, + "grad_norm": 3.463306427001953, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8494298458099365, + "num_tokens": 164348630.0, + "step": 4304 + }, + { + "epoch": 0.5476402493321461, + "ewc_loss": 0.006684653460979462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.684653635602444e-05, + "grad_norm": 3.432295560836792, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8698381185531616, + "num_tokens": 164383735.0, + "step": 4305 + }, + { + "epoch": 0.5477674596107366, + "ewc_loss": 0.00665889959782362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.658899656031281e-05, + "grad_norm": 3.380915880203247, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8640869855880737, + "num_tokens": 164426115.0, + "step": 4306 + }, + { + "epoch": 0.5478946698893271, + "ewc_loss": 0.006634742021560669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.634742021560669e-05, + "grad_norm": 3.455568552017212, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8542348146438599, + "num_tokens": 164462119.0, + "step": 4307 + }, + { + "epoch": 0.5480218801679175, + "ewc_loss": 0.00670994259417057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.709942681482062e-05, + "grad_norm": 3.46116304397583, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8595530390739441, + "num_tokens": 164503030.0, + "step": 4308 + }, + { + "epoch": 0.5481490904465081, + "ewc_loss": 0.006676963530480862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.676963676000014e-05, + "grad_norm": 3.4445245265960693, + "learning_rate": 1e-06, + "loss": 0.3925, + 
"mean_token_accuracy": 0.865443229675293, + "num_tokens": 164541091.0, + "step": 4309 + }, + { + "epoch": 0.5482763007250986, + "ewc_loss": 0.00666822399944067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.668223795713857e-05, + "grad_norm": 3.462249755859375, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8564709424972534, + "num_tokens": 164577678.0, + "step": 4310 + }, + { + "epoch": 0.5484035110036891, + "ewc_loss": 0.006682662758976221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.682662933599204e-05, + "grad_norm": 3.443225145339966, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8596247434616089, + "num_tokens": 164612779.0, + "step": 4311 + }, + { + "epoch": 0.5485307212822796, + "ewc_loss": 0.006663896143436432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.66389605612494e-05, + "grad_norm": 3.3961422443389893, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8674626350402832, + "num_tokens": 164649849.0, + "step": 4312 + }, + { + "epoch": 0.5486579315608702, + "ewc_loss": 0.006649262737482786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.649262650171295e-05, + "grad_norm": 3.4370312690734863, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.859004020690918, + "num_tokens": 164690538.0, + "step": 4313 + }, + { + "epoch": 0.5487851418394606, + "ewc_loss": 0.006682010367512703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.682010280201212e-05, + "grad_norm": 3.415637493133545, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8617493510246277, + "num_tokens": 164729338.0, + "step": 4314 + }, + { + "epoch": 0.5489123521180511, + "ewc_loss": 0.006659237202256918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.659237260464579e-05, + "grad_norm": 3.448376417160034, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8724097013473511, + "num_tokens": 164766648.0, + "step": 4315 + }, + { + "epoch": 
0.5490395623966416, + "ewc_loss": 0.006689185276627541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.689185102004558e-05, + "grad_norm": 3.4231112003326416, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8608849048614502, + "num_tokens": 164804254.0, + "step": 4316 + }, + { + "epoch": 0.5491667726752322, + "ewc_loss": 0.006658399011939764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.658399070147425e-05, + "grad_norm": 3.4075565338134766, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8754933476448059, + "num_tokens": 164839943.0, + "step": 4317 + }, + { + "epoch": 0.5492939829538227, + "ewc_loss": 0.006667370442301035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.667370325885713e-05, + "grad_norm": 3.3872833251953125, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8545495867729187, + "num_tokens": 164887057.0, + "step": 4318 + }, + { + "epoch": 0.5494211932324132, + "ewc_loss": 0.0066422573290765285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.642257358180359e-05, + "grad_norm": 3.453777313232422, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8618146181106567, + "num_tokens": 164928515.0, + "step": 4319 + }, + { + "epoch": 0.5495484035110036, + "ewc_loss": 0.006696857046335936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.696856871712953e-05, + "grad_norm": 3.455109119415283, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8614832758903503, + "num_tokens": 164963818.0, + "step": 4320 + }, + { + "epoch": 0.5496756137895942, + "ewc_loss": 0.0066606830805540085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.660682993242517e-05, + "grad_norm": 3.455888271331787, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8749590516090393, + "num_tokens": 165001091.0, + "step": 4321 + }, + { + "epoch": 0.5498028240681847, + "ewc_loss": 0.00666275667026639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.66275664116256e-05, + "grad_norm": 3.3897485733032227, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.845402717590332, + "num_tokens": 165046009.0, + "step": 4322 + }, + { + "epoch": 0.5499300343467752, + "ewc_loss": 0.006625577341765165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.625577225349844e-05, + "grad_norm": 3.3986570835113525, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8593108057975769, + "num_tokens": 165090729.0, + "step": 4323 + }, + { + "epoch": 0.5500572446253658, + "ewc_loss": 0.006647173780947924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.647173722740263e-05, + "grad_norm": 3.3719263076782227, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.879724383354187, + "num_tokens": 165129771.0, + "step": 4324 + }, + { + "epoch": 0.5501844549039563, + "ewc_loss": 0.006635562051087618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.635562021983787e-05, + "grad_norm": 3.4549174308776855, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8769842386245728, + "num_tokens": 165164310.0, + "step": 4325 + }, + { + "epoch": 0.5503116651825467, + "ewc_loss": 0.0066725448705255985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.67254498694092e-05, + "grad_norm": 3.411515235900879, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8585878014564514, + "num_tokens": 165202842.0, + "step": 4326 + }, + { + "epoch": 0.5504388754611372, + "ewc_loss": 0.00662586186081171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.625861715292558e-05, + "grad_norm": 3.5654454231262207, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8665748238563538, + "num_tokens": 165235998.0, + "step": 4327 + }, + { + "epoch": 0.5505660857397278, + "ewc_loss": 0.006733032874763012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.733032932970673e-05, + "grad_norm": 3.429260730743408, + "learning_rate": 1e-06, + "loss": 0.3958, + 
"mean_token_accuracy": 0.8632148504257202, + "num_tokens": 165271779.0, + "step": 4328 + }, + { + "epoch": 0.5506932960183183, + "ewc_loss": 0.006589645519852638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.58964563626796e-05, + "grad_norm": 3.4407217502593994, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8567638397216797, + "num_tokens": 165306136.0, + "step": 4329 + }, + { + "epoch": 0.5508205062969088, + "ewc_loss": 0.006667379289865494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.66737905703485e-05, + "grad_norm": 3.4142191410064697, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8774893283843994, + "num_tokens": 165342401.0, + "step": 4330 + }, + { + "epoch": 0.5509477165754993, + "ewc_loss": 0.0066341375932097435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.63413738948293e-05, + "grad_norm": 3.4026060104370117, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8641664385795593, + "num_tokens": 165380120.0, + "step": 4331 + }, + { + "epoch": 0.5510749268540898, + "ewc_loss": 0.006640431005507708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.640431092819199e-05, + "grad_norm": 3.4886362552642822, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.849144458770752, + "num_tokens": 165418210.0, + "step": 4332 + }, + { + "epoch": 0.5512021371326803, + "ewc_loss": 0.006704084575176239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.704084808006883e-05, + "grad_norm": 3.4727389812469482, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8420853018760681, + "num_tokens": 165460952.0, + "step": 4333 + }, + { + "epoch": 0.5513293474112708, + "ewc_loss": 0.006670322734862566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.670322909485549e-05, + "grad_norm": 3.3721306324005127, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8752626180648804, + "num_tokens": 165500370.0, + "step": 4334 + }, + { + "epoch": 
0.5514565576898613, + "ewc_loss": 0.006622237619012594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.622237560804933e-05, + "grad_norm": 3.43070387840271, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8627753257751465, + "num_tokens": 165534089.0, + "step": 4335 + }, + { + "epoch": 0.5515837679684519, + "ewc_loss": 0.006691297981888056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.691298040095717e-05, + "grad_norm": 3.4520905017852783, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8558169603347778, + "num_tokens": 165573111.0, + "step": 4336 + }, + { + "epoch": 0.5517109782470424, + "ewc_loss": 0.0066757951863110065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.675795157207176e-05, + "grad_norm": 3.4581360816955566, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.853889524936676, + "num_tokens": 165611942.0, + "step": 4337 + }, + { + "epoch": 0.5518381885256328, + "ewc_loss": 0.006678805220872164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.678805220872164e-05, + "grad_norm": 3.512610912322998, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8710207939147949, + "num_tokens": 165651703.0, + "step": 4338 + }, + { + "epoch": 0.5519653988042234, + "ewc_loss": 0.006714095827192068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.714095798088238e-05, + "grad_norm": 3.40211820602417, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.860478401184082, + "num_tokens": 165691257.0, + "step": 4339 + }, + { + "epoch": 0.5520926090828139, + "ewc_loss": 0.006626542191952467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.626542017329484e-05, + "grad_norm": 3.4345862865448, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.87828528881073, + "num_tokens": 165727947.0, + "step": 4340 + }, + { + "epoch": 0.5522198193614044, + "ewc_loss": 0.006680537015199661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.68053689878434e-05, + 
"grad_norm": 3.405337333679199, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.870673656463623, + "num_tokens": 165763400.0, + "step": 4341 + }, + { + "epoch": 0.5523470296399949, + "ewc_loss": 0.0066580986604094505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.658098573097959e-05, + "grad_norm": 3.3802287578582764, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8648219108581543, + "num_tokens": 165809586.0, + "step": 4342 + }, + { + "epoch": 0.5524742399185855, + "ewc_loss": 0.006634383462369442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.63438331685029e-05, + "grad_norm": 3.515796661376953, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8648821115493774, + "num_tokens": 165839565.0, + "step": 4343 + }, + { + "epoch": 0.5526014501971759, + "ewc_loss": 0.006724406499415636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.724406557623297e-05, + "grad_norm": 3.5281271934509277, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8655045032501221, + "num_tokens": 165873539.0, + "step": 4344 + }, + { + "epoch": 0.5527286604757664, + "ewc_loss": 0.006684497930109501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.684497930109501e-05, + "grad_norm": 3.342038154602051, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8719054460525513, + "num_tokens": 165920987.0, + "step": 4345 + }, + { + "epoch": 0.5528558707543569, + "ewc_loss": 0.006577400024980307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.57740019960329e-05, + "grad_norm": 3.434624671936035, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8509733080863953, + "num_tokens": 165965661.0, + "step": 4346 + }, + { + "epoch": 0.5529830810329475, + "ewc_loss": 0.0067012375220656395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.701237725792453e-05, + "grad_norm": 3.5115878582000732, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 
0.8561004996299744, + "num_tokens": 165998239.0, + "step": 4347 + }, + { + "epoch": 0.553110291311538, + "ewc_loss": 0.006699581164866686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.699580990243703e-05, + "grad_norm": 3.421767234802246, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8601952791213989, + "num_tokens": 166036489.0, + "step": 4348 + }, + { + "epoch": 0.5532375015901285, + "ewc_loss": 0.006638885010033846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.638884951826185e-05, + "grad_norm": 3.3713908195495605, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8580045700073242, + "num_tokens": 166078774.0, + "step": 4349 + }, + { + "epoch": 0.5533647118687189, + "ewc_loss": 0.006633695214986801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.633695011259988e-05, + "grad_norm": 3.4630842208862305, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8706218600273132, + "num_tokens": 166119555.0, + "step": 4350 + }, + { + "epoch": 0.5534919221473095, + "ewc_loss": 0.006697772070765495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.697772187180817e-05, + "grad_norm": 3.4561057090759277, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8613545894622803, + "num_tokens": 166159788.0, + "step": 4351 + }, + { + "epoch": 0.5536191324259, + "ewc_loss": 0.0066606756299734116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.660675717284903e-05, + "grad_norm": 3.414903402328491, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.858322262763977, + "num_tokens": 166200230.0, + "step": 4352 + }, + { + "epoch": 0.5537463427044905, + "ewc_loss": 0.006639185361564159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.639185448875651e-05, + "grad_norm": 3.435194730758667, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8676892518997192, + "num_tokens": 166237864.0, + "step": 4353 + }, + { + "epoch": 0.553873552983081, + "ewc_loss": 
0.006651515141129494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.651515286648646e-05, + "grad_norm": 3.399820327758789, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8654937744140625, + "num_tokens": 166278347.0, + "step": 4354 + }, + { + "epoch": 0.5540007632616716, + "ewc_loss": 0.006619998719543219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.619998748647049e-05, + "grad_norm": 3.4867312908172607, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.852536678314209, + "num_tokens": 166315744.0, + "step": 4355 + }, + { + "epoch": 0.554127973540262, + "ewc_loss": 0.006685992237180471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.685992411803454e-05, + "grad_norm": 3.441617488861084, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8629432320594788, + "num_tokens": 166352705.0, + "step": 4356 + }, + { + "epoch": 0.5542551838188525, + "ewc_loss": 0.006634007673710585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.634007877437398e-05, + "grad_norm": 3.3876094818115234, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8675806522369385, + "num_tokens": 166393956.0, + "step": 4357 + }, + { + "epoch": 0.554382394097443, + "ewc_loss": 0.0066205356270074844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.620535714318976e-05, + "grad_norm": 3.4998936653137207, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8385454416275024, + "num_tokens": 166430172.0, + "step": 4358 + }, + { + "epoch": 0.5545096043760336, + "ewc_loss": 0.006705341394990683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.705341365886852e-05, + "grad_norm": 3.4273765087127686, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8626812100410461, + "num_tokens": 166468732.0, + "step": 4359 + }, + { + "epoch": 0.5546368146546241, + "ewc_loss": 0.006638452876359224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.638452759943902e-05, + "grad_norm": 
3.381850004196167, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8576017618179321, + "num_tokens": 166511383.0, + "step": 4360 + }, + { + "epoch": 0.5547640249332146, + "ewc_loss": 0.006640081759542227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.640081846853718e-05, + "grad_norm": 3.4305684566497803, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.867355465888977, + "num_tokens": 166547674.0, + "step": 4361 + }, + { + "epoch": 0.5548912352118052, + "ewc_loss": 0.006680609658360481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.680609658360481e-05, + "grad_norm": 3.496211290359497, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8559489846229553, + "num_tokens": 166581763.0, + "step": 4362 + }, + { + "epoch": 0.5550184454903956, + "ewc_loss": 0.006703557446599007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.703557301079854e-05, + "grad_norm": 3.4696826934814453, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8607028722763062, + "num_tokens": 166618604.0, + "step": 4363 + }, + { + "epoch": 0.5551456557689861, + "ewc_loss": 0.006680464372038841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.680464139208198e-05, + "grad_norm": 3.412691593170166, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8725180625915527, + "num_tokens": 166654812.0, + "step": 4364 + }, + { + "epoch": 0.5552728660475766, + "ewc_loss": 0.006655261851847172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.655261677224189e-05, + "grad_norm": 3.4865915775299072, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8500415086746216, + "num_tokens": 166693206.0, + "step": 4365 + }, + { + "epoch": 0.5554000763261672, + "ewc_loss": 0.006721883080899715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.721883255522698e-05, + "grad_norm": 3.4066498279571533, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8613991737365723, + 
"num_tokens": 166734531.0, + "step": 4366 + }, + { + "epoch": 0.5555272866047577, + "ewc_loss": 0.00666723633185029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.667236448265612e-05, + "grad_norm": 3.458277702331543, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8606324195861816, + "num_tokens": 166771362.0, + "step": 4367 + }, + { + "epoch": 0.5556544968833482, + "ewc_loss": 0.006717566400766373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.717566429870203e-05, + "grad_norm": 3.5124757289886475, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8526008129119873, + "num_tokens": 166802784.0, + "step": 4368 + }, + { + "epoch": 0.5557817071619386, + "ewc_loss": 0.006741787306964397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.741787365172058e-05, + "grad_norm": 3.4813296794891357, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8582745790481567, + "num_tokens": 166835590.0, + "step": 4369 + }, + { + "epoch": 0.5559089174405292, + "ewc_loss": 0.006707257125526667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.707257125526667e-05, + "grad_norm": 3.457737445831299, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8580580353736877, + "num_tokens": 166872145.0, + "step": 4370 + }, + { + "epoch": 0.5560361277191197, + "ewc_loss": 0.00672051589936018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.720516103086993e-05, + "grad_norm": 3.411484479904175, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8589210510253906, + "num_tokens": 166912855.0, + "step": 4371 + }, + { + "epoch": 0.5561633379977102, + "ewc_loss": 0.006729956716299057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.729956658091396e-05, + "grad_norm": 3.4615461826324463, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8478438854217529, + "num_tokens": 166949482.0, + "step": 4372 + }, + { + "epoch": 0.5562905482763008, + "ewc_loss": 0.0067628961987793446, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.762896373402327e-05, + "grad_norm": 3.440978527069092, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8621460199356079, + "num_tokens": 166986727.0, + "step": 4373 + }, + { + "epoch": 0.5564177585548913, + "ewc_loss": 0.006751100067049265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.751099863322452e-05, + "grad_norm": 3.40993595123291, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8550363779067993, + "num_tokens": 167025869.0, + "step": 4374 + }, + { + "epoch": 0.5565449688334817, + "ewc_loss": 0.0067528244107961655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.752824265277013e-05, + "grad_norm": 3.4325454235076904, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8720519542694092, + "num_tokens": 167059825.0, + "step": 4375 + }, + { + "epoch": 0.5566721791120722, + "ewc_loss": 0.0067756278440356255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.775627844035625e-05, + "grad_norm": 3.448317050933838, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.866856575012207, + "num_tokens": 167097375.0, + "step": 4376 + }, + { + "epoch": 0.5567993893906628, + "ewc_loss": 0.006766035221517086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.766035221517086e-05, + "grad_norm": 3.434436798095703, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8538950085639954, + "num_tokens": 167136555.0, + "step": 4377 + }, + { + "epoch": 0.5569265996692533, + "ewc_loss": 0.006761924829334021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.761925033060834e-05, + "grad_norm": 3.391706943511963, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8599562644958496, + "num_tokens": 167179870.0, + "step": 4378 + }, + { + "epoch": 0.5570538099478438, + "ewc_loss": 0.006737784016877413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.737784133292735e-05, + "grad_norm": 3.461040735244751, + "learning_rate": 
1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8615477085113525, + "num_tokens": 167215242.0, + "step": 4379 + }, + { + "epoch": 0.5571810202264343, + "ewc_loss": 0.006789656821638346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.789656617911533e-05, + "grad_norm": 3.4865543842315674, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8696763515472412, + "num_tokens": 167250690.0, + "step": 4380 + }, + { + "epoch": 0.5573082305050248, + "ewc_loss": 0.006781402975320816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.781402771594003e-05, + "grad_norm": 3.522974729537964, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8495307564735413, + "num_tokens": 167283677.0, + "step": 4381 + }, + { + "epoch": 0.5574354407836153, + "ewc_loss": 0.006779914256185293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.779914110666141e-05, + "grad_norm": 3.3972768783569336, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.863868236541748, + "num_tokens": 167321355.0, + "step": 4382 + }, + { + "epoch": 0.5575626510622058, + "ewc_loss": 0.006702183745801449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.702183600282297e-05, + "grad_norm": 3.458336591720581, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8553968071937561, + "num_tokens": 167363103.0, + "step": 4383 + }, + { + "epoch": 0.5576898613407963, + "ewc_loss": 0.006779477000236511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.779476825613528e-05, + "grad_norm": 3.39017391204834, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8758180141448975, + "num_tokens": 167399732.0, + "step": 4384 + }, + { + "epoch": 0.5578170716193869, + "ewc_loss": 0.006711674854159355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.711675086989999e-05, + "grad_norm": 3.4987385272979736, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8613090515136719, + "num_tokens": 167434938.0, + "step": 4385 + }, + 
{ + "epoch": 0.5579442818979774, + "ewc_loss": 0.0067963809706270695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.796381057938561e-05, + "grad_norm": 3.4559171199798584, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8656001091003418, + "num_tokens": 167472293.0, + "step": 4386 + }, + { + "epoch": 0.5580714921765678, + "ewc_loss": 0.006726888939738274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.726889114361256e-05, + "grad_norm": 3.3922722339630127, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8656392693519592, + "num_tokens": 167511015.0, + "step": 4387 + }, + { + "epoch": 0.5581987024551583, + "ewc_loss": 0.0066964291036129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.696429045405239e-05, + "grad_norm": 3.4208691120147705, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.860558807849884, + "num_tokens": 167549777.0, + "step": 4388 + }, + { + "epoch": 0.5583259127337489, + "ewc_loss": 0.006724225357174873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.724225386278704e-05, + "grad_norm": 3.3779876232147217, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8689748048782349, + "num_tokens": 167592342.0, + "step": 4389 + }, + { + "epoch": 0.5584531230123394, + "ewc_loss": 0.0066980463452637196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.698046490782872e-05, + "grad_norm": 3.4360511302948, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.866805911064148, + "num_tokens": 167630829.0, + "step": 4390 + }, + { + "epoch": 0.5585803332909299, + "ewc_loss": 0.006732326932251453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.732327165082097e-05, + "grad_norm": 3.439451217651367, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8510340452194214, + "num_tokens": 167675938.0, + "step": 4391 + }, + { + "epoch": 0.5587075435695205, + "ewc_loss": 0.006714965216815472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.714965275023133e-05, + "grad_norm": 3.4108052253723145, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8665727376937866, + "num_tokens": 167716550.0, + "step": 4392 + }, + { + "epoch": 0.5588347538481109, + "ewc_loss": 0.006679519545286894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.679519719909877e-05, + "grad_norm": 3.440922260284424, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8491617441177368, + "num_tokens": 167754401.0, + "step": 4393 + }, + { + "epoch": 0.5589619641267014, + "ewc_loss": 0.006701684556901455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.701684469589964e-05, + "grad_norm": 3.438032865524292, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8664194941520691, + "num_tokens": 167792124.0, + "step": 4394 + }, + { + "epoch": 0.5590891744052919, + "ewc_loss": 0.006682181265205145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.682181265205145e-05, + "grad_norm": 3.432478904724121, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8803207874298096, + "num_tokens": 167829226.0, + "step": 4395 + }, + { + "epoch": 0.5592163846838825, + "ewc_loss": 0.0066856155171990395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.68561551719904e-05, + "grad_norm": 3.4731972217559814, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8731414675712585, + "num_tokens": 167863439.0, + "step": 4396 + }, + { + "epoch": 0.559343594962473, + "ewc_loss": 0.006695558782666922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.695558840874583e-05, + "grad_norm": 3.3838984966278076, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8632603883743286, + "num_tokens": 167907019.0, + "step": 4397 + }, + { + "epoch": 0.5594708052410635, + "ewc_loss": 0.0066390326246619225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.639032653765753e-05, + "grad_norm": 3.4750170707702637, + "learning_rate": 1e-06, + "loss": 0.4246, + 
"mean_token_accuracy": 0.8560224771499634, + "num_tokens": 167942539.0, + "step": 4398 + }, + { + "epoch": 0.5595980155196539, + "ewc_loss": 0.0067098322324454784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.709832086926326e-05, + "grad_norm": 3.39477801322937, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8623461127281189, + "num_tokens": 167982848.0, + "step": 4399 + }, + { + "epoch": 0.5597252257982445, + "ewc_loss": 0.006635906174778938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.635906174778938e-05, + "grad_norm": 3.4369261264801025, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.871592104434967, + "num_tokens": 168017500.0, + "step": 4400 + }, + { + "epoch": 0.559852436076835, + "ewc_loss": 0.006696790456771851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.696790660498664e-05, + "grad_norm": 3.47735857963562, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8474470973014832, + "num_tokens": 168052790.0, + "step": 4401 + }, + { + "epoch": 0.5599796463554255, + "ewc_loss": 0.006700762081891298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.700761878164485e-05, + "grad_norm": 3.4742393493652344, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8704543113708496, + "num_tokens": 168082541.0, + "step": 4402 + }, + { + "epoch": 0.560106856634016, + "ewc_loss": 0.00668379059061408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.683790707029402e-05, + "grad_norm": 3.382550001144409, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8673949241638184, + "num_tokens": 168128218.0, + "step": 4403 + }, + { + "epoch": 0.5602340669126066, + "ewc_loss": 0.0066421120427548885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.642111839028075e-05, + "grad_norm": 3.4452998638153076, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8658776879310608, + "num_tokens": 168165496.0, + "step": 4404 + }, + { + "epoch": 
0.560361277191197, + "ewc_loss": 0.006698960438370705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.698960351059213e-05, + "grad_norm": 3.4358270168304443, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8492015600204468, + "num_tokens": 168204675.0, + "step": 4405 + }, + { + "epoch": 0.5604884874697875, + "ewc_loss": 0.00668351212516427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.683512037852779e-05, + "grad_norm": 3.444298505783081, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8568373322486877, + "num_tokens": 168244832.0, + "step": 4406 + }, + { + "epoch": 0.560615697748378, + "ewc_loss": 0.006684508174657822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.684508116450161e-05, + "grad_norm": 3.4295525550842285, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8578612804412842, + "num_tokens": 168282440.0, + "step": 4407 + }, + { + "epoch": 0.5607429080269686, + "ewc_loss": 0.006691484246402979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.69148430461064e-05, + "grad_norm": 3.4163568019866943, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8675340414047241, + "num_tokens": 168325303.0, + "step": 4408 + }, + { + "epoch": 0.5608701183055591, + "ewc_loss": 0.006679386366158724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.679386569885537e-05, + "grad_norm": 3.457920551300049, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8683359622955322, + "num_tokens": 168360628.0, + "step": 4409 + }, + { + "epoch": 0.5609973285841496, + "ewc_loss": 0.006700559519231319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.700559606542811e-05, + "grad_norm": 3.466919183731079, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8507131338119507, + "num_tokens": 168398601.0, + "step": 4410 + }, + { + "epoch": 0.5611245388627402, + "ewc_loss": 0.006701367441564798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.701367237837985e-05, + "grad_norm": 3.5104165077209473, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8496842384338379, + "num_tokens": 168436893.0, + "step": 4411 + }, + { + "epoch": 0.5612517491413306, + "ewc_loss": 0.0067253937013447285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.725393905071542e-05, + "grad_norm": 3.389042854309082, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.858183741569519, + "num_tokens": 168478794.0, + "step": 4412 + }, + { + "epoch": 0.5613789594199211, + "ewc_loss": 0.006642671301960945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.642671360168606e-05, + "grad_norm": 3.403524160385132, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8601090908050537, + "num_tokens": 168520184.0, + "step": 4413 + }, + { + "epoch": 0.5615061696985116, + "ewc_loss": 0.006696272641420364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.696272612316534e-05, + "grad_norm": 3.4863860607147217, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8595188856124878, + "num_tokens": 168554305.0, + "step": 4414 + }, + { + "epoch": 0.5616333799771022, + "ewc_loss": 0.006739222444593906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.739222590113059e-05, + "grad_norm": 3.432033061981201, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8474267721176147, + "num_tokens": 168597752.0, + "step": 4415 + }, + { + "epoch": 0.5617605902556927, + "ewc_loss": 0.006681438535451889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.681438389932737e-05, + "grad_norm": 3.4386935234069824, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8674818873405457, + "num_tokens": 168637764.0, + "step": 4416 + }, + { + "epoch": 0.5618878005342832, + "ewc_loss": 0.006714957766234875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.714957999065518e-05, + "grad_norm": 3.468289852142334, + "learning_rate": 1e-06, + "loss": 0.4594, + 
"mean_token_accuracy": 0.8441655039787292, + "num_tokens": 168672260.0, + "step": 4417 + }, + { + "epoch": 0.5620150108128736, + "ewc_loss": 0.0067236279137432575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.72362803015858e-05, + "grad_norm": 3.3881211280822754, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8708224296569824, + "num_tokens": 168711478.0, + "step": 4418 + }, + { + "epoch": 0.5621422210914642, + "ewc_loss": 0.006670370697975159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.670370930805802e-05, + "grad_norm": 3.4062366485595703, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8651992082595825, + "num_tokens": 168751773.0, + "step": 4419 + }, + { + "epoch": 0.5622694313700547, + "ewc_loss": 0.006714432034641504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.714431947330013e-05, + "grad_norm": 3.471820831298828, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.871868371963501, + "num_tokens": 168786364.0, + "step": 4420 + }, + { + "epoch": 0.5623966416486452, + "ewc_loss": 0.0067396643571555614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.73966424074024e-05, + "grad_norm": 3.4259119033813477, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8599342107772827, + "num_tokens": 168826819.0, + "step": 4421 + }, + { + "epoch": 0.5625238519272358, + "ewc_loss": 0.00666832784190774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.66832784190774e-05, + "grad_norm": 3.401118755340576, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8609445095062256, + "num_tokens": 168869405.0, + "step": 4422 + }, + { + "epoch": 0.5626510622058263, + "ewc_loss": 0.006673845928162336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.673845928162336e-05, + "grad_norm": 3.4664649963378906, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8684765696525574, + "num_tokens": 168904323.0, + "step": 4423 + }, + { + "epoch": 
0.5627782724844167, + "ewc_loss": 0.0067110550589859486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.71105517540127e-05, + "grad_norm": 3.402681827545166, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8539356589317322, + "num_tokens": 168948654.0, + "step": 4424 + }, + { + "epoch": 0.5629054827630072, + "ewc_loss": 0.0066459630616009235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.645963003393263e-05, + "grad_norm": 3.4403326511383057, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8681243062019348, + "num_tokens": 168984333.0, + "step": 4425 + }, + { + "epoch": 0.5630326930415978, + "ewc_loss": 0.006694300100207329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.694300100207329e-05, + "grad_norm": 3.476649522781372, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8643208742141724, + "num_tokens": 169019843.0, + "step": 4426 + }, + { + "epoch": 0.5631599033201883, + "ewc_loss": 0.006702952552586794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.702952669002116e-05, + "grad_norm": 3.4313628673553467, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.874334454536438, + "num_tokens": 169058177.0, + "step": 4427 + }, + { + "epoch": 0.5632871135987788, + "ewc_loss": 0.00665789982303977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.657899939455092e-05, + "grad_norm": 3.380542278289795, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8680503964424133, + "num_tokens": 169102268.0, + "step": 4428 + }, + { + "epoch": 0.5634143238773693, + "ewc_loss": 0.0066490937024354935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.649093847954646e-05, + "grad_norm": 3.410705089569092, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8571018576622009, + "num_tokens": 169141709.0, + "step": 4429 + }, + { + "epoch": 0.5635415341559598, + "ewc_loss": 0.006676536053419113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.6765358496923e-05, + "grad_norm": 3.5324480533599854, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8594958782196045, + "num_tokens": 169177107.0, + "step": 4430 + }, + { + "epoch": 0.5636687444345503, + "ewc_loss": 0.006718507036566734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.718507211189717e-05, + "grad_norm": 3.438788652420044, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8589782118797302, + "num_tokens": 169211531.0, + "step": 4431 + }, + { + "epoch": 0.5637959547131408, + "ewc_loss": 0.006633634679019451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.63363462081179e-05, + "grad_norm": 3.4772911071777344, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8553479909896851, + "num_tokens": 169247507.0, + "step": 4432 + }, + { + "epoch": 0.5639231649917313, + "ewc_loss": 0.0067028324119746685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.702832615701482e-05, + "grad_norm": 3.4379618167877197, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8751978874206543, + "num_tokens": 169281007.0, + "step": 4433 + }, + { + "epoch": 0.5640503752703219, + "ewc_loss": 0.0066555943340063095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.655594188487157e-05, + "grad_norm": 3.4018476009368896, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8627625703811646, + "num_tokens": 169322295.0, + "step": 4434 + }, + { + "epoch": 0.5641775855489124, + "ewc_loss": 0.006631456781178713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.631456926697865e-05, + "grad_norm": 3.427795171737671, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8588277697563171, + "num_tokens": 169358184.0, + "step": 4435 + }, + { + "epoch": 0.5643047958275028, + "ewc_loss": 0.006691457238048315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.691457383567467e-05, + "grad_norm": 3.4866700172424316, + "learning_rate": 1e-06, + "loss": 0.401, + 
"mean_token_accuracy": 0.8683322668075562, + "num_tokens": 169391518.0, + "step": 4436 + }, + { + "epoch": 0.5644320061060933, + "ewc_loss": 0.006689903326332569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.689903239021078e-05, + "grad_norm": 3.4760630130767822, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8626728653907776, + "num_tokens": 169426743.0, + "step": 4437 + }, + { + "epoch": 0.5645592163846839, + "ewc_loss": 0.0066757346503436565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.675734766758978e-05, + "grad_norm": 3.382638454437256, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8810961842536926, + "num_tokens": 169466452.0, + "step": 4438 + }, + { + "epoch": 0.5646864266632744, + "ewc_loss": 0.006645838730037212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.64583858451806e-05, + "grad_norm": 3.519106864929199, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8672028183937073, + "num_tokens": 169500295.0, + "step": 4439 + }, + { + "epoch": 0.5648136369418649, + "ewc_loss": 0.006745119113475084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.745119026163593e-05, + "grad_norm": 3.3918230533599854, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8620051145553589, + "num_tokens": 169541829.0, + "step": 4440 + }, + { + "epoch": 0.5649408472204555, + "ewc_loss": 0.006628494709730148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.628494884353131e-05, + "grad_norm": 3.4767394065856934, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8466535210609436, + "num_tokens": 169581547.0, + "step": 4441 + }, + { + "epoch": 0.5650680574990459, + "ewc_loss": 0.006726991850882769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.726991705363616e-05, + "grad_norm": 3.397085189819336, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8668146729469299, + "num_tokens": 169622107.0, + "step": 4442 + }, + { + "epoch": 
0.5651952677776364, + "ewc_loss": 0.006650455296039581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.65045517962426e-05, + "grad_norm": 3.4921140670776367, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.860988199710846, + "num_tokens": 169657258.0, + "step": 4443 + }, + { + "epoch": 0.5653224780562269, + "ewc_loss": 0.006734625436365604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.734625640092418e-05, + "grad_norm": 3.4797027111053467, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8520858287811279, + "num_tokens": 169691568.0, + "step": 4444 + }, + { + "epoch": 0.5654496883348175, + "ewc_loss": 0.0067043243907392025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.704324187012389e-05, + "grad_norm": 3.443311929702759, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8684030175209045, + "num_tokens": 169732429.0, + "step": 4445 + }, + { + "epoch": 0.565576898613408, + "ewc_loss": 0.006700838916003704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.700839003315195e-05, + "grad_norm": 3.4513659477233887, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8558670282363892, + "num_tokens": 169769224.0, + "step": 4446 + }, + { + "epoch": 0.5657041088919985, + "ewc_loss": 0.006720623932778835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.720623787259683e-05, + "grad_norm": 3.425548791885376, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8620286583900452, + "num_tokens": 169812830.0, + "step": 4447 + }, + { + "epoch": 0.5658313191705889, + "ewc_loss": 0.006694445386528969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.694445619359612e-05, + "grad_norm": 3.485245704650879, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8600752353668213, + "num_tokens": 169843823.0, + "step": 4448 + }, + { + "epoch": 0.5659585294491795, + "ewc_loss": 0.006736208219081163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.736208160873502e-05, + "grad_norm": 3.440619468688965, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8644412755966187, + "num_tokens": 169882847.0, + "step": 4449 + }, + { + "epoch": 0.56608573972777, + "ewc_loss": 0.0067067076452076435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.706707790726796e-05, + "grad_norm": 3.4775681495666504, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8453096151351929, + "num_tokens": 169921264.0, + "step": 4450 + }, + { + "epoch": 0.5662129500063605, + "ewc_loss": 0.006741420831531286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.741420656908303e-05, + "grad_norm": 3.5169825553894043, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8687570095062256, + "num_tokens": 169949681.0, + "step": 4451 + }, + { + "epoch": 0.566340160284951, + "ewc_loss": 0.0067696236073970795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.769623723812401e-05, + "grad_norm": 3.432239532470703, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8758089542388916, + "num_tokens": 169986254.0, + "step": 4452 + }, + { + "epoch": 0.5664673705635416, + "ewc_loss": 0.006697514094412327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.697513890685514e-05, + "grad_norm": 3.426055669784546, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8589160442352295, + "num_tokens": 170022745.0, + "step": 4453 + }, + { + "epoch": 0.566594580842132, + "ewc_loss": 0.006741194985806942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.741195102222264e-05, + "grad_norm": 3.4381065368652344, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8738681077957153, + "num_tokens": 170057038.0, + "step": 4454 + }, + { + "epoch": 0.5667217911207225, + "ewc_loss": 0.006747217383235693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.747217412339523e-05, + "grad_norm": 3.4042775630950928, + "learning_rate": 1e-06, + "loss": 0.3517, + 
"mean_token_accuracy": 0.8800272345542908, + "num_tokens": 170098460.0, + "step": 4455 + }, + { + "epoch": 0.566849001399313, + "ewc_loss": 0.006723390892148018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.723390833940357e-05, + "grad_norm": 3.4494833946228027, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8499160408973694, + "num_tokens": 170137689.0, + "step": 4456 + }, + { + "epoch": 0.5669762116779036, + "ewc_loss": 0.006756891496479511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.756891525583342e-05, + "grad_norm": 3.4384708404541016, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8643587827682495, + "num_tokens": 170176716.0, + "step": 4457 + }, + { + "epoch": 0.5671034219564941, + "ewc_loss": 0.006731903180480003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.73190297675319e-05, + "grad_norm": 3.4719674587249756, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.859317421913147, + "num_tokens": 170212621.0, + "step": 4458 + }, + { + "epoch": 0.5672306322350846, + "ewc_loss": 0.006749007850885391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.749008025508374e-05, + "grad_norm": 3.4424891471862793, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8529535531997681, + "num_tokens": 170254493.0, + "step": 4459 + }, + { + "epoch": 0.5673578425136752, + "ewc_loss": 0.006719762925058603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.719763041473925e-05, + "grad_norm": 3.427482843399048, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8856484889984131, + "num_tokens": 170290559.0, + "step": 4460 + }, + { + "epoch": 0.5674850527922656, + "ewc_loss": 0.006722759455442429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.722759280819446e-05, + "grad_norm": 3.4085628986358643, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8564386367797852, + "num_tokens": 170335079.0, + "step": 4461 + }, + { + "epoch": 
0.5676122630708561, + "ewc_loss": 0.0066936262883245945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.693626346532255e-05, + "grad_norm": 3.4634265899658203, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.865018904209137, + "num_tokens": 170370665.0, + "step": 4462 + }, + { + "epoch": 0.5677394733494466, + "ewc_loss": 0.006735008209943771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.735008355462924e-05, + "grad_norm": 3.45332932472229, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8633064031600952, + "num_tokens": 170412585.0, + "step": 4463 + }, + { + "epoch": 0.5678666836280372, + "ewc_loss": 0.006701016332954168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.701016536680982e-05, + "grad_norm": 3.4296224117279053, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8688392639160156, + "num_tokens": 170450806.0, + "step": 4464 + }, + { + "epoch": 0.5679938939066277, + "ewc_loss": 0.006682964041829109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.682964158244431e-05, + "grad_norm": 3.518214225769043, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8621071577072144, + "num_tokens": 170483265.0, + "step": 4465 + }, + { + "epoch": 0.5681211041852182, + "ewc_loss": 0.00674907211214304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.749072053935379e-05, + "grad_norm": 3.4387879371643066, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8789640069007874, + "num_tokens": 170520910.0, + "step": 4466 + }, + { + "epoch": 0.5682483144638086, + "ewc_loss": 0.0066779013723134995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.677901546936482e-05, + "grad_norm": 3.4810805320739746, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8467703461647034, + "num_tokens": 170556516.0, + "step": 4467 + }, + { + "epoch": 0.5683755247423992, + "ewc_loss": 0.006733949761837721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.73394970363006e-05, + "grad_norm": 3.4867594242095947, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8601317405700684, + "num_tokens": 170595571.0, + "step": 4468 + }, + { + "epoch": 0.5685027350209897, + "ewc_loss": 0.006722435355186462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.722435500705615e-05, + "grad_norm": 3.3716697692871094, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8819332122802734, + "num_tokens": 170634117.0, + "step": 4469 + }, + { + "epoch": 0.5686299452995802, + "ewc_loss": 0.006667299661785364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.667299749096856e-05, + "grad_norm": 3.3912127017974854, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8576226830482483, + "num_tokens": 170677624.0, + "step": 4470 + }, + { + "epoch": 0.5687571555781707, + "ewc_loss": 0.006711932830512524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.711932655889541e-05, + "grad_norm": 3.4496030807495117, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.848314642906189, + "num_tokens": 170716732.0, + "step": 4471 + }, + { + "epoch": 0.5688843658567613, + "ewc_loss": 0.006733778864145279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.733778718626127e-05, + "grad_norm": 3.398876428604126, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8560018539428711, + "num_tokens": 170758521.0, + "step": 4472 + }, + { + "epoch": 0.5690115761353517, + "ewc_loss": 0.0066959429532289505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.695943011436611e-05, + "grad_norm": 3.4279532432556152, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8621485233306885, + "num_tokens": 170796799.0, + "step": 4473 + }, + { + "epoch": 0.5691387864139422, + "ewc_loss": 0.006729843094944954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.729843153152615e-05, + "grad_norm": 3.433739185333252, + "learning_rate": 1e-06, + "loss": 0.4487, + 
"mean_token_accuracy": 0.85002201795578, + "num_tokens": 170838569.0, + "step": 4474 + }, + { + "epoch": 0.5692659966925327, + "ewc_loss": 0.006719296798110008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.719296652590856e-05, + "grad_norm": 3.417128562927246, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8640691041946411, + "num_tokens": 170878206.0, + "step": 4475 + }, + { + "epoch": 0.5693932069711233, + "ewc_loss": 0.006702148821204901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.702148675685748e-05, + "grad_norm": 3.488696575164795, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8510016202926636, + "num_tokens": 170915506.0, + "step": 4476 + }, + { + "epoch": 0.5695204172497138, + "ewc_loss": 0.006760303396731615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.760303222108632e-05, + "grad_norm": 3.4509987831115723, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8581475019454956, + "num_tokens": 170952303.0, + "step": 4477 + }, + { + "epoch": 0.5696476275283043, + "ewc_loss": 0.006706364452838898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.706364365527406e-05, + "grad_norm": 3.4256703853607178, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8566639423370361, + "num_tokens": 170993757.0, + "step": 4478 + }, + { + "epoch": 0.5697748378068948, + "ewc_loss": 0.006703697144985199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.703696999466047e-05, + "grad_norm": 3.437833309173584, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8594977259635925, + "num_tokens": 171030240.0, + "step": 4479 + }, + { + "epoch": 0.5699020480854853, + "ewc_loss": 0.006724026054143906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.724026025040075e-05, + "grad_norm": 3.509248733520508, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8394432067871094, + "num_tokens": 171069654.0, + "step": 4480 + }, + { + "epoch": 
0.5700292583640758, + "ewc_loss": 0.006738236639648676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.738236697856337e-05, + "grad_norm": 3.436601161956787, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8455164432525635, + "num_tokens": 171107341.0, + "step": 4481 + }, + { + "epoch": 0.5701564686426663, + "ewc_loss": 0.0066918362863361835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.691836460959166e-05, + "grad_norm": 3.4962449073791504, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8537095785140991, + "num_tokens": 171144063.0, + "step": 4482 + }, + { + "epoch": 0.5702836789212569, + "ewc_loss": 0.006733555346727371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.733555346727371e-05, + "grad_norm": 3.4574036598205566, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.857183575630188, + "num_tokens": 171180448.0, + "step": 4483 + }, + { + "epoch": 0.5704108891998474, + "ewc_loss": 0.006697952747344971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.697952630929649e-05, + "grad_norm": 3.495436191558838, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8667526841163635, + "num_tokens": 171214220.0, + "step": 4484 + }, + { + "epoch": 0.5705380994784378, + "ewc_loss": 0.006736599840223789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.736599607393146e-05, + "grad_norm": 3.417961835861206, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8679364323616028, + "num_tokens": 171252798.0, + "step": 4485 + }, + { + "epoch": 0.5706653097570283, + "ewc_loss": 0.006685066036880016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.685066182399169e-05, + "grad_norm": 3.4153289794921875, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.866783082485199, + "num_tokens": 171295434.0, + "step": 4486 + }, + { + "epoch": 0.5707925200356189, + "ewc_loss": 0.006709156092256308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.709156150463969e-05, + "grad_norm": 3.4461026191711426, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8628525733947754, + "num_tokens": 171334129.0, + "step": 4487 + }, + { + "epoch": 0.5709197303142094, + "ewc_loss": 0.006721213925629854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.721213867422193e-05, + "grad_norm": 3.4989235401153564, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8587767481803894, + "num_tokens": 171370239.0, + "step": 4488 + }, + { + "epoch": 0.5710469405927999, + "ewc_loss": 0.006722546648234129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.722546822857112e-05, + "grad_norm": 3.4623188972473145, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8763593435287476, + "num_tokens": 171402411.0, + "step": 4489 + }, + { + "epoch": 0.5711741508713905, + "ewc_loss": 0.0066929832100868225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.692983151879162e-05, + "grad_norm": 3.455850601196289, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8731130957603455, + "num_tokens": 171438736.0, + "step": 4490 + }, + { + "epoch": 0.5713013611499809, + "ewc_loss": 0.006697759032249451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.697759090457112e-05, + "grad_norm": 3.422341823577881, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8686171174049377, + "num_tokens": 171477949.0, + "step": 4491 + }, + { + "epoch": 0.5714285714285714, + "ewc_loss": 0.006698863580822945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.698863580822945e-05, + "grad_norm": 3.430290699005127, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8581297397613525, + "num_tokens": 171520953.0, + "step": 4492 + }, + { + "epoch": 0.5715557817071619, + "ewc_loss": 0.006715407595038414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.715407653246075e-05, + "grad_norm": 3.401928424835205, + "learning_rate": 1e-06, + "loss": 0.3845, + 
"mean_token_accuracy": 0.870552659034729, + "num_tokens": 171559415.0, + "step": 4493 + }, + { + "epoch": 0.5716829919857525, + "ewc_loss": 0.006676024291664362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.676024349872023e-05, + "grad_norm": 3.5138707160949707, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8689589500427246, + "num_tokens": 171593118.0, + "step": 4494 + }, + { + "epoch": 0.571810202264343, + "ewc_loss": 0.006764546502381563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.764546560589224e-05, + "grad_norm": 3.429448127746582, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8489910364151001, + "num_tokens": 171634514.0, + "step": 4495 + }, + { + "epoch": 0.5719374125429335, + "ewc_loss": 0.00667591905221343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.675918848486617e-05, + "grad_norm": 3.4177892208099365, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8646179437637329, + "num_tokens": 171675239.0, + "step": 4496 + }, + { + "epoch": 0.5720646228215239, + "ewc_loss": 0.006690523121505976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.690523150609806e-05, + "grad_norm": 3.425100326538086, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8698897361755371, + "num_tokens": 171713154.0, + "step": 4497 + }, + { + "epoch": 0.5721918331001145, + "ewc_loss": 0.006694284733384848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.694284820696339e-05, + "grad_norm": 3.483478307723999, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8554747700691223, + "num_tokens": 171750046.0, + "step": 4498 + }, + { + "epoch": 0.572319043378705, + "ewc_loss": 0.006728270091116428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.728270091116428e-05, + "grad_norm": 3.499776601791382, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8523263931274414, + "num_tokens": 171782708.0, + "step": 4499 + }, + { + "epoch": 
0.5724462536572955, + "ewc_loss": 0.00673286197707057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.73286194796674e-05, + "grad_norm": 3.484754800796509, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8590865135192871, + "num_tokens": 171819421.0, + "step": 4500 + }, + { + "epoch": 0.572573463935886, + "ewc_loss": 0.006719936151057482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.719936209265143e-05, + "grad_norm": 3.4858477115631104, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8619146347045898, + "num_tokens": 171860993.0, + "step": 4501 + }, + { + "epoch": 0.5727006742144766, + "ewc_loss": 0.006728038191795349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.728037988068536e-05, + "grad_norm": 3.412200450897217, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8615705966949463, + "num_tokens": 171896894.0, + "step": 4502 + }, + { + "epoch": 0.572827884493067, + "ewc_loss": 0.006688929628580809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.6889297158923e-05, + "grad_norm": 3.4242262840270996, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8604369163513184, + "num_tokens": 171938694.0, + "step": 4503 + }, + { + "epoch": 0.5729550947716575, + "ewc_loss": 0.006714384537190199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.714384653605521e-05, + "grad_norm": 3.422405958175659, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8865403532981873, + "num_tokens": 171972606.0, + "step": 4504 + }, + { + "epoch": 0.573082305050248, + "ewc_loss": 0.006719936151057482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.719936209265143e-05, + "grad_norm": 3.460909128189087, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8642798066139221, + "num_tokens": 172010861.0, + "step": 4505 + }, + { + "epoch": 0.5732095153288386, + "ewc_loss": 0.006743920501321554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.743920675944537e-05, + 
"grad_norm": 3.545043468475342, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8466832637786865, + "num_tokens": 172049867.0, + "step": 4506 + }, + { + "epoch": 0.5733367256074291, + "ewc_loss": 0.006787216290831566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.787216261727735e-05, + "grad_norm": 3.4101169109344482, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8622944951057434, + "num_tokens": 172092176.0, + "step": 4507 + }, + { + "epoch": 0.5734639358860196, + "ewc_loss": 0.006683398503810167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.683398532913998e-05, + "grad_norm": 3.425684928894043, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.862895131111145, + "num_tokens": 172132789.0, + "step": 4508 + }, + { + "epoch": 0.5735911461646102, + "ewc_loss": 0.0067449999041855335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.74499970045872e-05, + "grad_norm": 3.442301034927368, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.87679123878479, + "num_tokens": 172170536.0, + "step": 4509 + }, + { + "epoch": 0.5737183564432006, + "ewc_loss": 0.006736106239259243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.736106297466904e-05, + "grad_norm": 3.4516162872314453, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8605193495750427, + "num_tokens": 172206828.0, + "step": 4510 + }, + { + "epoch": 0.5738455667217911, + "ewc_loss": 0.006745042745023966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.745042628608644e-05, + "grad_norm": 3.461463689804077, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8540579080581665, + "num_tokens": 172242311.0, + "step": 4511 + }, + { + "epoch": 0.5739727770003816, + "ewc_loss": 0.006747478619217873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.747478619217873e-05, + "grad_norm": 3.5010735988616943, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8573891520500183, + 
"num_tokens": 172275135.0, + "step": 4512 + }, + { + "epoch": 0.5740999872789722, + "ewc_loss": 0.006779659539461136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.779659452149644e-05, + "grad_norm": 3.4704267978668213, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8701401352882385, + "num_tokens": 172308682.0, + "step": 4513 + }, + { + "epoch": 0.5742271975575627, + "ewc_loss": 0.006753351539373398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.753351772204041e-05, + "grad_norm": 3.454530954360962, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8580342531204224, + "num_tokens": 172345834.0, + "step": 4514 + }, + { + "epoch": 0.5743544078361532, + "ewc_loss": 0.006746446713805199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.746446888428181e-05, + "grad_norm": 3.5007035732269287, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8405275344848633, + "num_tokens": 172387976.0, + "step": 4515 + }, + { + "epoch": 0.5744816181147436, + "ewc_loss": 0.006781206466257572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.78120632073842e-05, + "grad_norm": 3.425462484359741, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8617799282073975, + "num_tokens": 172427447.0, + "step": 4516 + }, + { + "epoch": 0.5746088283933342, + "ewc_loss": 0.006732794921845198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.732795009156689e-05, + "grad_norm": 3.4260571002960205, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8553690910339355, + "num_tokens": 172468686.0, + "step": 4517 + }, + { + "epoch": 0.5747360386719247, + "ewc_loss": 0.006757022347301245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.757022492820397e-05, + "grad_norm": 3.443974733352661, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8574979901313782, + "num_tokens": 172508190.0, + "step": 4518 + }, + { + "epoch": 0.5748632489505152, + "ewc_loss": 0.006778977811336517, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.778977694921196e-05, + "grad_norm": 3.3882696628570557, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8768637180328369, + "num_tokens": 172552295.0, + "step": 4519 + }, + { + "epoch": 0.5749904592291057, + "ewc_loss": 0.006739083677530289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.739083619322628e-05, + "grad_norm": 3.4607927799224854, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8683358430862427, + "num_tokens": 172591065.0, + "step": 4520 + }, + { + "epoch": 0.5751176695076963, + "ewc_loss": 0.006804616190493107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.804615986766294e-05, + "grad_norm": 3.4619035720825195, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8650366067886353, + "num_tokens": 172633741.0, + "step": 4521 + }, + { + "epoch": 0.5752448797862867, + "ewc_loss": 0.006752564571797848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.752564513590187e-05, + "grad_norm": 3.5324692726135254, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8706376552581787, + "num_tokens": 172669081.0, + "step": 4522 + }, + { + "epoch": 0.5753720900648772, + "ewc_loss": 0.00678672082722187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.78672076901421e-05, + "grad_norm": 3.508882999420166, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8399586081504822, + "num_tokens": 172704956.0, + "step": 4523 + }, + { + "epoch": 0.5754993003434677, + "ewc_loss": 0.0067612286657094955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.761228723917156e-05, + "grad_norm": 3.4494004249572754, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8667381405830383, + "num_tokens": 172742270.0, + "step": 4524 + }, + { + "epoch": 0.5756265106220583, + "ewc_loss": 0.006726463325321674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.726463470840827e-05, + "grad_norm": 3.4840750694274902, + 
"learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8510645627975464, + "num_tokens": 172778065.0, + "step": 4525 + }, + { + "epoch": 0.5757537209006488, + "ewc_loss": 0.0067530847154557705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.753084744559601e-05, + "grad_norm": 3.438701868057251, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8703645467758179, + "num_tokens": 172815825.0, + "step": 4526 + }, + { + "epoch": 0.5758809311792393, + "ewc_loss": 0.006719132885336876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.719132943544537e-05, + "grad_norm": 3.4699347019195557, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8626880645751953, + "num_tokens": 172849560.0, + "step": 4527 + }, + { + "epoch": 0.5760081414578297, + "ewc_loss": 0.006762350909411907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.762350676581264e-05, + "grad_norm": 3.483086585998535, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.864098072052002, + "num_tokens": 172885203.0, + "step": 4528 + }, + { + "epoch": 0.5761353517364203, + "ewc_loss": 0.006762057542800903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.762057455489412e-05, + "grad_norm": 3.469961643218994, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8615208268165588, + "num_tokens": 172922341.0, + "step": 4529 + }, + { + "epoch": 0.5762625620150108, + "ewc_loss": 0.006754667963832617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.754667992936447e-05, + "grad_norm": 3.5031795501708984, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8692853450775146, + "num_tokens": 172953363.0, + "step": 4530 + }, + { + "epoch": 0.5763897722936013, + "ewc_loss": 0.0067734913900494576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.773491622880101e-05, + "grad_norm": 3.5044074058532715, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8686906099319458, + "num_tokens": 172986253.0, + 
"step": 4531 + }, + { + "epoch": 0.5765169825721919, + "ewc_loss": 0.006776551250368357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.776551163056865e-05, + "grad_norm": 3.439654588699341, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8737694025039673, + "num_tokens": 173023466.0, + "step": 4532 + }, + { + "epoch": 0.5766441928507824, + "ewc_loss": 0.006741298362612724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.741298420820385e-05, + "grad_norm": 3.495319128036499, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.856235921382904, + "num_tokens": 173063329.0, + "step": 4533 + }, + { + "epoch": 0.5767714031293728, + "ewc_loss": 0.006788490805774927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.78849100950174e-05, + "grad_norm": 3.4756786823272705, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8682523965835571, + "num_tokens": 173100676.0, + "step": 4534 + }, + { + "epoch": 0.5768986134079633, + "ewc_loss": 0.006761028431355953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.761028635082766e-05, + "grad_norm": 3.4522664546966553, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8721262216567993, + "num_tokens": 173135088.0, + "step": 4535 + }, + { + "epoch": 0.5770258236865539, + "ewc_loss": 0.006754482630640268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.754482456017286e-05, + "grad_norm": 3.4303646087646484, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.853306770324707, + "num_tokens": 173175810.0, + "step": 4536 + }, + { + "epoch": 0.5771530339651444, + "ewc_loss": 0.0067437635734677315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.74376351526007e-05, + "grad_norm": 3.3640758991241455, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8637980222702026, + "num_tokens": 173220826.0, + "step": 4537 + }, + { + "epoch": 0.5772802442437349, + "ewc_loss": 0.006731533445417881, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 6.73153335810639e-05, + "grad_norm": 3.4754765033721924, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.872323751449585, + "num_tokens": 173258411.0, + "step": 4538 + }, + { + "epoch": 0.5774074545223254, + "ewc_loss": 0.006811683066189289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.81168312439695e-05, + "grad_norm": 3.4819388389587402, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8472344875335693, + "num_tokens": 173297298.0, + "step": 4539 + }, + { + "epoch": 0.5775346648009159, + "ewc_loss": 0.006772823166102171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.772822962375358e-05, + "grad_norm": 3.5231401920318604, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8461107611656189, + "num_tokens": 173330839.0, + "step": 4540 + }, + { + "epoch": 0.5776618750795064, + "ewc_loss": 0.00679906876757741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.79906879668124e-05, + "grad_norm": 3.4898488521575928, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8541252613067627, + "num_tokens": 173370219.0, + "step": 4541 + }, + { + "epoch": 0.5777890853580969, + "ewc_loss": 0.00676464568823576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.764645513612777e-05, + "grad_norm": 3.4194178581237793, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8619382977485657, + "num_tokens": 173410425.0, + "step": 4542 + }, + { + "epoch": 0.5779162956366874, + "ewc_loss": 0.006730176974087954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.730177119607106e-05, + "grad_norm": 3.4477386474609375, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8636021018028259, + "num_tokens": 173447109.0, + "step": 4543 + }, + { + "epoch": 0.578043505915278, + "ewc_loss": 0.006773795001208782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.773795030312613e-05, + "grad_norm": 3.4266929626464844, + "learning_rate": 1e-06, + "loss": 0.3975, 
+ "mean_token_accuracy": 0.8659628033638, + "num_tokens": 173490064.0, + "step": 4544 + }, + { + "epoch": 0.5781707161938685, + "ewc_loss": 0.006746803410351276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.746803410351276e-05, + "grad_norm": 3.50836443901062, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.865204930305481, + "num_tokens": 173523791.0, + "step": 4545 + }, + { + "epoch": 0.5782979264724589, + "ewc_loss": 0.006788555532693863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.788555765524507e-05, + "grad_norm": 3.383876323699951, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8721169233322144, + "num_tokens": 173564069.0, + "step": 4546 + }, + { + "epoch": 0.5784251367510495, + "ewc_loss": 0.006681364960968494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.681364902760834e-05, + "grad_norm": 3.4805195331573486, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.860711932182312, + "num_tokens": 173601893.0, + "step": 4547 + }, + { + "epoch": 0.57855234702964, + "ewc_loss": 0.006807745900005102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.807746103731915e-05, + "grad_norm": 3.4459290504455566, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8732247352600098, + "num_tokens": 173642299.0, + "step": 4548 + }, + { + "epoch": 0.5786795573082305, + "ewc_loss": 0.00671966839581728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.719668454024941e-05, + "grad_norm": 3.46975040435791, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8646319508552551, + "num_tokens": 173675824.0, + "step": 4549 + }, + { + "epoch": 0.578806767586821, + "ewc_loss": 0.006764431018382311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.764430872863159e-05, + "grad_norm": 3.483027935028076, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8638761043548584, + "num_tokens": 173712192.0, + "step": 4550 + }, + { + "epoch": 0.5789339778654116, + 
"ewc_loss": 0.00675744004547596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.757440132787451e-05, + "grad_norm": 3.51102614402771, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8701415657997131, + "num_tokens": 173746949.0, + "step": 4551 + }, + { + "epoch": 0.579061188144002, + "ewc_loss": 0.0067697749473154545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.769775063730776e-05, + "grad_norm": 3.441460371017456, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8648200035095215, + "num_tokens": 173784527.0, + "step": 4552 + }, + { + "epoch": 0.5791883984225925, + "ewc_loss": 0.00672305328771472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.723053229507059e-05, + "grad_norm": 3.4391098022460938, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8685543537139893, + "num_tokens": 173820648.0, + "step": 4553 + }, + { + "epoch": 0.579315608701183, + "ewc_loss": 0.006757479626685381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.757479422958568e-05, + "grad_norm": 3.425330400466919, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8575812578201294, + "num_tokens": 173860427.0, + "step": 4554 + }, + { + "epoch": 0.5794428189797736, + "ewc_loss": 0.006730139255523682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.730139284627512e-05, + "grad_norm": 3.436732053756714, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8662398457527161, + "num_tokens": 173899876.0, + "step": 4555 + }, + { + "epoch": 0.5795700292583641, + "ewc_loss": 0.006741474382579327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.741474498994648e-05, + "grad_norm": 3.5074942111968994, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8721023797988892, + "num_tokens": 173937354.0, + "step": 4556 + }, + { + "epoch": 0.5796972395369546, + "ewc_loss": 0.006791894789785147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.791894702473655e-05, + "grad_norm": 
3.4745798110961914, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8516324162483215, + "num_tokens": 173976039.0, + "step": 4557 + }, + { + "epoch": 0.5798244498155452, + "ewc_loss": 0.00673988601192832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.739886157447472e-05, + "grad_norm": 3.5475831031799316, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8429136872291565, + "num_tokens": 174006359.0, + "step": 4558 + }, + { + "epoch": 0.5799516600941356, + "ewc_loss": 0.006794457323849201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.794457294745371e-05, + "grad_norm": 3.4716668128967285, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8545213937759399, + "num_tokens": 174044662.0, + "step": 4559 + }, + { + "epoch": 0.5800788703727261, + "ewc_loss": 0.006737784948199987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.737784860888496e-05, + "grad_norm": 3.529801368713379, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8578502535820007, + "num_tokens": 174077133.0, + "step": 4560 + }, + { + "epoch": 0.5802060806513166, + "ewc_loss": 0.0068076616153120995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.807661702623591e-05, + "grad_norm": 3.382514238357544, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8582536578178406, + "num_tokens": 174121873.0, + "step": 4561 + }, + { + "epoch": 0.5803332909299072, + "ewc_loss": 0.006713864393532276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.713864422636107e-05, + "grad_norm": 3.454819917678833, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8646624684333801, + "num_tokens": 174158873.0, + "step": 4562 + }, + { + "epoch": 0.5804605012084977, + "ewc_loss": 0.006798883900046349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.79888398735784e-05, + "grad_norm": 3.5075416564941406, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8702019453048706, + 
"num_tokens": 174189222.0, + "step": 4563 + }, + { + "epoch": 0.5805877114870882, + "ewc_loss": 0.006808431353420019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.808431498939171e-05, + "grad_norm": 3.4645848274230957, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8541558384895325, + "num_tokens": 174229018.0, + "step": 4564 + }, + { + "epoch": 0.5807149217656786, + "ewc_loss": 0.006770042702555656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.770042818970978e-05, + "grad_norm": 3.4615538120269775, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8649020791053772, + "num_tokens": 174269215.0, + "step": 4565 + }, + { + "epoch": 0.5808421320442692, + "ewc_loss": 0.006797567941248417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.797567766625434e-05, + "grad_norm": 3.4630167484283447, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8707197904586792, + "num_tokens": 174303926.0, + "step": 4566 + }, + { + "epoch": 0.5809693423228597, + "ewc_loss": 0.006792018190026283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.792018393753096e-05, + "grad_norm": 3.4160170555114746, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.864870548248291, + "num_tokens": 174347491.0, + "step": 4567 + }, + { + "epoch": 0.5810965526014502, + "ewc_loss": 0.006772101391106844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.772101187380031e-05, + "grad_norm": 3.4562110900878906, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8635658621788025, + "num_tokens": 174384246.0, + "step": 4568 + }, + { + "epoch": 0.5812237628800407, + "ewc_loss": 0.006822071969509125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.822071736678481e-05, + "grad_norm": 3.4639077186584473, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8630063533782959, + "num_tokens": 174417081.0, + "step": 4569 + }, + { + "epoch": 0.5813509731586313, + "ewc_loss": 
0.006802042480558157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.802042480558157e-05, + "grad_norm": 3.4453225135803223, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8523552417755127, + "num_tokens": 174457364.0, + "step": 4570 + }, + { + "epoch": 0.5814781834372217, + "ewc_loss": 0.006801787298172712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.801787094445899e-05, + "grad_norm": 3.4566409587860107, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8627375960350037, + "num_tokens": 174494304.0, + "step": 4571 + }, + { + "epoch": 0.5816053937158122, + "ewc_loss": 0.006813480984419584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.813481013523415e-05, + "grad_norm": 3.6075439453125, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8639448881149292, + "num_tokens": 174528047.0, + "step": 4572 + }, + { + "epoch": 0.5817326039944027, + "ewc_loss": 0.006898121442645788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.898121500853449e-05, + "grad_norm": 3.4394168853759766, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8674001693725586, + "num_tokens": 174565990.0, + "step": 4573 + }, + { + "epoch": 0.5818598142729933, + "ewc_loss": 0.006759743206202984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.75974297337234e-05, + "grad_norm": 3.506441593170166, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8481143116950989, + "num_tokens": 174599039.0, + "step": 4574 + }, + { + "epoch": 0.5819870245515838, + "ewc_loss": 0.0068529509007930756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.852950900793076e-05, + "grad_norm": 3.46879243850708, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8564931154251099, + "num_tokens": 174634786.0, + "step": 4575 + }, + { + "epoch": 0.5821142348301743, + "ewc_loss": 0.006806798744946718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.806798774050549e-05, + "grad_norm": 
3.4289627075195312, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8532211780548096, + "num_tokens": 174677875.0, + "step": 4576 + }, + { + "epoch": 0.5822414451087647, + "ewc_loss": 0.006795557215809822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.795557419536635e-05, + "grad_norm": 3.4428293704986572, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8559839725494385, + "num_tokens": 174715421.0, + "step": 4577 + }, + { + "epoch": 0.5823686553873553, + "ewc_loss": 0.006823580712080002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.823580770287663e-05, + "grad_norm": 3.593801975250244, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8587571382522583, + "num_tokens": 174746555.0, + "step": 4578 + }, + { + "epoch": 0.5824958656659458, + "ewc_loss": 0.0069047738797962666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.904773908900097e-05, + "grad_norm": 3.4413607120513916, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8509047031402588, + "num_tokens": 174786470.0, + "step": 4579 + }, + { + "epoch": 0.5826230759445363, + "ewc_loss": 0.00676365103572607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.763650890206918e-05, + "grad_norm": 3.4338858127593994, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8588320016860962, + "num_tokens": 174827131.0, + "step": 4580 + }, + { + "epoch": 0.5827502862231269, + "ewc_loss": 0.006837164983153343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.837164983153343e-05, + "grad_norm": 3.509136915206909, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8556740283966064, + "num_tokens": 174861391.0, + "step": 4581 + }, + { + "epoch": 0.5828774965017174, + "ewc_loss": 0.006870729848742485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.870729703223333e-05, + "grad_norm": 3.489182472229004, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8671085834503174, + 
"num_tokens": 174897437.0, + "step": 4582 + }, + { + "epoch": 0.5830047067803078, + "ewc_loss": 0.006837303750216961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.837303953943774e-05, + "grad_norm": 3.7440545558929443, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.854387104511261, + "num_tokens": 174932504.0, + "step": 4583 + }, + { + "epoch": 0.5831319170588983, + "ewc_loss": 0.006993305403739214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.993305578362197e-05, + "grad_norm": 3.410754919052124, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8567825555801392, + "num_tokens": 174974746.0, + "step": 4584 + }, + { + "epoch": 0.5832591273374889, + "ewc_loss": 0.006726390682160854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.726390711264685e-05, + "grad_norm": 3.4101662635803223, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8566642999649048, + "num_tokens": 175016155.0, + "step": 4585 + }, + { + "epoch": 0.5833863376160794, + "ewc_loss": 0.0068408045917749405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.840804417151958e-05, + "grad_norm": 3.469951629638672, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8478209972381592, + "num_tokens": 175060595.0, + "step": 4586 + }, + { + "epoch": 0.5835135478946699, + "ewc_loss": 0.006859665270894766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.859665154479444e-05, + "grad_norm": 3.4473154544830322, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8601356744766235, + "num_tokens": 175101278.0, + "step": 4587 + }, + { + "epoch": 0.5836407581732604, + "ewc_loss": 0.006816788110882044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.816787936259061e-05, + "grad_norm": 3.440408706665039, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8554897308349609, + "num_tokens": 175139479.0, + "step": 4588 + }, + { + "epoch": 0.5837679684518509, + "ewc_loss": 0.006835341453552246, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.835341628175229e-05, + "grad_norm": 3.4724526405334473, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8464587926864624, + "num_tokens": 175182376.0, + "step": 4589 + }, + { + "epoch": 0.5838951787304414, + "ewc_loss": 0.006850318517535925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.850318459328264e-05, + "grad_norm": 3.389746904373169, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8541150093078613, + "num_tokens": 175225449.0, + "step": 4590 + }, + { + "epoch": 0.5840223890090319, + "ewc_loss": 0.006792302243411541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.79230215610005e-05, + "grad_norm": 3.55997633934021, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8588306307792664, + "num_tokens": 175255959.0, + "step": 4591 + }, + { + "epoch": 0.5841495992876224, + "ewc_loss": 0.006928989663720131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.928989751031622e-05, + "grad_norm": 3.5139427185058594, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8504034280776978, + "num_tokens": 175292142.0, + "step": 4592 + }, + { + "epoch": 0.584276809566213, + "ewc_loss": 0.006829679477959871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.829679477959871e-05, + "grad_norm": 3.3791768550872803, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8551507592201233, + "num_tokens": 175336630.0, + "step": 4593 + }, + { + "epoch": 0.5844040198448035, + "ewc_loss": 0.006761368364095688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.761368422303349e-05, + "grad_norm": 3.445997476577759, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.874848484992981, + "num_tokens": 175372697.0, + "step": 4594 + }, + { + "epoch": 0.5845312301233939, + "ewc_loss": 0.00684848939999938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.848489283584058e-05, + "grad_norm": 3.4534261226654053, + "learning_rate": 
1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8502731323242188, + "num_tokens": 175412395.0, + "step": 4595 + }, + { + "epoch": 0.5846584404019844, + "ewc_loss": 0.006819749251008034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.819749251008034e-05, + "grad_norm": 3.484729051589966, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8596792221069336, + "num_tokens": 175447295.0, + "step": 4596 + }, + { + "epoch": 0.584785650680575, + "ewc_loss": 0.0068174200132489204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.817420216975734e-05, + "grad_norm": 3.480473756790161, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.871065616607666, + "num_tokens": 175480229.0, + "step": 4597 + }, + { + "epoch": 0.5849128609591655, + "ewc_loss": 0.006818108726292849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.818108522566035e-05, + "grad_norm": 3.4119417667388916, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8604909181594849, + "num_tokens": 175523081.0, + "step": 4598 + }, + { + "epoch": 0.585040071237756, + "ewc_loss": 0.0067829228937625885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.782922719139606e-05, + "grad_norm": 3.4584219455718994, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8645381927490234, + "num_tokens": 175565036.0, + "step": 4599 + }, + { + "epoch": 0.5851672815163466, + "ewc_loss": 0.0068099647760391235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.80996454320848e-05, + "grad_norm": 3.4478912353515625, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8618867993354797, + "num_tokens": 175606733.0, + "step": 4600 + }, + { + "epoch": 0.585294491794937, + "ewc_loss": 0.006789907813072205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.789907638449222e-05, + "grad_norm": 3.5123965740203857, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8756027221679688, + "num_tokens": 175637663.0, + "step": 4601 + }, + 
{ + "epoch": 0.5854217020735275, + "ewc_loss": 0.006830455269664526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.830455095041543e-05, + "grad_norm": 3.4481868743896484, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.864362359046936, + "num_tokens": 175677381.0, + "step": 4602 + }, + { + "epoch": 0.585548912352118, + "ewc_loss": 0.00677282502874732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.772825145162642e-05, + "grad_norm": 3.46713924407959, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8725696802139282, + "num_tokens": 175713049.0, + "step": 4603 + }, + { + "epoch": 0.5856761226307086, + "ewc_loss": 0.006787515245378017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.787515303585678e-05, + "grad_norm": 3.4262866973876953, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8523281216621399, + "num_tokens": 175753681.0, + "step": 4604 + }, + { + "epoch": 0.5858033329092991, + "ewc_loss": 0.006767619866877794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.767619925085455e-05, + "grad_norm": 3.4998908042907715, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8492599725723267, + "num_tokens": 175789725.0, + "step": 4605 + }, + { + "epoch": 0.5859305431878896, + "ewc_loss": 0.006808409467339516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.808409671066329e-05, + "grad_norm": 3.5121283531188965, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8607035875320435, + "num_tokens": 175823203.0, + "step": 4606 + }, + { + "epoch": 0.5860577534664801, + "ewc_loss": 0.006813915446400642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.813915388192981e-05, + "grad_norm": 3.496135711669922, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.855472981929779, + "num_tokens": 175860506.0, + "step": 4607 + }, + { + "epoch": 0.5861849637450706, + "ewc_loss": 0.006781652569770813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.78165233694017e-05, + "grad_norm": 3.461862325668335, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8778362274169922, + "num_tokens": 175895080.0, + "step": 4608 + }, + { + "epoch": 0.5863121740236611, + "ewc_loss": 0.006777990143746138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.777990347472951e-05, + "grad_norm": 3.4721741676330566, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8558111190795898, + "num_tokens": 175937113.0, + "step": 4609 + }, + { + "epoch": 0.5864393843022516, + "ewc_loss": 0.006784325465559959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.78432552376762e-05, + "grad_norm": 3.497297525405884, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8751866817474365, + "num_tokens": 175966193.0, + "step": 4610 + }, + { + "epoch": 0.5865665945808421, + "ewc_loss": 0.006796148139983416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.796148227294907e-05, + "grad_norm": 3.458587169647217, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8549701571464539, + "num_tokens": 176003771.0, + "step": 4611 + }, + { + "epoch": 0.5866938048594327, + "ewc_loss": 0.006769872736185789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.769872561562806e-05, + "grad_norm": 3.3751602172851562, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8677729368209839, + "num_tokens": 176050069.0, + "step": 4612 + }, + { + "epoch": 0.5868210151380232, + "ewc_loss": 0.00672729779034853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.727298023179173e-05, + "grad_norm": 3.4054486751556396, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8775289058685303, + "num_tokens": 176088763.0, + "step": 4613 + }, + { + "epoch": 0.5869482254166136, + "ewc_loss": 0.006785440258681774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.785440200474113e-05, + "grad_norm": 3.453909397125244, + "learning_rate": 1e-06, + "loss": 0.4152, + 
"mean_token_accuracy": 0.8626165986061096, + "num_tokens": 176127853.0, + "step": 4614 + }, + { + "epoch": 0.5870754356952042, + "ewc_loss": 0.006787814199924469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.787814345443621e-05, + "grad_norm": 3.408597946166992, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8472750186920166, + "num_tokens": 176173775.0, + "step": 4615 + }, + { + "epoch": 0.5872026459737947, + "ewc_loss": 0.006734760943800211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.734760972904041e-05, + "grad_norm": 3.484386682510376, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.865236759185791, + "num_tokens": 176210878.0, + "step": 4616 + }, + { + "epoch": 0.5873298562523852, + "ewc_loss": 0.0067866793833673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.786679296055809e-05, + "grad_norm": 3.499962568283081, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8559309840202332, + "num_tokens": 176255094.0, + "step": 4617 + }, + { + "epoch": 0.5874570665309757, + "ewc_loss": 0.006777904462069273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.777904491173103e-05, + "grad_norm": 3.4492242336273193, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.859103262424469, + "num_tokens": 176293586.0, + "step": 4618 + }, + { + "epoch": 0.5875842768095663, + "ewc_loss": 0.006741419900208712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.741419929312542e-05, + "grad_norm": 3.493290424346924, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8631761074066162, + "num_tokens": 176329203.0, + "step": 4619 + }, + { + "epoch": 0.5877114870881567, + "ewc_loss": 0.006798713933676481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.798713729949668e-05, + "grad_norm": 3.4246256351470947, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8786012530326843, + "num_tokens": 176367984.0, + "step": 4620 + }, + { + "epoch": 
0.5878386973667472, + "ewc_loss": 0.006723448168486357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.723448314005509e-05, + "grad_norm": 3.4299447536468506, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8747978210449219, + "num_tokens": 176408949.0, + "step": 4621 + }, + { + "epoch": 0.5879659076453377, + "ewc_loss": 0.006763847544789314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.763847341062501e-05, + "grad_norm": 3.4489405155181885, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8775961995124817, + "num_tokens": 176451073.0, + "step": 4622 + }, + { + "epoch": 0.5880931179239283, + "ewc_loss": 0.006767588667571545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.767588638467714e-05, + "grad_norm": 3.4455010890960693, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8683292865753174, + "num_tokens": 176490623.0, + "step": 4623 + }, + { + "epoch": 0.5882203282025188, + "ewc_loss": 0.0067388382740318775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.73883841955103e-05, + "grad_norm": 3.555384397506714, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.84926837682724, + "num_tokens": 176525771.0, + "step": 4624 + }, + { + "epoch": 0.5883475384811093, + "ewc_loss": 0.006807083263993263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.807083263993263e-05, + "grad_norm": 3.3918259143829346, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8806619644165039, + "num_tokens": 176565417.0, + "step": 4625 + }, + { + "epoch": 0.5884747487596997, + "ewc_loss": 0.006679011043161154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.679011130472645e-05, + "grad_norm": 3.5454823970794678, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8594802618026733, + "num_tokens": 176599005.0, + "step": 4626 + }, + { + "epoch": 0.5886019590382903, + "ewc_loss": 0.00682269548997283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.822695286246017e-05, + "grad_norm": 3.4395318031311035, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8643325567245483, + "num_tokens": 176637947.0, + "step": 4627 + }, + { + "epoch": 0.5887291693168808, + "ewc_loss": 0.006702255457639694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.702255632262677e-05, + "grad_norm": 3.471510648727417, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8506665825843811, + "num_tokens": 176675241.0, + "step": 4628 + }, + { + "epoch": 0.5888563795954713, + "ewc_loss": 0.006766253616660833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.766253500245512e-05, + "grad_norm": 3.4332661628723145, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8552609086036682, + "num_tokens": 176715375.0, + "step": 4629 + }, + { + "epoch": 0.5889835898740619, + "ewc_loss": 0.006740792654454708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.7407927417662e-05, + "grad_norm": 3.4957778453826904, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8597331047058105, + "num_tokens": 176750977.0, + "step": 4630 + }, + { + "epoch": 0.5891108001526524, + "ewc_loss": 0.0067867557518184185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.786755693610758e-05, + "grad_norm": 3.4729344844818115, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8658453226089478, + "num_tokens": 176788968.0, + "step": 4631 + }, + { + "epoch": 0.5892380104312428, + "ewc_loss": 0.006753894500434399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.75389455864206e-05, + "grad_norm": 3.5119266510009766, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8564878702163696, + "num_tokens": 176824399.0, + "step": 4632 + }, + { + "epoch": 0.5893652207098333, + "ewc_loss": 0.0067828563041985035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.782856507925317e-05, + "grad_norm": 3.482602596282959, + "learning_rate": 1e-06, + "loss": 0.4252, + 
"mean_token_accuracy": 0.8586268424987793, + "num_tokens": 176862053.0, + "step": 4633 + }, + { + "epoch": 0.5894924309884239, + "ewc_loss": 0.006746772211045027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.746772123733535e-05, + "grad_norm": 3.4609668254852295, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8778177499771118, + "num_tokens": 176896943.0, + "step": 4634 + }, + { + "epoch": 0.5896196412670144, + "ewc_loss": 0.006760248448699713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.760248652426526e-05, + "grad_norm": 3.4499433040618896, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8645524382591248, + "num_tokens": 176936263.0, + "step": 4635 + }, + { + "epoch": 0.5897468515456049, + "ewc_loss": 0.006762891076505184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.762891280231997e-05, + "grad_norm": 3.4310343265533447, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8508806824684143, + "num_tokens": 176976632.0, + "step": 4636 + }, + { + "epoch": 0.5898740618241954, + "ewc_loss": 0.0067422580905258656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.742258119629696e-05, + "grad_norm": 3.561823844909668, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.868733286857605, + "num_tokens": 177007036.0, + "step": 4637 + }, + { + "epoch": 0.5900012721027859, + "ewc_loss": 0.006839384324848652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.839384150225669e-05, + "grad_norm": 3.5420327186584473, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8524089455604553, + "num_tokens": 177042189.0, + "step": 4638 + }, + { + "epoch": 0.5901284823813764, + "ewc_loss": 0.006779141258448362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.779141403967515e-05, + "grad_norm": 3.444756031036377, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8589503765106201, + "num_tokens": 177082428.0, + "step": 4639 + }, + { + "epoch": 
0.5902556926599669, + "ewc_loss": 0.006744634360074997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.744634447386488e-05, + "grad_norm": 3.408439874649048, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8655022978782654, + "num_tokens": 177124447.0, + "step": 4640 + }, + { + "epoch": 0.5903829029385574, + "ewc_loss": 0.006749236490577459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.74923649057746e-05, + "grad_norm": 3.4122776985168457, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8780660033226013, + "num_tokens": 177160516.0, + "step": 4641 + }, + { + "epoch": 0.590510113217148, + "ewc_loss": 0.006757823750376701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.757823575753719e-05, + "grad_norm": 3.471304178237915, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8707253932952881, + "num_tokens": 177198100.0, + "step": 4642 + }, + { + "epoch": 0.5906373234957385, + "ewc_loss": 0.006779639981687069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.779639807064086e-05, + "grad_norm": 3.4032623767852783, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8548544049263, + "num_tokens": 177243170.0, + "step": 4643 + }, + { + "epoch": 0.5907645337743289, + "ewc_loss": 0.0067252591252326965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.725259299855679e-05, + "grad_norm": 3.4460582733154297, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8619526624679565, + "num_tokens": 177282976.0, + "step": 4644 + }, + { + "epoch": 0.5908917440529194, + "ewc_loss": 0.006776723545044661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.776723603252321e-05, + "grad_norm": 3.451904058456421, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8462071418762207, + "num_tokens": 177323725.0, + "step": 4645 + }, + { + "epoch": 0.59101895433151, + "ewc_loss": 0.006755736190825701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.75573610351421e-05, 
+ "grad_norm": 3.546312093734741, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8634079694747925, + "num_tokens": 177352178.0, + "step": 4646 + }, + { + "epoch": 0.5911461646101005, + "ewc_loss": 0.006811549421399832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.811549246776849e-05, + "grad_norm": 3.4241034984588623, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8605977892875671, + "num_tokens": 177392322.0, + "step": 4647 + }, + { + "epoch": 0.591273374888691, + "ewc_loss": 0.00670085521414876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.700855010421947e-05, + "grad_norm": 3.4582667350769043, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8598067164421082, + "num_tokens": 177432013.0, + "step": 4648 + }, + { + "epoch": 0.5914005851672816, + "ewc_loss": 0.006771181710064411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.771181506337598e-05, + "grad_norm": 3.4305758476257324, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8749850392341614, + "num_tokens": 177473542.0, + "step": 4649 + }, + { + "epoch": 0.591527795445872, + "ewc_loss": 0.006732375826686621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.732375913998112e-05, + "grad_norm": 3.435274362564087, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8590538501739502, + "num_tokens": 177516590.0, + "step": 4650 + }, + { + "epoch": 0.5916550057244625, + "ewc_loss": 0.006744477432221174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.744477286702022e-05, + "grad_norm": 3.457186222076416, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8762896656990051, + "num_tokens": 177553750.0, + "step": 4651 + }, + { + "epoch": 0.591782216003053, + "ewc_loss": 0.006750582717359066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.750582542736083e-05, + "grad_norm": 3.534515380859375, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8562782406806946, 
+ "num_tokens": 177586093.0, + "step": 4652 + }, + { + "epoch": 0.5919094262816436, + "ewc_loss": 0.006795929744839668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.795929948566481e-05, + "grad_norm": 3.43550181388855, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8591954708099365, + "num_tokens": 177626800.0, + "step": 4653 + }, + { + "epoch": 0.5920366365602341, + "ewc_loss": 0.006709357723593712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.709357694489881e-05, + "grad_norm": 3.5026543140411377, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.860648512840271, + "num_tokens": 177662336.0, + "step": 4654 + }, + { + "epoch": 0.5921638468388246, + "ewc_loss": 0.006783869117498398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.783869321225211e-05, + "grad_norm": 3.40476393699646, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8698575496673584, + "num_tokens": 177706922.0, + "step": 4655 + }, + { + "epoch": 0.592291057117415, + "ewc_loss": 0.006698941346257925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.698941433569416e-05, + "grad_norm": 3.4826014041900635, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8608448505401611, + "num_tokens": 177743922.0, + "step": 4656 + }, + { + "epoch": 0.5924182673960056, + "ewc_loss": 0.0067872898653149605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.787289748899639e-05, + "grad_norm": 3.4945905208587646, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8697352409362793, + "num_tokens": 177777069.0, + "step": 4657 + }, + { + "epoch": 0.5925454776745961, + "ewc_loss": 0.006757026072591543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.757026130799204e-05, + "grad_norm": 3.4647209644317627, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8707440495491028, + "num_tokens": 177810959.0, + "step": 4658 + }, + { + "epoch": 0.5926726879531866, + "ewc_loss": 0.006740350276231766, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.740350363543257e-05, + "grad_norm": 3.527641773223877, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.853550136089325, + "num_tokens": 177846366.0, + "step": 4659 + }, + { + "epoch": 0.5927998982317771, + "ewc_loss": 0.006788620725274086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.788620521547273e-05, + "grad_norm": 3.480440378189087, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8747849464416504, + "num_tokens": 177881450.0, + "step": 4660 + }, + { + "epoch": 0.5929271085103677, + "ewc_loss": 0.006753330118954182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.753329944331199e-05, + "grad_norm": 3.442478895187378, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8634001016616821, + "num_tokens": 177922931.0, + "step": 4661 + }, + { + "epoch": 0.5930543187889582, + "ewc_loss": 0.006737096700817347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.737096555298194e-05, + "grad_norm": 3.4663965702056885, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8448804616928101, + "num_tokens": 177962722.0, + "step": 4662 + }, + { + "epoch": 0.5931815290675486, + "ewc_loss": 0.006770830601453781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.770830805180594e-05, + "grad_norm": 3.4579660892486572, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8620398640632629, + "num_tokens": 177999956.0, + "step": 4663 + }, + { + "epoch": 0.5933087393461391, + "ewc_loss": 0.006746762432157993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.746762664988637e-05, + "grad_norm": 3.517519950866699, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8518714308738708, + "num_tokens": 178032450.0, + "step": 4664 + }, + { + "epoch": 0.5934359496247297, + "ewc_loss": 0.006781536154448986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.781535921618342e-05, + "grad_norm": 3.4579763412475586, + "learning_rate": 
1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8624933958053589, + "num_tokens": 178075028.0, + "step": 4665 + }, + { + "epoch": 0.5935631599033202, + "ewc_loss": 0.0067319320514798164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.731932080583647e-05, + "grad_norm": 3.5285112857818604, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8587302565574646, + "num_tokens": 178108267.0, + "step": 4666 + }, + { + "epoch": 0.5936903701819107, + "ewc_loss": 0.006809164769947529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.809164915466681e-05, + "grad_norm": 3.486851930618286, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8626489043235779, + "num_tokens": 178148091.0, + "step": 4667 + }, + { + "epoch": 0.5938175804605013, + "ewc_loss": 0.006754983216524124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.754983041901141e-05, + "grad_norm": 3.496114730834961, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8747550249099731, + "num_tokens": 178185626.0, + "step": 4668 + }, + { + "epoch": 0.5939447907390917, + "ewc_loss": 0.006789920851588249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.789920735172927e-05, + "grad_norm": 3.4605979919433594, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8692083954811096, + "num_tokens": 178224165.0, + "step": 4669 + }, + { + "epoch": 0.5940720010176822, + "ewc_loss": 0.006750741042196751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.750741158612072e-05, + "grad_norm": 3.467038631439209, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.861261248588562, + "num_tokens": 178264118.0, + "step": 4670 + }, + { + "epoch": 0.5941992112962727, + "ewc_loss": 0.006770318374037743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.770318577764556e-05, + "grad_norm": 3.5237183570861816, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8735923767089844, + "num_tokens": 178295634.0, + "step": 4671 + }, 
+ { + "epoch": 0.5943264215748633, + "ewc_loss": 0.006801075302064419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.801075505791232e-05, + "grad_norm": 3.4855830669403076, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8682222366333008, + "num_tokens": 178327076.0, + "step": 4672 + }, + { + "epoch": 0.5944536318534538, + "ewc_loss": 0.006767024751752615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.767024751752615e-05, + "grad_norm": 3.546518087387085, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8542107343673706, + "num_tokens": 178357469.0, + "step": 4673 + }, + { + "epoch": 0.5945808421320443, + "ewc_loss": 0.006821388378739357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.821388524258509e-05, + "grad_norm": 3.4624722003936768, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8690804243087769, + "num_tokens": 178393830.0, + "step": 4674 + }, + { + "epoch": 0.5947080524106347, + "ewc_loss": 0.006762912962585688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.76291310810484e-05, + "grad_norm": 3.495920419692993, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8617473840713501, + "num_tokens": 178427560.0, + "step": 4675 + }, + { + "epoch": 0.5948352626892253, + "ewc_loss": 0.006813328247517347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.813328218413517e-05, + "grad_norm": 3.461496591567993, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8642393350601196, + "num_tokens": 178464666.0, + "step": 4676 + }, + { + "epoch": 0.5949624729678158, + "ewc_loss": 0.006797794718295336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.797794776502997e-05, + "grad_norm": 3.4057774543762207, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8523246049880981, + "num_tokens": 178508771.0, + "step": 4677 + }, + { + "epoch": 0.5950896832464063, + "ewc_loss": 0.006776334252208471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.776334339519963e-05, + "grad_norm": 3.462374210357666, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8664295077323914, + "num_tokens": 178548516.0, + "step": 4678 + }, + { + "epoch": 0.5952168935249968, + "ewc_loss": 0.006829901598393917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.829901394667104e-05, + "grad_norm": 3.5146820545196533, + "learning_rate": 1e-06, + "loss": 0.5288, + "mean_token_accuracy": 0.8267385959625244, + "num_tokens": 178591276.0, + "step": 4679 + }, + { + "epoch": 0.5953441038035874, + "ewc_loss": 0.006824200507253408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.824200681876391e-05, + "grad_norm": 3.4461371898651123, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8616124391555786, + "num_tokens": 178632580.0, + "step": 4680 + }, + { + "epoch": 0.5954713140821778, + "ewc_loss": 0.006770790088921785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.770790059817955e-05, + "grad_norm": 3.4646153450012207, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8582637906074524, + "num_tokens": 178672858.0, + "step": 4681 + }, + { + "epoch": 0.5955985243607683, + "ewc_loss": 0.0068075889721512794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.807588943047449e-05, + "grad_norm": 3.4631237983703613, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8653022646903992, + "num_tokens": 178706435.0, + "step": 4682 + }, + { + "epoch": 0.5957257346393589, + "ewc_loss": 0.006802001036703587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.802001007599756e-05, + "grad_norm": 3.4853768348693848, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8600277304649353, + "num_tokens": 178746309.0, + "step": 4683 + }, + { + "epoch": 0.5958529449179494, + "ewc_loss": 0.006811387836933136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.811387720517814e-05, + "grad_norm": 3.4574837684631348, + "learning_rate": 1e-06, + "loss": 0.462, + 
"mean_token_accuracy": 0.8449888229370117, + "num_tokens": 178787639.0, + "step": 4684 + }, + { + "epoch": 0.5959801551965399, + "ewc_loss": 0.006795484572649002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.795484659960493e-05, + "grad_norm": 3.4553918838500977, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8672378063201904, + "num_tokens": 178828271.0, + "step": 4685 + }, + { + "epoch": 0.5961073654751304, + "ewc_loss": 0.006803175434470177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.803175347158685e-05, + "grad_norm": 3.4595491886138916, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.882438063621521, + "num_tokens": 178862862.0, + "step": 4686 + }, + { + "epoch": 0.5962345757537209, + "ewc_loss": 0.006793819833546877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.793819920858368e-05, + "grad_norm": 3.4605460166931152, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8740328550338745, + "num_tokens": 178897969.0, + "step": 4687 + }, + { + "epoch": 0.5963617860323114, + "ewc_loss": 0.006800046190619469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.800045957788825e-05, + "grad_norm": 3.4013595581054688, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8657727241516113, + "num_tokens": 178942002.0, + "step": 4688 + }, + { + "epoch": 0.5964889963109019, + "ewc_loss": 0.006757453083992004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.757453229511157e-05, + "grad_norm": 3.428121566772461, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8716071248054504, + "num_tokens": 178983473.0, + "step": 4689 + }, + { + "epoch": 0.5966162065894924, + "ewc_loss": 0.0067838155664503574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.783815479138866e-05, + "grad_norm": 3.4702281951904297, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8520845174789429, + "num_tokens": 179022370.0, + "step": 4690 + }, + { + "epoch": 
0.596743416868083, + "ewc_loss": 0.006786834914237261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.786835001548752e-05, + "grad_norm": 3.420583486557007, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8772505521774292, + "num_tokens": 179064841.0, + "step": 4691 + }, + { + "epoch": 0.5968706271466735, + "ewc_loss": 0.006747403647750616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.747403676854447e-05, + "grad_norm": 3.5233888626098633, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8356713056564331, + "num_tokens": 179099399.0, + "step": 4692 + }, + { + "epoch": 0.5969978374252639, + "ewc_loss": 0.006827818229794502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.827818288002163e-05, + "grad_norm": 3.5347986221313477, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.853506863117218, + "num_tokens": 179133969.0, + "step": 4693 + }, + { + "epoch": 0.5971250477038544, + "ewc_loss": 0.00678815133869648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.788151222281158e-05, + "grad_norm": 3.5156073570251465, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8607074022293091, + "num_tokens": 179165456.0, + "step": 4694 + }, + { + "epoch": 0.597252257982445, + "ewc_loss": 0.006777307018637657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.777307135052979e-05, + "grad_norm": 3.4982707500457764, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8696355819702148, + "num_tokens": 179198209.0, + "step": 4695 + }, + { + "epoch": 0.5973794682610355, + "ewc_loss": 0.006786694750189781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.786694575566798e-05, + "grad_norm": 3.4986889362335205, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8470146656036377, + "num_tokens": 179233872.0, + "step": 4696 + }, + { + "epoch": 0.597506678539626, + "ewc_loss": 0.006789915729314089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.789915642002597e-05, + "grad_norm": 3.391659736633301, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8837215900421143, + "num_tokens": 179272721.0, + "step": 4697 + }, + { + "epoch": 0.5976338888182166, + "ewc_loss": 0.006730436813086271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.730436871293932e-05, + "grad_norm": 3.552107334136963, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8608897924423218, + "num_tokens": 179305781.0, + "step": 4698 + }, + { + "epoch": 0.597761099096807, + "ewc_loss": 0.006881469860672951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.881469744257629e-05, + "grad_norm": 3.420625686645508, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8676091432571411, + "num_tokens": 179350049.0, + "step": 4699 + }, + { + "epoch": 0.5978883093753975, + "ewc_loss": 0.006746362429112196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.746362487319857e-05, + "grad_norm": 3.4654340744018555, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8621740341186523, + "num_tokens": 179388063.0, + "step": 4700 + }, + { + "epoch": 0.598015519653988, + "ewc_loss": 0.00681662280112505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.816622772021219e-05, + "grad_norm": 3.4641542434692383, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8687655925750732, + "num_tokens": 179427607.0, + "step": 4701 + }, + { + "epoch": 0.5981427299325786, + "ewc_loss": 0.006796946283429861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.796946399845183e-05, + "grad_norm": 3.4907400608062744, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8459861278533936, + "num_tokens": 179471456.0, + "step": 4702 + }, + { + "epoch": 0.5982699402111691, + "ewc_loss": 0.006818743888288736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.818743713665754e-05, + "grad_norm": 3.418536901473999, + "learning_rate": 1e-06, + "loss": 0.3864, + 
"mean_token_accuracy": 0.8736915588378906, + "num_tokens": 179514318.0, + "step": 4703 + }, + { + "epoch": 0.5983971504897596, + "ewc_loss": 0.00676095811650157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.760958058293909e-05, + "grad_norm": 3.4593684673309326, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8746879696846008, + "num_tokens": 179553307.0, + "step": 4704 + }, + { + "epoch": 0.59852436076835, + "ewc_loss": 0.006806249264627695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.806249439250678e-05, + "grad_norm": 3.4522645473480225, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8542128205299377, + "num_tokens": 179597288.0, + "step": 4705 + }, + { + "epoch": 0.5986515710469406, + "ewc_loss": 0.006785185541957617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.785185541957617e-05, + "grad_norm": 3.6040732860565186, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.862163245677948, + "num_tokens": 179629583.0, + "step": 4706 + }, + { + "epoch": 0.5987787813255311, + "ewc_loss": 0.006868135184049606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.868135096738115e-05, + "grad_norm": 3.485093355178833, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8565242290496826, + "num_tokens": 179672104.0, + "step": 4707 + }, + { + "epoch": 0.5989059916041216, + "ewc_loss": 0.006755961570888758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.755961658200249e-05, + "grad_norm": 3.481689453125, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.854185938835144, + "num_tokens": 179712951.0, + "step": 4708 + }, + { + "epoch": 0.5990332018827121, + "ewc_loss": 0.006783806718885899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.783806747989729e-05, + "grad_norm": 3.5129947662353516, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8620487451553345, + "num_tokens": 179754626.0, + "step": 4709 + }, + { + "epoch": 0.5991604121613027, 
+ "ewc_loss": 0.00678427517414093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.784275319660082e-05, + "grad_norm": 3.4694364070892334, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8682667016983032, + "num_tokens": 179790068.0, + "step": 4710 + }, + { + "epoch": 0.5992876224398932, + "ewc_loss": 0.006756393704563379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.756393850082532e-05, + "grad_norm": 3.5455121994018555, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8427122235298157, + "num_tokens": 179823936.0, + "step": 4711 + }, + { + "epoch": 0.5994148327184836, + "ewc_loss": 0.006818675901740789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.818676047259942e-05, + "grad_norm": 3.4611458778381348, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8646077513694763, + "num_tokens": 179860920.0, + "step": 4712 + }, + { + "epoch": 0.5995420429970741, + "ewc_loss": 0.0067444234155118465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.744423444615677e-05, + "grad_norm": 3.448768377304077, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.853227972984314, + "num_tokens": 179899848.0, + "step": 4713 + }, + { + "epoch": 0.5996692532756647, + "ewc_loss": 0.006775375455617905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.775375368306413e-05, + "grad_norm": 3.4605672359466553, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8794040679931641, + "num_tokens": 179934462.0, + "step": 4714 + }, + { + "epoch": 0.5997964635542552, + "ewc_loss": 0.006792232394218445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.792232306906953e-05, + "grad_norm": 3.4849343299865723, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8621464967727661, + "num_tokens": 179973955.0, + "step": 4715 + }, + { + "epoch": 0.5999236738328457, + "ewc_loss": 0.006806021090596914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.806020974181592e-05, + 
"grad_norm": 3.441098928451538, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8639727830886841, + "num_tokens": 180015256.0, + "step": 4716 + }, + { + "epoch": 0.6000508841114363, + "ewc_loss": 0.006782794836908579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.782794662285596e-05, + "grad_norm": 3.436772584915161, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8710822463035583, + "num_tokens": 180054145.0, + "step": 4717 + }, + { + "epoch": 0.6001780943900267, + "ewc_loss": 0.006797208916395903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.797209061915055e-05, + "grad_norm": 3.4404566287994385, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8660332560539246, + "num_tokens": 180093411.0, + "step": 4718 + }, + { + "epoch": 0.6003053046686172, + "ewc_loss": 0.006799199618399143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.799199763918296e-05, + "grad_norm": 3.3979766368865967, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8707028031349182, + "num_tokens": 180135799.0, + "step": 4719 + }, + { + "epoch": 0.6004325149472077, + "ewc_loss": 0.006766828242689371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.766828300897032e-05, + "grad_norm": 3.483264207839966, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8577969074249268, + "num_tokens": 180172623.0, + "step": 4720 + }, + { + "epoch": 0.6005597252257983, + "ewc_loss": 0.006819488946348429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.819488771725446e-05, + "grad_norm": 3.4287195205688477, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8574819564819336, + "num_tokens": 180213833.0, + "step": 4721 + }, + { + "epoch": 0.6006869355043888, + "ewc_loss": 0.006766917183995247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.766917067579925e-05, + "grad_norm": 3.4216368198394775, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 
0.8706232309341431, + "num_tokens": 180254982.0, + "step": 4722 + }, + { + "epoch": 0.6008141457829793, + "ewc_loss": 0.006782369688153267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.782369746360928e-05, + "grad_norm": 3.5027880668640137, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8716871738433838, + "num_tokens": 180288544.0, + "step": 4723 + }, + { + "epoch": 0.6009413560615697, + "ewc_loss": 0.006811256520450115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.811256753280759e-05, + "grad_norm": 3.4485950469970703, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8596744537353516, + "num_tokens": 180330233.0, + "step": 4724 + }, + { + "epoch": 0.6010685663401603, + "ewc_loss": 0.0067665548995137215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.766554724890739e-05, + "grad_norm": 3.522221326828003, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8658874034881592, + "num_tokens": 180361516.0, + "step": 4725 + }, + { + "epoch": 0.6011957766187508, + "ewc_loss": 0.006829835940152407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.829835911048576e-05, + "grad_norm": 3.524164915084839, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8607069253921509, + "num_tokens": 180404380.0, + "step": 4726 + }, + { + "epoch": 0.6013229868973413, + "ewc_loss": 0.0068043433129787445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.804343138355762e-05, + "grad_norm": 3.522878646850586, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.856372058391571, + "num_tokens": 180436916.0, + "step": 4727 + }, + { + "epoch": 0.6014501971759318, + "ewc_loss": 0.006809376645833254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.809376645833254e-05, + "grad_norm": 3.468122959136963, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.844692051410675, + "num_tokens": 180476895.0, + "step": 4728 + }, + { + "epoch": 0.6015774074545224, + "ewc_loss": 
0.006781214382499456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.781214324291795e-05, + "grad_norm": 3.420099973678589, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8629633188247681, + "num_tokens": 180515248.0, + "step": 4729 + }, + { + "epoch": 0.6017046177331128, + "ewc_loss": 0.006786300800740719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.786300946259871e-05, + "grad_norm": 3.4475903511047363, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8692854046821594, + "num_tokens": 180554575.0, + "step": 4730 + }, + { + "epoch": 0.6018318280117033, + "ewc_loss": 0.006811732426285744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.811732600908726e-05, + "grad_norm": 3.455962896347046, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8526439070701599, + "num_tokens": 180594485.0, + "step": 4731 + }, + { + "epoch": 0.6019590382902938, + "ewc_loss": 0.006803227588534355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.803227734053507e-05, + "grad_norm": 3.448357582092285, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8487997055053711, + "num_tokens": 180634746.0, + "step": 4732 + }, + { + "epoch": 0.6020862485688844, + "ewc_loss": 0.0068032145500183105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.803214637329802e-05, + "grad_norm": 3.480329990386963, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8533263206481934, + "num_tokens": 180674946.0, + "step": 4733 + }, + { + "epoch": 0.6022134588474749, + "ewc_loss": 0.006837819702923298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.83781981933862e-05, + "grad_norm": 3.4304869174957275, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8622716069221497, + "num_tokens": 180715538.0, + "step": 4734 + }, + { + "epoch": 0.6023406691260654, + "ewc_loss": 0.006790031213313341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.790031329728663e-05, + "grad_norm": 
3.4416697025299072, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8614211082458496, + "num_tokens": 180755298.0, + "step": 4735 + }, + { + "epoch": 0.6024678794046558, + "ewc_loss": 0.0068236906081438065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.823690637247637e-05, + "grad_norm": 3.459747076034546, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8550576567649841, + "num_tokens": 180797104.0, + "step": 4736 + }, + { + "epoch": 0.6025950896832464, + "ewc_loss": 0.006834513507783413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.834513624198735e-05, + "grad_norm": 3.4093821048736572, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8725109696388245, + "num_tokens": 180839845.0, + "step": 4737 + }, + { + "epoch": 0.6027222999618369, + "ewc_loss": 0.006771888583898544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.771888729417697e-05, + "grad_norm": 3.43715763092041, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8761569857597351, + "num_tokens": 180878849.0, + "step": 4738 + }, + { + "epoch": 0.6028495102404274, + "ewc_loss": 0.006820513866841793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.820513954153284e-05, + "grad_norm": 3.486851453781128, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8612484931945801, + "num_tokens": 180918427.0, + "step": 4739 + }, + { + "epoch": 0.602976720519018, + "ewc_loss": 0.006824932526797056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.824932643212378e-05, + "grad_norm": 3.4793615341186523, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8555043935775757, + "num_tokens": 180956206.0, + "step": 4740 + }, + { + "epoch": 0.6031039307976085, + "ewc_loss": 0.006797454319894314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.797454261686653e-05, + "grad_norm": 3.4582359790802, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8686516284942627, + 
"num_tokens": 180996945.0, + "step": 4741 + }, + { + "epoch": 0.6032311410761989, + "ewc_loss": 0.006772206164896488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.772205961169675e-05, + "grad_norm": 3.484410285949707, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8689891695976257, + "num_tokens": 181031763.0, + "step": 4742 + }, + { + "epoch": 0.6033583513547894, + "ewc_loss": 0.006796136498451233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.796136585762724e-05, + "grad_norm": 3.436784505844116, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8706029653549194, + "num_tokens": 181070661.0, + "step": 4743 + }, + { + "epoch": 0.60348556163338, + "ewc_loss": 0.0067426180467009544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.742618279531598e-05, + "grad_norm": 3.40620756149292, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8494489192962646, + "num_tokens": 181112769.0, + "step": 4744 + }, + { + "epoch": 0.6036127719119705, + "ewc_loss": 0.006750167813152075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.750167813152075e-05, + "grad_norm": 3.4601035118103027, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8633830547332764, + "num_tokens": 181151155.0, + "step": 4745 + }, + { + "epoch": 0.603739982190561, + "ewc_loss": 0.006789905950427055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.789906183257699e-05, + "grad_norm": 3.4448952674865723, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8625240325927734, + "num_tokens": 181188689.0, + "step": 4746 + }, + { + "epoch": 0.6038671924691515, + "ewc_loss": 0.0067509850487113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.750984903192148e-05, + "grad_norm": 3.4227867126464844, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8663216829299927, + "num_tokens": 181230031.0, + "step": 4747 + }, + { + "epoch": 0.603994402747742, + "ewc_loss": 0.006744715850800276, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.744715938111767e-05, + "grad_norm": 3.429814338684082, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8628292083740234, + "num_tokens": 181271464.0, + "step": 4748 + }, + { + "epoch": 0.6041216130263325, + "ewc_loss": 0.006764634512364864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.764634599676356e-05, + "grad_norm": 3.463991403579712, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8582741022109985, + "num_tokens": 181312607.0, + "step": 4749 + }, + { + "epoch": 0.604248823304923, + "ewc_loss": 0.006778382696211338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.778382521588355e-05, + "grad_norm": 3.486523151397705, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8728795051574707, + "num_tokens": 181348628.0, + "step": 4750 + }, + { + "epoch": 0.6043760335835135, + "ewc_loss": 0.006769329775124788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.769329775124788e-05, + "grad_norm": 3.464012384414673, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.854560136795044, + "num_tokens": 181387975.0, + "step": 4751 + }, + { + "epoch": 0.6045032438621041, + "ewc_loss": 0.006755664013326168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.755664071533829e-05, + "grad_norm": 3.46370005607605, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.874542236328125, + "num_tokens": 181424163.0, + "step": 4752 + }, + { + "epoch": 0.6046304541406946, + "ewc_loss": 0.0067632501013576984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.763249984942377e-05, + "grad_norm": 3.5339314937591553, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8523539900779724, + "num_tokens": 181457002.0, + "step": 4753 + }, + { + "epoch": 0.604757664419285, + "ewc_loss": 0.006809597834944725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.809597834944725e-05, + "grad_norm": 3.5011067390441895, + "learning_rate": 
1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8586384057998657, + "num_tokens": 181494410.0, + "step": 4754 + }, + { + "epoch": 0.6048848746978756, + "ewc_loss": 0.006758308038115501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.758308154530823e-05, + "grad_norm": 3.425837993621826, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8598059415817261, + "num_tokens": 181534595.0, + "step": 4755 + }, + { + "epoch": 0.6050120849764661, + "ewc_loss": 0.006738333962857723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.738334195688367e-05, + "grad_norm": 3.4869022369384766, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8685897588729858, + "num_tokens": 181570654.0, + "step": 4756 + }, + { + "epoch": 0.6051392952550566, + "ewc_loss": 0.00679124565795064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.79124568705447e-05, + "grad_norm": 3.4612069129943848, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8568594455718994, + "num_tokens": 181608366.0, + "step": 4757 + }, + { + "epoch": 0.6052665055336471, + "ewc_loss": 0.006769460625946522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.769460742361844e-05, + "grad_norm": 3.472536325454712, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8504438400268555, + "num_tokens": 181647168.0, + "step": 4758 + }, + { + "epoch": 0.6053937158122377, + "ewc_loss": 0.006789799313992262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.78979922668077e-05, + "grad_norm": 3.417351484298706, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8661152720451355, + "num_tokens": 181689177.0, + "step": 4759 + }, + { + "epoch": 0.6055209260908282, + "ewc_loss": 0.006763215642422438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.76321578794159e-05, + "grad_norm": 3.5052716732025146, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.868249773979187, + "num_tokens": 181723742.0, + "step": 4760 + }, + { 
+ "epoch": 0.6056481363694186, + "ewc_loss": 0.0068313367664813995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.831336941104382e-05, + "grad_norm": 3.4590814113616943, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8710847496986389, + "num_tokens": 181761539.0, + "step": 4761 + }, + { + "epoch": 0.6057753466480091, + "ewc_loss": 0.006782427430152893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.78242722642608e-05, + "grad_norm": 3.5136542320251465, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8612273931503296, + "num_tokens": 181799674.0, + "step": 4762 + }, + { + "epoch": 0.6059025569265997, + "ewc_loss": 0.006821848917752504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.821849092375487e-05, + "grad_norm": 3.433590888977051, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8504838943481445, + "num_tokens": 181838784.0, + "step": 4763 + }, + { + "epoch": 0.6060297672051902, + "ewc_loss": 0.006767488084733486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.767488230252638e-05, + "grad_norm": 3.468452215194702, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8573682308197021, + "num_tokens": 181875669.0, + "step": 4764 + }, + { + "epoch": 0.6061569774837807, + "ewc_loss": 0.006803479976952076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.80348020978272e-05, + "grad_norm": 3.4928953647613525, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8462211489677429, + "num_tokens": 181912450.0, + "step": 4765 + }, + { + "epoch": 0.6062841877623713, + "ewc_loss": 0.006820516660809517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.82051686453633e-05, + "grad_norm": 3.484872579574585, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8623825311660767, + "num_tokens": 181947691.0, + "step": 4766 + }, + { + "epoch": 0.6064113980409617, + "ewc_loss": 0.006801044102758169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.801044219173491e-05, + "grad_norm": 3.4878456592559814, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8668360710144043, + "num_tokens": 181982364.0, + "step": 4767 + }, + { + "epoch": 0.6065386083195522, + "ewc_loss": 0.006818115245550871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.818115070927888e-05, + "grad_norm": 3.5446407794952393, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8631147146224976, + "num_tokens": 182017071.0, + "step": 4768 + }, + { + "epoch": 0.6066658185981427, + "ewc_loss": 0.006854104809463024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.854104867670685e-05, + "grad_norm": 3.4354629516601562, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8620814681053162, + "num_tokens": 182057234.0, + "step": 4769 + }, + { + "epoch": 0.6067930288767333, + "ewc_loss": 0.006755448877811432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.755448703188449e-05, + "grad_norm": 3.445634365081787, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8731528520584106, + "num_tokens": 182092450.0, + "step": 4770 + }, + { + "epoch": 0.6069202391553238, + "ewc_loss": 0.006818124558776617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.818124529672787e-05, + "grad_norm": 3.54026198387146, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8548591136932373, + "num_tokens": 182126398.0, + "step": 4771 + }, + { + "epoch": 0.6070474494339143, + "ewc_loss": 0.006850846577435732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.850846693851054e-05, + "grad_norm": 3.4998281002044678, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8777248859405518, + "num_tokens": 182160848.0, + "step": 4772 + }, + { + "epoch": 0.6071746597125047, + "ewc_loss": 0.006808111909776926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.808112084399909e-05, + "grad_norm": 3.3846993446350098, + "learning_rate": 1e-06, + "loss": 0.3766, + 
"mean_token_accuracy": 0.8723515272140503, + "num_tokens": 182202596.0, + "step": 4773 + }, + { + "epoch": 0.6073018699910953, + "ewc_loss": 0.006759307812899351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.759307871107012e-05, + "grad_norm": 3.460947036743164, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.863010823726654, + "num_tokens": 182239235.0, + "step": 4774 + }, + { + "epoch": 0.6074290802696858, + "ewc_loss": 0.006854627281427383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.854627281427383e-05, + "grad_norm": 3.4887375831604004, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8761025667190552, + "num_tokens": 182273662.0, + "step": 4775 + }, + { + "epoch": 0.6075562905482763, + "ewc_loss": 0.006831126753240824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.831126665929332e-05, + "grad_norm": 3.506767988204956, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8475379943847656, + "num_tokens": 182311991.0, + "step": 4776 + }, + { + "epoch": 0.6076835008268668, + "ewc_loss": 0.006845443043857813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.845442840131e-05, + "grad_norm": 3.444890022277832, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8641505241394043, + "num_tokens": 182350722.0, + "step": 4777 + }, + { + "epoch": 0.6078107111054574, + "ewc_loss": 0.006802428979426622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.80242883390747e-05, + "grad_norm": 3.502694845199585, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8647738695144653, + "num_tokens": 182386755.0, + "step": 4778 + }, + { + "epoch": 0.6079379213840478, + "ewc_loss": 0.0068739792332053185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.873979145893827e-05, + "grad_norm": 3.4914286136627197, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8785739541053772, + "num_tokens": 182423128.0, + "step": 4779 + }, + { + "epoch": 
0.6080651316626383, + "ewc_loss": 0.006824406795203686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.824406591476873e-05, + "grad_norm": 3.492077112197876, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8500285148620605, + "num_tokens": 182461518.0, + "step": 4780 + }, + { + "epoch": 0.6081923419412288, + "ewc_loss": 0.006840668153017759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.840668356744573e-05, + "grad_norm": 3.507319211959839, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8640721440315247, + "num_tokens": 182495707.0, + "step": 4781 + }, + { + "epoch": 0.6083195522198194, + "ewc_loss": 0.0068367100320756435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.836710235802457e-05, + "grad_norm": 3.3917624950408936, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8540366888046265, + "num_tokens": 182539058.0, + "step": 4782 + }, + { + "epoch": 0.6084467624984099, + "ewc_loss": 0.006776855327188969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.776855298085138e-05, + "grad_norm": 3.557805299758911, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8490099906921387, + "num_tokens": 182572482.0, + "step": 4783 + }, + { + "epoch": 0.6085739727770004, + "ewc_loss": 0.00690886378288269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.90886372467503e-05, + "grad_norm": 3.5027916431427, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8678604960441589, + "num_tokens": 182607665.0, + "step": 4784 + }, + { + "epoch": 0.6087011830555908, + "ewc_loss": 0.006826859433203936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.826859316788614e-05, + "grad_norm": 3.4254045486450195, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8645986318588257, + "num_tokens": 182650503.0, + "step": 4785 + }, + { + "epoch": 0.6088283933341814, + "ewc_loss": 0.006810949184000492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.810948980273679e-05, + "grad_norm": 3.5253257751464844, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.872506856918335, + "num_tokens": 182681662.0, + "step": 4786 + }, + { + "epoch": 0.6089556036127719, + "ewc_loss": 0.006899506784975529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.89950684318319e-05, + "grad_norm": 3.5381412506103516, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.851292610168457, + "num_tokens": 182714912.0, + "step": 4787 + }, + { + "epoch": 0.6090828138913624, + "ewc_loss": 0.00687926821410656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.879268039483577e-05, + "grad_norm": 3.550507068634033, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8597704172134399, + "num_tokens": 182748813.0, + "step": 4788 + }, + { + "epoch": 0.609210024169953, + "ewc_loss": 0.006890803575515747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.890803342685103e-05, + "grad_norm": 3.4915359020233154, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8470346927642822, + "num_tokens": 182786123.0, + "step": 4789 + }, + { + "epoch": 0.6093372344485435, + "ewc_loss": 0.0068550738506019115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.855074025224894e-05, + "grad_norm": 3.432941436767578, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8762658834457397, + "num_tokens": 182820550.0, + "step": 4790 + }, + { + "epoch": 0.6094644447271339, + "ewc_loss": 0.006841329857707024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.841329741291702e-05, + "grad_norm": 3.4265520572662354, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8639922142028809, + "num_tokens": 182863135.0, + "step": 4791 + }, + { + "epoch": 0.6095916550057244, + "ewc_loss": 0.0068563721142709255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.856372056063265e-05, + "grad_norm": 3.4311678409576416, + "learning_rate": 1e-06, + "loss": 0.4067, + 
"mean_token_accuracy": 0.8654192686080933, + "num_tokens": 182905193.0, + "step": 4792 + }, + { + "epoch": 0.609718865284315, + "ewc_loss": 0.006859524641185999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.85952472849749e-05, + "grad_norm": 3.447997808456421, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8624204397201538, + "num_tokens": 182944934.0, + "step": 4793 + }, + { + "epoch": 0.6098460755629055, + "ewc_loss": 0.006864974275231361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.864974420750514e-05, + "grad_norm": 3.4470221996307373, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8498724102973938, + "num_tokens": 182984604.0, + "step": 4794 + }, + { + "epoch": 0.609973285841496, + "ewc_loss": 0.006867329124361277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.867328920634463e-05, + "grad_norm": 3.493889808654785, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8620834946632385, + "num_tokens": 183020421.0, + "step": 4795 + }, + { + "epoch": 0.6101004961200865, + "ewc_loss": 0.006879625376313925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.879625289002433e-05, + "grad_norm": 3.463573455810547, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8540225625038147, + "num_tokens": 183057635.0, + "step": 4796 + }, + { + "epoch": 0.610227706398677, + "ewc_loss": 0.0068438989110291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.84389888192527e-05, + "grad_norm": 3.4955360889434814, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8502646684646606, + "num_tokens": 183094689.0, + "step": 4797 + }, + { + "epoch": 0.6103549166772675, + "ewc_loss": 0.006879333406686783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.879333523102105e-05, + "grad_norm": 3.5205776691436768, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.866275429725647, + "num_tokens": 183129118.0, + "step": 4798 + }, + { + "epoch": 0.610482126955858, 
+ "ewc_loss": 0.006876135710626841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.876135739730671e-05, + "grad_norm": 3.486922025680542, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8614000678062439, + "num_tokens": 183163365.0, + "step": 4799 + }, + { + "epoch": 0.6106093372344485, + "ewc_loss": 0.006860207300633192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.860207213321701e-05, + "grad_norm": 3.492696523666382, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8667229413986206, + "num_tokens": 183199271.0, + "step": 4800 + }, + { + "epoch": 0.6107365475130391, + "ewc_loss": 0.006873012986034155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.873012898722664e-05, + "grad_norm": 3.4348042011260986, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8823821544647217, + "num_tokens": 183234523.0, + "step": 4801 + }, + { + "epoch": 0.6108637577916296, + "ewc_loss": 0.006835202686488628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.835202657384798e-05, + "grad_norm": 3.526691198348999, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8625084757804871, + "num_tokens": 183267098.0, + "step": 4802 + }, + { + "epoch": 0.61099096807022, + "ewc_loss": 0.006909012328833342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.909012154210359e-05, + "grad_norm": 3.479776382446289, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8632473945617676, + "num_tokens": 183306825.0, + "step": 4803 + }, + { + "epoch": 0.6111181783488105, + "ewc_loss": 0.006844291929155588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.844291783636436e-05, + "grad_norm": 3.4384565353393555, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8621512055397034, + "num_tokens": 183348712.0, + "step": 4804 + }, + { + "epoch": 0.6112453886274011, + "ewc_loss": 0.0068281558342278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.828155892435461e-05, + "grad_norm": 
3.4364426136016846, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8740311861038208, + "num_tokens": 183383797.0, + "step": 4805 + }, + { + "epoch": 0.6113725989059916, + "ewc_loss": 0.0068499017506837845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.849901546956971e-05, + "grad_norm": 3.5101912021636963, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8744329810142517, + "num_tokens": 183417484.0, + "step": 4806 + }, + { + "epoch": 0.6114998091845821, + "ewc_loss": 0.006883321329951286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.883321475470439e-05, + "grad_norm": 3.4907190799713135, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8687022924423218, + "num_tokens": 183453808.0, + "step": 4807 + }, + { + "epoch": 0.6116270194631727, + "ewc_loss": 0.006841994822025299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.841994763817638e-05, + "grad_norm": 3.3921539783477783, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8704390525817871, + "num_tokens": 183496693.0, + "step": 4808 + }, + { + "epoch": 0.6117542297417632, + "ewc_loss": 0.006797275505959988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.797275273129344e-05, + "grad_norm": 3.473710536956787, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8652608394622803, + "num_tokens": 183536727.0, + "step": 4809 + }, + { + "epoch": 0.6118814400203536, + "ewc_loss": 0.006874879822134972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.874879909446463e-05, + "grad_norm": 3.4674201011657715, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8651518821716309, + "num_tokens": 183571809.0, + "step": 4810 + }, + { + "epoch": 0.6120086502989441, + "ewc_loss": 0.006835608743131161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.83560865581967e-05, + "grad_norm": 3.4698293209075928, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8616464138031006, + 
"num_tokens": 183610761.0, + "step": 4811 + }, + { + "epoch": 0.6121358605775347, + "ewc_loss": 0.006822710391134024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.822710565757006e-05, + "grad_norm": 3.513331651687622, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8613826036453247, + "num_tokens": 183648304.0, + "step": 4812 + }, + { + "epoch": 0.6122630708561252, + "ewc_loss": 0.006841377820819616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.841377762611955e-05, + "grad_norm": 3.4616997241973877, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8592832088470459, + "num_tokens": 183684201.0, + "step": 4813 + }, + { + "epoch": 0.6123902811347157, + "ewc_loss": 0.006806498859077692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.806499004596844e-05, + "grad_norm": 3.479632616043091, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8558050394058228, + "num_tokens": 183726595.0, + "step": 4814 + }, + { + "epoch": 0.6125174914133062, + "ewc_loss": 0.006818267051130533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.818267138442025e-05, + "grad_norm": 3.5003576278686523, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8715161085128784, + "num_tokens": 183762222.0, + "step": 4815 + }, + { + "epoch": 0.6126447016918967, + "ewc_loss": 0.0068264189176261425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.826419121352956e-05, + "grad_norm": 3.495148181915283, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.875744104385376, + "num_tokens": 183797167.0, + "step": 4816 + }, + { + "epoch": 0.6127719119704872, + "ewc_loss": 0.006816152948886156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.816152745159343e-05, + "grad_norm": 3.4323816299438477, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8618682622909546, + "num_tokens": 183839591.0, + "step": 4817 + }, + { + "epoch": 0.6128991222490777, + "ewc_loss": 0.00676914444193244, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.769144238205627e-05, + "grad_norm": 3.4752118587493896, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8630213141441345, + "num_tokens": 183877862.0, + "step": 4818 + }, + { + "epoch": 0.6130263325276682, + "ewc_loss": 0.006807423662394285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.807423778809607e-05, + "grad_norm": 3.448307991027832, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8698694705963135, + "num_tokens": 183915964.0, + "step": 4819 + }, + { + "epoch": 0.6131535428062588, + "ewc_loss": 0.006777620874345303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.77762072882615e-05, + "grad_norm": 3.5128462314605713, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8582323789596558, + "num_tokens": 183952823.0, + "step": 4820 + }, + { + "epoch": 0.6132807530848493, + "ewc_loss": 0.006816455628722906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.816455424996093e-05, + "grad_norm": 3.4606382846832275, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8544296026229858, + "num_tokens": 183991650.0, + "step": 4821 + }, + { + "epoch": 0.6134079633634397, + "ewc_loss": 0.006772225722670555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.772225606255233e-05, + "grad_norm": 3.461843967437744, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8642600178718567, + "num_tokens": 184027768.0, + "step": 4822 + }, + { + "epoch": 0.6135351736420303, + "ewc_loss": 0.006795051041990519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.795051012886688e-05, + "grad_norm": 3.4721035957336426, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8761399984359741, + "num_tokens": 184063754.0, + "step": 4823 + }, + { + "epoch": 0.6136623839206208, + "ewc_loss": 0.006797840818762779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.797840615035966e-05, + "grad_norm": 3.427946090698242, + 
"learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8568753004074097, + "num_tokens": 184105286.0, + "step": 4824 + }, + { + "epoch": 0.6137895941992113, + "ewc_loss": 0.0067777312360703945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.777731323381886e-05, + "grad_norm": 3.5421054363250732, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.867919385433197, + "num_tokens": 184139312.0, + "step": 4825 + }, + { + "epoch": 0.6139168044778018, + "ewc_loss": 0.006844015792012215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.844016024842858e-05, + "grad_norm": 3.4255340099334717, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8622442483901978, + "num_tokens": 184181541.0, + "step": 4826 + }, + { + "epoch": 0.6140440147563924, + "ewc_loss": 0.00675055431202054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.750554166501388e-05, + "grad_norm": 3.516859531402588, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.863639771938324, + "num_tokens": 184216857.0, + "step": 4827 + }, + { + "epoch": 0.6141712250349828, + "ewc_loss": 0.0068413568660616875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.841356662334874e-05, + "grad_norm": 3.4527692794799805, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8684313893318176, + "num_tokens": 184253318.0, + "step": 4828 + }, + { + "epoch": 0.6142984353135733, + "ewc_loss": 0.006769808009266853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.76980780554004e-05, + "grad_norm": 3.4210424423217773, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8747488856315613, + "num_tokens": 184298625.0, + "step": 4829 + }, + { + "epoch": 0.6144256455921638, + "ewc_loss": 0.006776613183319569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.776613008696586e-05, + "grad_norm": 3.519901752471924, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8616786003112793, + "num_tokens": 184334569.0, + 
"step": 4830 + }, + { + "epoch": 0.6145528558707544, + "ewc_loss": 0.006848595570772886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.848595512565225e-05, + "grad_norm": 3.499403476715088, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8470519781112671, + "num_tokens": 184377477.0, + "step": 4831 + }, + { + "epoch": 0.6146800661493449, + "ewc_loss": 0.006801693234592676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.801693234592676e-05, + "grad_norm": 3.4596076011657715, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8507508039474487, + "num_tokens": 184416671.0, + "step": 4832 + }, + { + "epoch": 0.6148072764279354, + "ewc_loss": 0.0067933485843241215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.793348438804969e-05, + "grad_norm": 3.4816298484802246, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.859281599521637, + "num_tokens": 184453174.0, + "step": 4833 + }, + { + "epoch": 0.6149344867065258, + "ewc_loss": 0.0068233259953558445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.823326111771166e-05, + "grad_norm": 3.4979007244110107, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.870177149772644, + "num_tokens": 184488313.0, + "step": 4834 + }, + { + "epoch": 0.6150616969851164, + "ewc_loss": 0.006816181819885969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.8161818489898e-05, + "grad_norm": 3.5131959915161133, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8440989851951599, + "num_tokens": 184523733.0, + "step": 4835 + }, + { + "epoch": 0.6151889072637069, + "ewc_loss": 0.006828326731920242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.828326877439395e-05, + "grad_norm": 3.4358067512512207, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8605689406394958, + "num_tokens": 184568399.0, + "step": 4836 + }, + { + "epoch": 0.6153161175422974, + "ewc_loss": 0.006774951703846455, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 6.774951907573268e-05, + "grad_norm": 3.4750046730041504, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8630538582801819, + "num_tokens": 184603302.0, + "step": 4837 + }, + { + "epoch": 0.615443327820888, + "ewc_loss": 0.006839110050350428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.839109846623614e-05, + "grad_norm": 3.469738721847534, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8825421929359436, + "num_tokens": 184637346.0, + "step": 4838 + }, + { + "epoch": 0.6155705380994785, + "ewc_loss": 0.0068150172010064125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.815016968175769e-05, + "grad_norm": 3.5059008598327637, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.870263934135437, + "num_tokens": 184671687.0, + "step": 4839 + }, + { + "epoch": 0.6156977483780689, + "ewc_loss": 0.006830861791968346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.830861821072176e-05, + "grad_norm": 3.47225284576416, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.873553991317749, + "num_tokens": 184706867.0, + "step": 4840 + }, + { + "epoch": 0.6158249586566594, + "ewc_loss": 0.00681422371417284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.814223888795823e-05, + "grad_norm": 3.4907641410827637, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8600916862487793, + "num_tokens": 184743916.0, + "step": 4841 + }, + { + "epoch": 0.61595216893525, + "ewc_loss": 0.006837768014520407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.837768160039559e-05, + "grad_norm": 3.4549214839935303, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.873417317867279, + "num_tokens": 184783247.0, + "step": 4842 + }, + { + "epoch": 0.6160793792138405, + "ewc_loss": 0.006799984257668257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.799984112149104e-05, + "grad_norm": 3.408548355102539, + "learning_rate": 1e-06, + "loss": 0.4005, + 
"mean_token_accuracy": 0.8640897274017334, + "num_tokens": 184826501.0, + "step": 4843 + }, + { + "epoch": 0.616206589492431, + "ewc_loss": 0.006798929069191217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.798929098295048e-05, + "grad_norm": 3.449021577835083, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8621624708175659, + "num_tokens": 184868743.0, + "step": 4844 + }, + { + "epoch": 0.6163337997710215, + "ewc_loss": 0.006826961413025856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.826961180195212e-05, + "grad_norm": 3.477550506591797, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8584048748016357, + "num_tokens": 184908992.0, + "step": 4845 + }, + { + "epoch": 0.616461010049612, + "ewc_loss": 0.0068186125718057156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.818612746428698e-05, + "grad_norm": 3.5375945568084717, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8515521287918091, + "num_tokens": 184942456.0, + "step": 4846 + }, + { + "epoch": 0.6165882203282025, + "ewc_loss": 0.006844921037554741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.844921153970063e-05, + "grad_norm": 3.563612461090088, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8545432090759277, + "num_tokens": 184975834.0, + "step": 4847 + }, + { + "epoch": 0.616715430606793, + "ewc_loss": 0.006845664232969284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.845664029242471e-05, + "grad_norm": 3.453352451324463, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8852479457855225, + "num_tokens": 185012295.0, + "step": 4848 + }, + { + "epoch": 0.6168426408853835, + "ewc_loss": 0.006771668791770935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.771668995497748e-05, + "grad_norm": 3.4631271362304688, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8691247701644897, + "num_tokens": 185049272.0, + "step": 4849 + }, + { + "epoch": 
0.6169698511639741, + "ewc_loss": 0.006817292887717485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.817292887717485e-05, + "grad_norm": 3.5311813354492188, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8750146627426147, + "num_tokens": 185081054.0, + "step": 4850 + }, + { + "epoch": 0.6170970614425646, + "ewc_loss": 0.006851160898804665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.851161015219986e-05, + "grad_norm": 3.4498984813690186, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8591219186782837, + "num_tokens": 185119809.0, + "step": 4851 + }, + { + "epoch": 0.617224271721155, + "ewc_loss": 0.006779197603464127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.779197428841144e-05, + "grad_norm": 3.4366650581359863, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8744471073150635, + "num_tokens": 185161462.0, + "step": 4852 + }, + { + "epoch": 0.6173514819997455, + "ewc_loss": 0.006804041098803282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.804041186114773e-05, + "grad_norm": 3.5449116230010986, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8539917469024658, + "num_tokens": 185196238.0, + "step": 4853 + }, + { + "epoch": 0.6174786922783361, + "ewc_loss": 0.006871908437460661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.87190840835683e-05, + "grad_norm": 3.4518015384674072, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8624894022941589, + "num_tokens": 185239317.0, + "step": 4854 + }, + { + "epoch": 0.6176059025569266, + "ewc_loss": 0.006780225317925215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.780225521652028e-05, + "grad_norm": 3.466188430786133, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8728442788124084, + "num_tokens": 185273555.0, + "step": 4855 + }, + { + "epoch": 0.6177331128355171, + "ewc_loss": 0.006816179491579533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.816179666202515e-05, + "grad_norm": 3.450873851776123, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8718082904815674, + "num_tokens": 185309721.0, + "step": 4856 + }, + { + "epoch": 0.6178603231141077, + "ewc_loss": 0.006811761762946844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.811761704739183e-05, + "grad_norm": 3.475909948348999, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8558443784713745, + "num_tokens": 185350846.0, + "step": 4857 + }, + { + "epoch": 0.6179875333926982, + "ewc_loss": 0.006808927748352289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.808927719248459e-05, + "grad_norm": 3.468431234359741, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8567707538604736, + "num_tokens": 185392472.0, + "step": 4858 + }, + { + "epoch": 0.6181147436712886, + "ewc_loss": 0.006811350584030151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.811350613133982e-05, + "grad_norm": 3.4616150856018066, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8593779802322388, + "num_tokens": 185438250.0, + "step": 4859 + }, + { + "epoch": 0.6182419539498791, + "ewc_loss": 0.00679061422124505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.790614133933559e-05, + "grad_norm": 3.444586992263794, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8613536357879639, + "num_tokens": 185479455.0, + "step": 4860 + }, + { + "epoch": 0.6183691642284697, + "ewc_loss": 0.006796981208026409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.796981324441731e-05, + "grad_norm": 3.5097594261169434, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8600971102714539, + "num_tokens": 185518719.0, + "step": 4861 + }, + { + "epoch": 0.6184963745070602, + "ewc_loss": 0.006849254015833139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.849253986729309e-05, + "grad_norm": 3.515765905380249, + "learning_rate": 1e-06, + "loss": 0.441, + 
"mean_token_accuracy": 0.8501473069190979, + "num_tokens": 185556671.0, + "step": 4862 + }, + { + "epoch": 0.6186235847856507, + "ewc_loss": 0.006821801885962486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.821801798650995e-05, + "grad_norm": 3.481341600418091, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8612682819366455, + "num_tokens": 185595063.0, + "step": 4863 + }, + { + "epoch": 0.6187507950642412, + "ewc_loss": 0.006792409811168909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.79240984027274e-05, + "grad_norm": 3.4557037353515625, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8544421195983887, + "num_tokens": 185635626.0, + "step": 4864 + }, + { + "epoch": 0.6188780053428317, + "ewc_loss": 0.006797104142606258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.79710428812541e-05, + "grad_norm": 3.499812126159668, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8529576659202576, + "num_tokens": 185671687.0, + "step": 4865 + }, + { + "epoch": 0.6190052156214222, + "ewc_loss": 0.00683708256110549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.837082764832303e-05, + "grad_norm": 3.4666261672973633, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8667474985122681, + "num_tokens": 185706459.0, + "step": 4866 + }, + { + "epoch": 0.6191324259000127, + "ewc_loss": 0.006790348328649998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.790348561480641e-05, + "grad_norm": 3.472137689590454, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8815368413925171, + "num_tokens": 185743162.0, + "step": 4867 + }, + { + "epoch": 0.6192596361786032, + "ewc_loss": 0.006818744353950024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.818744441261515e-05, + "grad_norm": 3.445546865463257, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8479431867599487, + "num_tokens": 185784036.0, + "step": 4868 + }, + { + "epoch": 
0.6193868464571938, + "ewc_loss": 0.006800660863518715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.800660776207224e-05, + "grad_norm": 3.486158847808838, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8600818514823914, + "num_tokens": 185821407.0, + "step": 4869 + }, + { + "epoch": 0.6195140567357843, + "ewc_loss": 0.006845093797892332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.845093594165519e-05, + "grad_norm": 3.5191800594329834, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8693705797195435, + "num_tokens": 185857789.0, + "step": 4870 + }, + { + "epoch": 0.6196412670143747, + "ewc_loss": 0.006857254542410374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.857254629721865e-05, + "grad_norm": 3.441605806350708, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8755639791488647, + "num_tokens": 185896989.0, + "step": 4871 + }, + { + "epoch": 0.6197684772929652, + "ewc_loss": 0.0067937858402729034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.793785723857582e-05, + "grad_norm": 3.4315221309661865, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8769041895866394, + "num_tokens": 185938123.0, + "step": 4872 + }, + { + "epoch": 0.6198956875715558, + "ewc_loss": 0.0068273083306849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.827308243373409e-05, + "grad_norm": 3.5074384212493896, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8637112379074097, + "num_tokens": 185973440.0, + "step": 4873 + }, + { + "epoch": 0.6200228978501463, + "ewc_loss": 0.006869245320558548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.86924540787004e-05, + "grad_norm": 3.542771100997925, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8639446496963501, + "num_tokens": 186010069.0, + "step": 4874 + }, + { + "epoch": 0.6201501081287368, + "ewc_loss": 0.0068429261445999146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.842926086392254e-05, + "grad_norm": 3.483182191848755, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8584791421890259, + "num_tokens": 186048365.0, + "step": 4875 + }, + { + "epoch": 0.6202773184073274, + "ewc_loss": 0.0068146297708153725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.814629887230694e-05, + "grad_norm": 3.500903606414795, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8698251247406006, + "num_tokens": 186085358.0, + "step": 4876 + }, + { + "epoch": 0.6204045286859178, + "ewc_loss": 0.006825991906225681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.825992022641003e-05, + "grad_norm": 3.5007402896881104, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8665668964385986, + "num_tokens": 186118476.0, + "step": 4877 + }, + { + "epoch": 0.6205317389645083, + "ewc_loss": 0.006819769740104675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.819769623689353e-05, + "grad_norm": 3.5010409355163574, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8602956533432007, + "num_tokens": 186155487.0, + "step": 4878 + }, + { + "epoch": 0.6206589492430988, + "ewc_loss": 0.0068159024231135845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.815902452217415e-05, + "grad_norm": 3.4446423053741455, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8695300221443176, + "num_tokens": 186192941.0, + "step": 4879 + }, + { + "epoch": 0.6207861595216894, + "ewc_loss": 0.0067957560531795025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.795756053179502e-05, + "grad_norm": 3.463650703430176, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.867659330368042, + "num_tokens": 186233860.0, + "step": 4880 + }, + { + "epoch": 0.6209133698002799, + "ewc_loss": 0.006820177659392357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.820177804911509e-05, + "grad_norm": 3.498721122741699, + "learning_rate": 1e-06, + "loss": 0.4436, + 
"mean_token_accuracy": 0.852798581123352, + "num_tokens": 186273081.0, + "step": 4881 + }, + { + "epoch": 0.6210405800788704, + "ewc_loss": 0.0068364678882062435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.836467946413904e-05, + "grad_norm": 3.4643304347991943, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8683640360832214, + "num_tokens": 186310596.0, + "step": 4882 + }, + { + "epoch": 0.6211677903574608, + "ewc_loss": 0.006798644550144672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.798644608352333e-05, + "grad_norm": 3.522526264190674, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8583465218544006, + "num_tokens": 186346262.0, + "step": 4883 + }, + { + "epoch": 0.6212950006360514, + "ewc_loss": 0.0068526724353432655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.852672231616452e-05, + "grad_norm": 3.4673640727996826, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8654632568359375, + "num_tokens": 186386508.0, + "step": 4884 + }, + { + "epoch": 0.6214222109146419, + "ewc_loss": 0.006796026136726141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.796025991206989e-05, + "grad_norm": 3.454056978225708, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.859207272529602, + "num_tokens": 186425184.0, + "step": 4885 + }, + { + "epoch": 0.6215494211932324, + "ewc_loss": 0.006794522982090712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.794522778363898e-05, + "grad_norm": 3.469759702682495, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8633276224136353, + "num_tokens": 186463072.0, + "step": 4886 + }, + { + "epoch": 0.621676631471823, + "ewc_loss": 0.006821267772465944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.821267743362114e-05, + "grad_norm": 3.4424808025360107, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8729462623596191, + "num_tokens": 186504062.0, + "step": 4887 + }, + { + "epoch": 
0.6218038417504135, + "ewc_loss": 0.006787937134504318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.787937309127301e-05, + "grad_norm": 3.4754831790924072, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8545225262641907, + "num_tokens": 186544564.0, + "step": 4888 + }, + { + "epoch": 0.6219310520290039, + "ewc_loss": 0.00681323790922761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.813237996539101e-05, + "grad_norm": 3.460214376449585, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8493615388870239, + "num_tokens": 186587196.0, + "step": 4889 + }, + { + "epoch": 0.6220582623075944, + "ewc_loss": 0.006792399100959301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.792398926336318e-05, + "grad_norm": 3.546300172805786, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8643955588340759, + "num_tokens": 186620673.0, + "step": 4890 + }, + { + "epoch": 0.622185472586185, + "ewc_loss": 0.006856171879917383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.856171967228875e-05, + "grad_norm": 3.5186946392059326, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.867455244064331, + "num_tokens": 186654028.0, + "step": 4891 + }, + { + "epoch": 0.6223126828647755, + "ewc_loss": 0.006805668119341135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.805668090237305e-05, + "grad_norm": 3.4628443717956543, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8608680367469788, + "num_tokens": 186693395.0, + "step": 4892 + }, + { + "epoch": 0.622439893143366, + "ewc_loss": 0.006789692211896181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.789692270103842e-05, + "grad_norm": 3.4885756969451904, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8455073237419128, + "num_tokens": 186734440.0, + "step": 4893 + }, + { + "epoch": 0.6225671034219565, + "ewc_loss": 0.0068255155347287655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.825515447417274e-05, + "grad_norm": 3.4802281856536865, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8666456937789917, + "num_tokens": 186769637.0, + "step": 4894 + }, + { + "epoch": 0.622694313700547, + "ewc_loss": 0.006819139700382948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.819139525759965e-05, + "grad_norm": 3.4692695140838623, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8590361475944519, + "num_tokens": 186810188.0, + "step": 4895 + }, + { + "epoch": 0.6228215239791375, + "ewc_loss": 0.006805318873375654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.805318844271824e-05, + "grad_norm": 3.473172426223755, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8540772795677185, + "num_tokens": 186847512.0, + "step": 4896 + }, + { + "epoch": 0.622948734257728, + "ewc_loss": 0.006824849639087915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.824849697295576e-05, + "grad_norm": 3.5741240978240967, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8732805252075195, + "num_tokens": 186878721.0, + "step": 4897 + }, + { + "epoch": 0.6230759445363185, + "ewc_loss": 0.006883666850626469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.883667083457112e-05, + "grad_norm": 3.441514730453491, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8674814701080322, + "num_tokens": 186920239.0, + "step": 4898 + }, + { + "epoch": 0.6232031548149091, + "ewc_loss": 0.006772557273507118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.77255738992244e-05, + "grad_norm": 3.443896532058716, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8566824793815613, + "num_tokens": 186964158.0, + "step": 4899 + }, + { + "epoch": 0.6233303650934996, + "ewc_loss": 0.0068303728476166725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.830372876720503e-05, + "grad_norm": 3.440384864807129, + "learning_rate": 1e-06, + "loss": 0.3631, + 
"mean_token_accuracy": 0.8789259791374207, + "num_tokens": 187003176.0, + "step": 4900 + }, + { + "epoch": 0.62345757537209, + "ewc_loss": 0.006835324224084616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.835324165876955e-05, + "grad_norm": 3.444324493408203, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8621813058853149, + "num_tokens": 187046152.0, + "step": 4901 + }, + { + "epoch": 0.6235847856506805, + "ewc_loss": 0.006814209278672934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.814209336880594e-05, + "grad_norm": 3.4552149772644043, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8754860162734985, + "num_tokens": 187086629.0, + "step": 4902 + }, + { + "epoch": 0.6237119959292711, + "ewc_loss": 0.006822831463068724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.822831346653402e-05, + "grad_norm": 3.5190916061401367, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8574012517929077, + "num_tokens": 187122856.0, + "step": 4903 + }, + { + "epoch": 0.6238392062078616, + "ewc_loss": 0.006863616872578859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.863616727059707e-05, + "grad_norm": 3.451342821121216, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8773105144500732, + "num_tokens": 187161568.0, + "step": 4904 + }, + { + "epoch": 0.6239664164864521, + "ewc_loss": 0.006791132967919111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.79113290971145e-05, + "grad_norm": 3.44696044921875, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.864491879940033, + "num_tokens": 187202831.0, + "step": 4905 + }, + { + "epoch": 0.6240936267650427, + "ewc_loss": 0.006812991574406624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.81299134157598e-05, + "grad_norm": 3.4625391960144043, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8705750107765198, + "num_tokens": 187239059.0, + "step": 4906 + }, + { + "epoch": 
0.6242208370436332, + "ewc_loss": 0.006808022502809763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.808022590121254e-05, + "grad_norm": 3.52606201171875, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8577641844749451, + "num_tokens": 187272940.0, + "step": 4907 + }, + { + "epoch": 0.6243480473222236, + "ewc_loss": 0.00684194965288043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.84194965288043e-05, + "grad_norm": 3.5379247665405273, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8759510517120361, + "num_tokens": 187310963.0, + "step": 4908 + }, + { + "epoch": 0.6244752576008141, + "ewc_loss": 0.0068191103637218475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.819110421929508e-05, + "grad_norm": 3.465056896209717, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8818039894104004, + "num_tokens": 187345698.0, + "step": 4909 + }, + { + "epoch": 0.6246024678794047, + "ewc_loss": 0.006770514417439699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.770514301024377e-05, + "grad_norm": 3.4746804237365723, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8624905347824097, + "num_tokens": 187384762.0, + "step": 4910 + }, + { + "epoch": 0.6247296781579952, + "ewc_loss": 0.006809361279010773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.809361366322264e-05, + "grad_norm": 3.494189739227295, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8544182777404785, + "num_tokens": 187424288.0, + "step": 4911 + }, + { + "epoch": 0.6248568884365857, + "ewc_loss": 0.006820871960371733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.820871931267902e-05, + "grad_norm": 3.464604377746582, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8720828294754028, + "num_tokens": 187461481.0, + "step": 4912 + }, + { + "epoch": 0.6249840987151762, + "ewc_loss": 0.006785496138036251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.785496225347742e-05, + "grad_norm": 3.503199577331543, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8785465955734253, + "num_tokens": 187497690.0, + "step": 4913 + }, + { + "epoch": 0.6251113089937667, + "ewc_loss": 0.006838104221969843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.838104309281334e-05, + "grad_norm": 3.5012502670288086, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8578632473945618, + "num_tokens": 187537825.0, + "step": 4914 + }, + { + "epoch": 0.6252385192723572, + "ewc_loss": 0.006809080019593239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.809079786762595e-05, + "grad_norm": 3.408480644226074, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.873496413230896, + "num_tokens": 187583998.0, + "step": 4915 + }, + { + "epoch": 0.6253657295509477, + "ewc_loss": 0.006755836308002472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.755836511729285e-05, + "grad_norm": 3.49800443649292, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8628877401351929, + "num_tokens": 187621962.0, + "step": 4916 + }, + { + "epoch": 0.6254929398295382, + "ewc_loss": 0.006825219839811325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.825220043538138e-05, + "grad_norm": 3.491743803024292, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.870864987373352, + "num_tokens": 187655913.0, + "step": 4917 + }, + { + "epoch": 0.6256201501081288, + "ewc_loss": 0.00679361168295145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.793611828470603e-05, + "grad_norm": 3.498277425765991, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.855804443359375, + "num_tokens": 187692433.0, + "step": 4918 + }, + { + "epoch": 0.6257473603867193, + "ewc_loss": 0.006803601048886776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.803600990679115e-05, + "grad_norm": 3.5138885974884033, + "learning_rate": 1e-06, + "loss": 0.3653, + 
"mean_token_accuracy": 0.8747813701629639, + "num_tokens": 187724311.0, + "step": 4919 + }, + { + "epoch": 0.6258745706653097, + "ewc_loss": 0.006820878479629755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.820878479629755e-05, + "grad_norm": 3.455014705657959, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8720495700836182, + "num_tokens": 187766614.0, + "step": 4920 + }, + { + "epoch": 0.6260017809439002, + "ewc_loss": 0.006782012525945902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.782012496842071e-05, + "grad_norm": 3.486307382583618, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8743661046028137, + "num_tokens": 187804847.0, + "step": 4921 + }, + { + "epoch": 0.6261289912224908, + "ewc_loss": 0.006814747117459774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.814747030148283e-05, + "grad_norm": 3.4580531120300293, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8660383224487305, + "num_tokens": 187842791.0, + "step": 4922 + }, + { + "epoch": 0.6262562015010813, + "ewc_loss": 0.006789959967136383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.789960025344044e-05, + "grad_norm": 3.533207654953003, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.855510413646698, + "num_tokens": 187878304.0, + "step": 4923 + }, + { + "epoch": 0.6263834117796718, + "ewc_loss": 0.006851923651993275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.851923535577953e-05, + "grad_norm": 3.4740054607391357, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8656967878341675, + "num_tokens": 187915022.0, + "step": 4924 + }, + { + "epoch": 0.6265106220582624, + "ewc_loss": 0.006795603781938553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.795603985665366e-05, + "grad_norm": 3.529794931411743, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8653871417045593, + "num_tokens": 187950533.0, + "step": 4925 + }, + { + "epoch": 
0.6266378323368528, + "ewc_loss": 0.006847540382295847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.847540498711169e-05, + "grad_norm": 3.484764337539673, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8604080080986023, + "num_tokens": 187989893.0, + "step": 4926 + }, + { + "epoch": 0.6267650426154433, + "ewc_loss": 0.006818727124482393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.818726978963241e-05, + "grad_norm": 3.5172927379608154, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8579025268554688, + "num_tokens": 188025031.0, + "step": 4927 + }, + { + "epoch": 0.6268922528940338, + "ewc_loss": 0.006846318952739239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.846318865427747e-05, + "grad_norm": 3.4962234497070312, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8547797799110413, + "num_tokens": 188060797.0, + "step": 4928 + }, + { + "epoch": 0.6270194631726244, + "ewc_loss": 0.006828321143984795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.828321056673303e-05, + "grad_norm": 3.5081887245178223, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8560446500778198, + "num_tokens": 188096271.0, + "step": 4929 + }, + { + "epoch": 0.6271466734512149, + "ewc_loss": 0.006844942457973957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.844942254247144e-05, + "grad_norm": 3.442955732345581, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8643659353256226, + "num_tokens": 188136697.0, + "step": 4930 + }, + { + "epoch": 0.6272738837298054, + "ewc_loss": 0.006818580906838179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.818580732215196e-05, + "grad_norm": 3.4737417697906494, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8626283407211304, + "num_tokens": 188174715.0, + "step": 4931 + }, + { + "epoch": 0.6274010940083958, + "ewc_loss": 0.006859961431473494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.859961285954341e-05, + "grad_norm": 3.5435497760772705, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8423894643783569, + "num_tokens": 188216379.0, + "step": 4932 + }, + { + "epoch": 0.6275283042869864, + "ewc_loss": 0.006890720222145319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.890720396768302e-05, + "grad_norm": 3.4659643173217773, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8680907487869263, + "num_tokens": 188252183.0, + "step": 4933 + }, + { + "epoch": 0.6276555145655769, + "ewc_loss": 0.006821081507951021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.821081478847191e-05, + "grad_norm": 3.4339075088500977, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8666691780090332, + "num_tokens": 188291128.0, + "step": 4934 + }, + { + "epoch": 0.6277827248441674, + "ewc_loss": 0.006847403012216091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.847402983112261e-05, + "grad_norm": 3.4857513904571533, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8469433784484863, + "num_tokens": 188334256.0, + "step": 4935 + }, + { + "epoch": 0.627909935122758, + "ewc_loss": 0.006878431420773268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.878431304357946e-05, + "grad_norm": 3.481816053390503, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8575058579444885, + "num_tokens": 188373207.0, + "step": 4936 + }, + { + "epoch": 0.6280371454013485, + "ewc_loss": 0.006856447085738182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.856446998426691e-05, + "grad_norm": 3.4641804695129395, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8594440221786499, + "num_tokens": 188414937.0, + "step": 4937 + }, + { + "epoch": 0.6281643556799389, + "ewc_loss": 0.00684714550152421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.847145414212719e-05, + "grad_norm": 3.439729690551758, + "learning_rate": 1e-06, + "loss": 0.4166, + 
"mean_token_accuracy": 0.8601171970367432, + "num_tokens": 188455398.0, + "step": 4938 + }, + { + "epoch": 0.6282915659585294, + "ewc_loss": 0.006839736830443144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.839737034169957e-05, + "grad_norm": 3.498539447784424, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8635626435279846, + "num_tokens": 188496561.0, + "step": 4939 + }, + { + "epoch": 0.62841877623712, + "ewc_loss": 0.00687436992302537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.874369864817709e-05, + "grad_norm": 3.514181613922119, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8680976033210754, + "num_tokens": 188531318.0, + "step": 4940 + }, + { + "epoch": 0.6285459865157105, + "ewc_loss": 0.006854355800896883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.854355888208374e-05, + "grad_norm": 3.535059690475464, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8541350364685059, + "num_tokens": 188566807.0, + "step": 4941 + }, + { + "epoch": 0.628673196794301, + "ewc_loss": 0.00687850546091795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.87850551912561e-05, + "grad_norm": 3.452014923095703, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8592526316642761, + "num_tokens": 188608062.0, + "step": 4942 + }, + { + "epoch": 0.6288004070728915, + "ewc_loss": 0.006806112825870514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.806112651247531e-05, + "grad_norm": 3.4555916786193848, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8756693601608276, + "num_tokens": 188644953.0, + "step": 4943 + }, + { + "epoch": 0.628927617351482, + "ewc_loss": 0.006838948931545019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.838949047960341e-05, + "grad_norm": 3.3995912075042725, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8775473833084106, + "num_tokens": 188687720.0, + "step": 4944 + }, + { + "epoch": 0.6290548276300725, 
+ "ewc_loss": 0.006795006804168224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.795006629545242e-05, + "grad_norm": 3.491975784301758, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8568828105926514, + "num_tokens": 188730077.0, + "step": 4945 + }, + { + "epoch": 0.629182037908663, + "ewc_loss": 0.006871365010738373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.871364894323051e-05, + "grad_norm": 3.4593448638916016, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8693315386772156, + "num_tokens": 188768306.0, + "step": 4946 + }, + { + "epoch": 0.6293092481872535, + "ewc_loss": 0.006813229992985725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.813229992985725e-05, + "grad_norm": 3.5414676666259766, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8424733877182007, + "num_tokens": 188807480.0, + "step": 4947 + }, + { + "epoch": 0.6294364584658441, + "ewc_loss": 0.006872869562357664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.872869562357664e-05, + "grad_norm": 3.602567672729492, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8541994094848633, + "num_tokens": 188836383.0, + "step": 4948 + }, + { + "epoch": 0.6295636687444346, + "ewc_loss": 0.006884724833071232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.884725007694215e-05, + "grad_norm": 3.4396610260009766, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8673737049102783, + "num_tokens": 188876079.0, + "step": 4949 + }, + { + "epoch": 0.629690879023025, + "ewc_loss": 0.006769133266061544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.769133324269205e-05, + "grad_norm": 3.4647719860076904, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8613181114196777, + "num_tokens": 188912936.0, + "step": 4950 + }, + { + "epoch": 0.6298180893016155, + "ewc_loss": 0.006849534809589386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.849534838693216e-05, + 
"grad_norm": 3.4805219173431396, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8632419109344482, + "num_tokens": 188951475.0, + "step": 4951 + }, + { + "epoch": 0.6299452995802061, + "ewc_loss": 0.0068288566544651985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.828856567153707e-05, + "grad_norm": 3.4039745330810547, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8488401174545288, + "num_tokens": 188996972.0, + "step": 4952 + }, + { + "epoch": 0.6300725098587966, + "ewc_loss": 0.0067971632815897465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.797163223382086e-05, + "grad_norm": 3.44761323928833, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8817785382270813, + "num_tokens": 189030706.0, + "step": 4953 + }, + { + "epoch": 0.6301997201373871, + "ewc_loss": 0.006854057777673006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.854057573946193e-05, + "grad_norm": 3.4720804691314697, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8852741718292236, + "num_tokens": 189065638.0, + "step": 4954 + }, + { + "epoch": 0.6303269304159776, + "ewc_loss": 0.006853093393146992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.853093509562314e-05, + "grad_norm": 3.4686784744262695, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8769069910049438, + "num_tokens": 189101747.0, + "step": 4955 + }, + { + "epoch": 0.6304541406945681, + "ewc_loss": 0.006848935503512621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.848935299785808e-05, + "grad_norm": 3.446042776107788, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8593024611473083, + "num_tokens": 189142965.0, + "step": 4956 + }, + { + "epoch": 0.6305813509731586, + "ewc_loss": 0.0068228235468268394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.822823343100026e-05, + "grad_norm": 3.511870861053467, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 
0.8699377775192261, + "num_tokens": 189177822.0, + "step": 4957 + }, + { + "epoch": 0.6307085612517491, + "ewc_loss": 0.006880912929773331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.880913133500144e-05, + "grad_norm": 3.48321795463562, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8579738736152649, + "num_tokens": 189215710.0, + "step": 4958 + }, + { + "epoch": 0.6308357715303397, + "ewc_loss": 0.006827058736234903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.827058678027242e-05, + "grad_norm": 3.5809991359710693, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.837096095085144, + "num_tokens": 189250788.0, + "step": 4959 + }, + { + "epoch": 0.6309629818089302, + "ewc_loss": 0.00690267700701952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.90267697791569e-05, + "grad_norm": 3.6323394775390625, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.861556887626648, + "num_tokens": 189278451.0, + "step": 4960 + }, + { + "epoch": 0.6310901920875207, + "ewc_loss": 0.006907562725245953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.907562783453614e-05, + "grad_norm": 3.4514644145965576, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8673702478408813, + "num_tokens": 189316857.0, + "step": 4961 + }, + { + "epoch": 0.6312174023661112, + "ewc_loss": 0.006789823062717915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.789823237340897e-05, + "grad_norm": 3.4456465244293213, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8754258155822754, + "num_tokens": 189356600.0, + "step": 4962 + }, + { + "epoch": 0.6313446126447017, + "ewc_loss": 0.006864654365926981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.86465427861549e-05, + "grad_norm": 3.4947166442871094, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8580674529075623, + "num_tokens": 189393134.0, + "step": 4963 + }, + { + "epoch": 0.6314718229232922, + "ewc_loss": 
0.00689028762280941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.890287477290258e-05, + "grad_norm": 3.459355592727661, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.848141610622406, + "num_tokens": 189433431.0, + "step": 4964 + }, + { + "epoch": 0.6315990332018827, + "ewc_loss": 0.006845882162451744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.845882307970896e-05, + "grad_norm": 3.4901225566864014, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8498282432556152, + "num_tokens": 189473738.0, + "step": 4965 + }, + { + "epoch": 0.6317262434804732, + "ewc_loss": 0.006872792262583971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.872792437206954e-05, + "grad_norm": 3.501507520675659, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8769171237945557, + "num_tokens": 189509930.0, + "step": 4966 + }, + { + "epoch": 0.6318534537590638, + "ewc_loss": 0.00687854178249836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.878541898913682e-05, + "grad_norm": 3.490647792816162, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8679796457290649, + "num_tokens": 189547801.0, + "step": 4967 + }, + { + "epoch": 0.6319806640376543, + "ewc_loss": 0.0068670944310724735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.867094634799287e-05, + "grad_norm": 3.48480224609375, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8762618899345398, + "num_tokens": 189582226.0, + "step": 4968 + }, + { + "epoch": 0.6321078743162447, + "ewc_loss": 0.0068808565847575665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.880856381030753e-05, + "grad_norm": 3.4817867279052734, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8616983890533447, + "num_tokens": 189622915.0, + "step": 4969 + }, + { + "epoch": 0.6322350845948352, + "ewc_loss": 0.006867163814604282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.867163756396621e-05, + "grad_norm": 
3.475376844406128, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8717514872550964, + "num_tokens": 189658182.0, + "step": 4970 + }, + { + "epoch": 0.6323622948734258, + "ewc_loss": 0.00685661006718874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.856609979877248e-05, + "grad_norm": 3.4718425273895264, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8666490912437439, + "num_tokens": 189696627.0, + "step": 4971 + }, + { + "epoch": 0.6324895051520163, + "ewc_loss": 0.006862464360892773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.86246421537362e-05, + "grad_norm": 3.5850799083709717, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8614065051078796, + "num_tokens": 189729826.0, + "step": 4972 + }, + { + "epoch": 0.6326167154306068, + "ewc_loss": 0.006924753542989492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.924753688508645e-05, + "grad_norm": 3.4701828956604004, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.868302583694458, + "num_tokens": 189771504.0, + "step": 4973 + }, + { + "epoch": 0.6327439257091974, + "ewc_loss": 0.006814825814217329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.814825610490516e-05, + "grad_norm": 3.4489877223968506, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8804339170455933, + "num_tokens": 189809220.0, + "step": 4974 + }, + { + "epoch": 0.6328711359877878, + "ewc_loss": 0.006850718520581722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.850718636997044e-05, + "grad_norm": 3.5394387245178223, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8731915950775146, + "num_tokens": 189849586.0, + "step": 4975 + }, + { + "epoch": 0.6329983462663783, + "ewc_loss": 0.006885451730340719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.885451875859872e-05, + "grad_norm": 3.5076067447662354, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.864301323890686, + 
"num_tokens": 189885054.0, + "step": 4976 + }, + { + "epoch": 0.6331255565449688, + "ewc_loss": 0.006826295983046293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.826296157669276e-05, + "grad_norm": 3.535907030105591, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8569463491439819, + "num_tokens": 189918286.0, + "step": 4977 + }, + { + "epoch": 0.6332527668235594, + "ewc_loss": 0.006850073114037514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.850073259556666e-05, + "grad_norm": 3.475914716720581, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.844950258731842, + "num_tokens": 189956845.0, + "step": 4978 + }, + { + "epoch": 0.6333799771021499, + "ewc_loss": 0.006808442994952202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.808443140471354e-05, + "grad_norm": 3.429229259490967, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8808407187461853, + "num_tokens": 189994827.0, + "step": 4979 + }, + { + "epoch": 0.6335071873807404, + "ewc_loss": 0.006793334614485502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.793334614485502e-05, + "grad_norm": 3.44711971282959, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8586023449897766, + "num_tokens": 190036066.0, + "step": 4980 + }, + { + "epoch": 0.6336343976593308, + "ewc_loss": 0.006829647347331047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.829647463746369e-05, + "grad_norm": 3.4884707927703857, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8750196695327759, + "num_tokens": 190073326.0, + "step": 4981 + }, + { + "epoch": 0.6337616079379214, + "ewc_loss": 0.006836078595370054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.836078682681546e-05, + "grad_norm": 3.464752435684204, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8617703914642334, + "num_tokens": 190109125.0, + "step": 4982 + }, + { + "epoch": 0.6338888182165119, + "ewc_loss": 0.006822865456342697, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.822865543654189e-05, + "grad_norm": 3.4847099781036377, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8694196939468384, + "num_tokens": 190148520.0, + "step": 4983 + }, + { + "epoch": 0.6340160284951024, + "ewc_loss": 0.006833508610725403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.833508814452216e-05, + "grad_norm": 3.4922738075256348, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8654779195785522, + "num_tokens": 190184272.0, + "step": 4984 + }, + { + "epoch": 0.6341432387736929, + "ewc_loss": 0.0068221562542021275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.822156137786806e-05, + "grad_norm": 3.490032196044922, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8628458976745605, + "num_tokens": 190221483.0, + "step": 4985 + }, + { + "epoch": 0.6342704490522835, + "ewc_loss": 0.006827895995229483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.827896140748635e-05, + "grad_norm": 3.410839319229126, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8752177953720093, + "num_tokens": 190266430.0, + "step": 4986 + }, + { + "epoch": 0.6343976593308739, + "ewc_loss": 0.00678684376180172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.786843732697889e-05, + "grad_norm": 3.419023036956787, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.875131368637085, + "num_tokens": 190308575.0, + "step": 4987 + }, + { + "epoch": 0.6345248696094644, + "ewc_loss": 0.006800612900406122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.80061275488697e-05, + "grad_norm": 3.480889081954956, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8743226528167725, + "num_tokens": 190344584.0, + "step": 4988 + }, + { + "epoch": 0.6346520798880549, + "ewc_loss": 0.006823682691901922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.823682633694261e-05, + "grad_norm": 3.577455759048462, + "learning_rate": 
1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.872220516204834, + "num_tokens": 190376497.0, + "step": 4989 + }, + { + "epoch": 0.6347792901666455, + "ewc_loss": 0.006853976286947727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.853976083220914e-05, + "grad_norm": 3.48414945602417, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8524562120437622, + "num_tokens": 190416879.0, + "step": 4990 + }, + { + "epoch": 0.634906500445236, + "ewc_loss": 0.00675777904689312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.757779192412272e-05, + "grad_norm": 3.457158088684082, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8841500282287598, + "num_tokens": 190454684.0, + "step": 4991 + }, + { + "epoch": 0.6350337107238265, + "ewc_loss": 0.006785155739635229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.785155710531399e-05, + "grad_norm": 3.589207410812378, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8571946620941162, + "num_tokens": 190490524.0, + "step": 4992 + }, + { + "epoch": 0.635160921002417, + "ewc_loss": 0.006853173486888409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.85317354509607e-05, + "grad_norm": 3.478485107421875, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.871151328086853, + "num_tokens": 190526558.0, + "step": 4993 + }, + { + "epoch": 0.6352881312810075, + "ewc_loss": 0.006748688407242298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.748688610969111e-05, + "grad_norm": 3.4231817722320557, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8642858266830444, + "num_tokens": 190571206.0, + "step": 4994 + }, + { + "epoch": 0.635415341559598, + "ewc_loss": 0.0067596035078167915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.759603274986148e-05, + "grad_norm": 3.463104486465454, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8516515493392944, + "num_tokens": 190611480.0, + "step": 4995 + }, + { + 
"epoch": 0.6355425518381885, + "ewc_loss": 0.006791580934077501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.791581108700484e-05, + "grad_norm": 3.551945924758911, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8377799987792969, + "num_tokens": 190647699.0, + "step": 4996 + }, + { + "epoch": 0.6356697621167791, + "ewc_loss": 0.006831703707575798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.831703649368137e-05, + "grad_norm": 3.4791767597198486, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.864033579826355, + "num_tokens": 190684863.0, + "step": 4997 + }, + { + "epoch": 0.6357969723953696, + "ewc_loss": 0.006764763966202736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.764764111721888e-05, + "grad_norm": 3.4988348484039307, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8504153490066528, + "num_tokens": 190725006.0, + "step": 4998 + }, + { + "epoch": 0.63592418267396, + "ewc_loss": 0.006825431250035763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.82543104630895e-05, + "grad_norm": 3.536133289337158, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8602187633514404, + "num_tokens": 190757431.0, + "step": 4999 + }, + { + "epoch": 0.6360513929525505, + "ewc_loss": 0.006833361927419901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.83336184010841e-05, + "grad_norm": 3.509232759475708, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8687276840209961, + "num_tokens": 190793118.0, + "step": 5000 + }, + { + "epoch": 0.6361786032311411, + "ewc_loss": 0.0068183899857103825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.818390102125704e-05, + "grad_norm": 3.4142236709594727, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.87420254945755, + "num_tokens": 190837105.0, + "step": 5001 + }, + { + "epoch": 0.6363058135097316, + "ewc_loss": 0.006776551250368357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.776551163056865e-05, + "grad_norm": 3.491098165512085, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8581207394599915, + "num_tokens": 190875372.0, + "step": 5002 + }, + { + "epoch": 0.6364330237883221, + "ewc_loss": 0.0068479785695672035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.847978511359543e-05, + "grad_norm": 3.4429125785827637, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8695483207702637, + "num_tokens": 190921307.0, + "step": 5003 + }, + { + "epoch": 0.6365602340669126, + "ewc_loss": 0.006794715765863657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.794715591240674e-05, + "grad_norm": 3.4483439922332764, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8653233051300049, + "num_tokens": 190967919.0, + "step": 5004 + }, + { + "epoch": 0.6366874443455031, + "ewc_loss": 0.006808462552726269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.808462785556912e-05, + "grad_norm": 3.487105369567871, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8634921312332153, + "num_tokens": 191005668.0, + "step": 5005 + }, + { + "epoch": 0.6368146546240936, + "ewc_loss": 0.006828179117292166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.828179175499827e-05, + "grad_norm": 3.5234053134918213, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8524898886680603, + "num_tokens": 191044455.0, + "step": 5006 + }, + { + "epoch": 0.6369418649026841, + "ewc_loss": 0.006822341587394476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.822341674705967e-05, + "grad_norm": 3.4176437854766846, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8690539002418518, + "num_tokens": 191086369.0, + "step": 5007 + }, + { + "epoch": 0.6370690751812746, + "ewc_loss": 0.0067658498883247375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.765849684597924e-05, + "grad_norm": 3.5016379356384277, + "learning_rate": 1e-06, + "loss": 0.4239, + 
"mean_token_accuracy": 0.859684407711029, + "num_tokens": 191127070.0, + "step": 5008 + }, + { + "epoch": 0.6371962854598652, + "ewc_loss": 0.006834921892732382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.83492180542089e-05, + "grad_norm": 3.4509568214416504, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8773989677429199, + "num_tokens": 191163623.0, + "step": 5009 + }, + { + "epoch": 0.6373234957384557, + "ewc_loss": 0.006761026568710804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.761026452295482e-05, + "grad_norm": 3.5806331634521484, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8612877130508423, + "num_tokens": 191196790.0, + "step": 5010 + }, + { + "epoch": 0.6374507060170462, + "ewc_loss": 0.006861899048089981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.861898873466998e-05, + "grad_norm": 3.546053171157837, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8628177642822266, + "num_tokens": 191230265.0, + "step": 5011 + }, + { + "epoch": 0.6375779162956366, + "ewc_loss": 0.006785122212022543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.785122241126373e-05, + "grad_norm": 3.5234296321868896, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8565351963043213, + "num_tokens": 191266781.0, + "step": 5012 + }, + { + "epoch": 0.6377051265742272, + "ewc_loss": 0.006791478488594294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.791478517698124e-05, + "grad_norm": 3.4558017253875732, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.855111837387085, + "num_tokens": 191305869.0, + "step": 5013 + }, + { + "epoch": 0.6378323368528177, + "ewc_loss": 0.00679083913564682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.790838961023837e-05, + "grad_norm": 3.5020487308502197, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8525789380073547, + "num_tokens": 191343498.0, + "step": 5014 + }, + { + "epoch": 
0.6379595471314082, + "ewc_loss": 0.006820521783083677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.82052195770666e-05, + "grad_norm": 3.511878728866577, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.849765419960022, + "num_tokens": 191383260.0, + "step": 5015 + }, + { + "epoch": 0.6380867574099988, + "ewc_loss": 0.006830879487097263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.83087928337045e-05, + "grad_norm": 3.541451930999756, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8534513711929321, + "num_tokens": 191417350.0, + "step": 5016 + }, + { + "epoch": 0.6382139676885893, + "ewc_loss": 0.006845083553344011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.845083407824859e-05, + "grad_norm": 3.5092928409576416, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8652744293212891, + "num_tokens": 191454336.0, + "step": 5017 + }, + { + "epoch": 0.6383411779671797, + "ewc_loss": 0.006822856143116951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.82285608490929e-05, + "grad_norm": 3.4918341636657715, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8665437698364258, + "num_tokens": 191489977.0, + "step": 5018 + }, + { + "epoch": 0.6384683882457702, + "ewc_loss": 0.006834360770881176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.834360829088837e-05, + "grad_norm": 3.500683307647705, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8538994789123535, + "num_tokens": 191524622.0, + "step": 5019 + }, + { + "epoch": 0.6385955985243608, + "ewc_loss": 0.0068739173002541065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.873917300254107e-05, + "grad_norm": 3.492713212966919, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8538846969604492, + "num_tokens": 191565389.0, + "step": 5020 + }, + { + "epoch": 0.6387228088029513, + "ewc_loss": 0.006862810347229242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.862810550956056e-05, + "grad_norm": 3.509476661682129, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.882797122001648, + "num_tokens": 191604640.0, + "step": 5021 + }, + { + "epoch": 0.6388500190815418, + "ewc_loss": 0.006865647621452808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.865647446829826e-05, + "grad_norm": 3.4145257472991943, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8641149997711182, + "num_tokens": 191651029.0, + "step": 5022 + }, + { + "epoch": 0.6389772293601323, + "ewc_loss": 0.0068120164796710014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.81201636325568e-05, + "grad_norm": 3.4437255859375, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8639793395996094, + "num_tokens": 191690524.0, + "step": 5023 + }, + { + "epoch": 0.6391044396387228, + "ewc_loss": 0.006857242900878191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.857242988189682e-05, + "grad_norm": 3.4871585369110107, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8590700626373291, + "num_tokens": 191733992.0, + "step": 5024 + }, + { + "epoch": 0.6392316499173133, + "ewc_loss": 0.006869344972074032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.869345088489354e-05, + "grad_norm": 3.4761621952056885, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8764024972915649, + "num_tokens": 191769717.0, + "step": 5025 + }, + { + "epoch": 0.6393588601959038, + "ewc_loss": 0.006858219392597675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.858219421701506e-05, + "grad_norm": 3.4637720584869385, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.849362313747406, + "num_tokens": 191813334.0, + "step": 5026 + }, + { + "epoch": 0.6394860704744944, + "ewc_loss": 0.0068476381711661816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.847637996543199e-05, + "grad_norm": 3.50154709815979, + "learning_rate": 1e-06, + "loss": 0.3991, + 
"mean_token_accuracy": 0.8660362958908081, + "num_tokens": 191848295.0, + "step": 5027 + }, + { + "epoch": 0.6396132807530849, + "ewc_loss": 0.006860824767500162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.860824942123145e-05, + "grad_norm": 3.4579100608825684, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.857704222202301, + "num_tokens": 191885329.0, + "step": 5028 + }, + { + "epoch": 0.6397404910316754, + "ewc_loss": 0.006833643652498722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.833643419668078e-05, + "grad_norm": 3.548717975616455, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8758873343467712, + "num_tokens": 191918070.0, + "step": 5029 + }, + { + "epoch": 0.6398677013102658, + "ewc_loss": 0.006912631914019585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.912631943123415e-05, + "grad_norm": 3.4012980461120605, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8670680522918701, + "num_tokens": 191966084.0, + "step": 5030 + }, + { + "epoch": 0.6399949115888564, + "ewc_loss": 0.00679280050098896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.792800559196621e-05, + "grad_norm": 3.505136013031006, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8627766370773315, + "num_tokens": 192004907.0, + "step": 5031 + }, + { + "epoch": 0.6401221218674469, + "ewc_loss": 0.0069006565026938915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.90065644448623e-05, + "grad_norm": 3.541361093521118, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8446868658065796, + "num_tokens": 192042942.0, + "step": 5032 + }, + { + "epoch": 0.6402493321460374, + "ewc_loss": 0.006880502216517925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.880502041894943e-05, + "grad_norm": 3.4942314624786377, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8659298419952393, + "num_tokens": 192081133.0, + "step": 5033 + }, + { + "epoch": 
0.6403765424246279, + "ewc_loss": 0.006843192502856255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.843192386440933e-05, + "grad_norm": 3.4856905937194824, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8647266626358032, + "num_tokens": 192120171.0, + "step": 5034 + }, + { + "epoch": 0.6405037527032185, + "ewc_loss": 0.006853088270872831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.853088416391984e-05, + "grad_norm": 3.4663193225860596, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8654842972755432, + "num_tokens": 192159583.0, + "step": 5035 + }, + { + "epoch": 0.6406309629818089, + "ewc_loss": 0.006841457914561033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.841457798145711e-05, + "grad_norm": 3.5304150581359863, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8623340129852295, + "num_tokens": 192199513.0, + "step": 5036 + }, + { + "epoch": 0.6407581732603994, + "ewc_loss": 0.006884512025862932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.884511822136119e-05, + "grad_norm": 3.5848515033721924, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8666404485702515, + "num_tokens": 192228352.0, + "step": 5037 + }, + { + "epoch": 0.6408853835389899, + "ewc_loss": 0.006882011890411377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.882011803099886e-05, + "grad_norm": 3.455601930618286, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8595839738845825, + "num_tokens": 192264480.0, + "step": 5038 + }, + { + "epoch": 0.6410125938175805, + "ewc_loss": 0.006803189869970083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.803189899073914e-05, + "grad_norm": 3.4899227619171143, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8613706827163696, + "num_tokens": 192303307.0, + "step": 5039 + }, + { + "epoch": 0.641139804096171, + "ewc_loss": 0.006868232507258654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.868232594570145e-05, + "grad_norm": 3.4873275756835938, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8672806620597839, + "num_tokens": 192342064.0, + "step": 5040 + }, + { + "epoch": 0.6412670143747615, + "ewc_loss": 0.006857642438262701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.857642438262701e-05, + "grad_norm": 3.4828572273254395, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8715880513191223, + "num_tokens": 192379297.0, + "step": 5041 + }, + { + "epoch": 0.6413942246533519, + "ewc_loss": 0.0068564144894480705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.856414256617427e-05, + "grad_norm": 3.466279983520508, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8685136437416077, + "num_tokens": 192416087.0, + "step": 5042 + }, + { + "epoch": 0.6415214349319425, + "ewc_loss": 0.006868814583867788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.86881467117928e-05, + "grad_norm": 3.4991774559020996, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8687444925308228, + "num_tokens": 192454421.0, + "step": 5043 + }, + { + "epoch": 0.641648645210533, + "ewc_loss": 0.0068849558010697365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.884955655550584e-05, + "grad_norm": 3.4661083221435547, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8612319231033325, + "num_tokens": 192493860.0, + "step": 5044 + }, + { + "epoch": 0.6417758554891235, + "ewc_loss": 0.0068561662919819355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.856166146462783e-05, + "grad_norm": 3.490375280380249, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8556865453720093, + "num_tokens": 192533672.0, + "step": 5045 + }, + { + "epoch": 0.641903065767714, + "ewc_loss": 0.006890187039971352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.890187069075182e-05, + "grad_norm": 3.5348610877990723, + "learning_rate": 1e-06, + "loss": 0.4328, + 
"mean_token_accuracy": 0.8510743379592896, + "num_tokens": 192569569.0, + "step": 5046 + }, + { + "epoch": 0.6420302760463046, + "ewc_loss": 0.006896520033478737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.896520062582567e-05, + "grad_norm": 3.4465603828430176, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8719077706336975, + "num_tokens": 192610716.0, + "step": 5047 + }, + { + "epoch": 0.642157486324895, + "ewc_loss": 0.0068422057665884495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.84220576658845e-05, + "grad_norm": 3.4904563426971436, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8592228889465332, + "num_tokens": 192648040.0, + "step": 5048 + }, + { + "epoch": 0.6422846966034855, + "ewc_loss": 0.0069088442251086235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.908844079589471e-05, + "grad_norm": 3.4769999980926514, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8613613843917847, + "num_tokens": 192690063.0, + "step": 5049 + }, + { + "epoch": 0.6424119068820761, + "ewc_loss": 0.006882591173052788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.882590969325975e-05, + "grad_norm": 3.506697416305542, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8581339716911316, + "num_tokens": 192732067.0, + "step": 5050 + }, + { + "epoch": 0.6425391171606666, + "ewc_loss": 0.006886439863592386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.886439950903878e-05, + "grad_norm": 3.4767866134643555, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.873356819152832, + "num_tokens": 192768598.0, + "step": 5051 + }, + { + "epoch": 0.6426663274392571, + "ewc_loss": 0.006857915781438351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.857916014268994e-05, + "grad_norm": 3.5509300231933594, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8354002833366394, + "num_tokens": 192805420.0, + "step": 5052 + }, + { + "epoch": 
0.6427935377178476, + "ewc_loss": 0.006904030218720436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.904030306031927e-05, + "grad_norm": 3.4249420166015625, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8593548536300659, + "num_tokens": 192852082.0, + "step": 5053 + }, + { + "epoch": 0.6429207479964381, + "ewc_loss": 0.006802820134907961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.802820280427113e-05, + "grad_norm": 3.462536334991455, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8651273250579834, + "num_tokens": 192892846.0, + "step": 5054 + }, + { + "epoch": 0.6430479582750286, + "ewc_loss": 0.006869153119623661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.869153003208339e-05, + "grad_norm": 3.418102979660034, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8813905715942383, + "num_tokens": 192932091.0, + "step": 5055 + }, + { + "epoch": 0.6431751685536191, + "ewc_loss": 0.006818101275712252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.818101246608421e-05, + "grad_norm": 3.5832231044769287, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8657527565956116, + "num_tokens": 192960628.0, + "step": 5056 + }, + { + "epoch": 0.6433023788322096, + "ewc_loss": 0.0069308471865952015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.930847303010523e-05, + "grad_norm": 3.473257541656494, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8568958640098572, + "num_tokens": 193002075.0, + "step": 5057 + }, + { + "epoch": 0.6434295891108002, + "ewc_loss": 0.006803401745855808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.803401629440486e-05, + "grad_norm": 3.478576183319092, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.852921724319458, + "num_tokens": 193039720.0, + "step": 5058 + }, + { + "epoch": 0.6435567993893907, + "ewc_loss": 0.006857587955892086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.857587868580595e-05, + "grad_norm": 3.5179190635681152, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.865426778793335, + "num_tokens": 193072806.0, + "step": 5059 + }, + { + "epoch": 0.6436840096679812, + "ewc_loss": 0.006875298917293549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.87529900460504e-05, + "grad_norm": 3.4970149993896484, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8723810911178589, + "num_tokens": 193108478.0, + "step": 5060 + }, + { + "epoch": 0.6438112199465716, + "ewc_loss": 0.006846812088042498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.846812175353989e-05, + "grad_norm": 3.451293706893921, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8676049709320068, + "num_tokens": 193150663.0, + "step": 5061 + }, + { + "epoch": 0.6439384302251622, + "ewc_loss": 0.006833020597696304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.833020597696304e-05, + "grad_norm": 3.42879319190979, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8668131828308105, + "num_tokens": 193191499.0, + "step": 5062 + }, + { + "epoch": 0.6440656405037527, + "ewc_loss": 0.006828950252383947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.82895042700693e-05, + "grad_norm": 3.4636707305908203, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8558571338653564, + "num_tokens": 193236103.0, + "step": 5063 + }, + { + "epoch": 0.6441928507823432, + "ewc_loss": 0.006863979157060385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.863979069748893e-05, + "grad_norm": 3.5586776733398438, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8418021202087402, + "num_tokens": 193274095.0, + "step": 5064 + }, + { + "epoch": 0.6443200610609338, + "ewc_loss": 0.006899353116750717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.89935332047753e-05, + "grad_norm": 3.4479482173919678, + "learning_rate": 1e-06, + "loss": 0.3921, + 
"mean_token_accuracy": 0.8693112730979919, + "num_tokens": 193316988.0, + "step": 5065 + }, + { + "epoch": 0.6444472713395243, + "ewc_loss": 0.006810068618506193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.810068589402363e-05, + "grad_norm": 3.4541494846343994, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8623573780059814, + "num_tokens": 193357430.0, + "step": 5066 + }, + { + "epoch": 0.6445744816181147, + "ewc_loss": 0.0068457769230008125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.845776806585491e-05, + "grad_norm": 3.491138219833374, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8708584308624268, + "num_tokens": 193390392.0, + "step": 5067 + }, + { + "epoch": 0.6447016918967052, + "ewc_loss": 0.006858423817902803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.858423876110464e-05, + "grad_norm": 3.4566762447357178, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8606865406036377, + "num_tokens": 193430961.0, + "step": 5068 + }, + { + "epoch": 0.6448289021752958, + "ewc_loss": 0.006827543489634991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.827543256804347e-05, + "grad_norm": 3.4452195167541504, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8618991374969482, + "num_tokens": 193473767.0, + "step": 5069 + }, + { + "epoch": 0.6449561124538863, + "ewc_loss": 0.006826508790254593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.82650861563161e-05, + "grad_norm": 3.5610620975494385, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8519678115844727, + "num_tokens": 193511375.0, + "step": 5070 + }, + { + "epoch": 0.6450833227324768, + "ewc_loss": 0.006909293122589588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.909293006174266e-05, + "grad_norm": 3.4909656047821045, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8740047812461853, + "num_tokens": 193551296.0, + "step": 5071 + }, + { + "epoch": 
0.6452105330110673, + "ewc_loss": 0.006833592429757118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.833592487964779e-05, + "grad_norm": 3.429166316986084, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8668828010559082, + "num_tokens": 193596484.0, + "step": 5072 + }, + { + "epoch": 0.6453377432896578, + "ewc_loss": 0.006812033709138632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.812033825553954e-05, + "grad_norm": 3.4724667072296143, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8705697059631348, + "num_tokens": 193637513.0, + "step": 5073 + }, + { + "epoch": 0.6454649535682483, + "ewc_loss": 0.006836518179625273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.836518150521442e-05, + "grad_norm": 3.547969341278076, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8587192893028259, + "num_tokens": 193672808.0, + "step": 5074 + }, + { + "epoch": 0.6455921638468388, + "ewc_loss": 0.006871372926980257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.871372897876427e-05, + "grad_norm": 3.4677481651306152, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8783981204032898, + "num_tokens": 193713018.0, + "step": 5075 + }, + { + "epoch": 0.6457193741254293, + "ewc_loss": 0.006786458194255829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.786458106944337e-05, + "grad_norm": 3.4743776321411133, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.865165650844574, + "num_tokens": 193757649.0, + "step": 5076 + }, + { + "epoch": 0.6458465844040199, + "ewc_loss": 0.006828244775533676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.828244659118354e-05, + "grad_norm": 3.543551445007324, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8615698218345642, + "num_tokens": 193792390.0, + "step": 5077 + }, + { + "epoch": 0.6459737946826104, + "ewc_loss": 0.006861093454062939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.861093424959108e-05, + "grad_norm": 3.4832770824432373, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.869773805141449, + "num_tokens": 193827721.0, + "step": 5078 + }, + { + "epoch": 0.6461010049612008, + "ewc_loss": 0.006790024694055319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.79002478136681e-05, + "grad_norm": 3.5360107421875, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8545862436294556, + "num_tokens": 193867217.0, + "step": 5079 + }, + { + "epoch": 0.6462282152397913, + "ewc_loss": 0.0068482388742268085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.84823899064213e-05, + "grad_norm": 3.540083885192871, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8712388873100281, + "num_tokens": 193900128.0, + "step": 5080 + }, + { + "epoch": 0.6463554255183819, + "ewc_loss": 0.006823189556598663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.82318932376802e-05, + "grad_norm": 3.519674062728882, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8725461959838867, + "num_tokens": 193935744.0, + "step": 5081 + }, + { + "epoch": 0.6464826357969724, + "ewc_loss": 0.006823175586760044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.823175499448553e-05, + "grad_norm": 3.442476272583008, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8621813058853149, + "num_tokens": 193977344.0, + "step": 5082 + }, + { + "epoch": 0.6466098460755629, + "ewc_loss": 0.006783901248127222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.783901335438713e-05, + "grad_norm": 3.5321273803710938, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8388907313346863, + "num_tokens": 194014295.0, + "step": 5083 + }, + { + "epoch": 0.6467370563541535, + "ewc_loss": 0.006873103324323893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.87310312059708e-05, + "grad_norm": 3.47790789604187, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 
0.850225031375885, + "num_tokens": 194055245.0, + "step": 5084 + }, + { + "epoch": 0.6468642666327439, + "ewc_loss": 0.006822615396231413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.82261525071226e-05, + "grad_norm": 3.534771203994751, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8566185235977173, + "num_tokens": 194089313.0, + "step": 5085 + }, + { + "epoch": 0.6469914769113344, + "ewc_loss": 0.006870439276099205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.870439392514527e-05, + "grad_norm": 3.427412271499634, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8696125149726868, + "num_tokens": 194128293.0, + "step": 5086 + }, + { + "epoch": 0.6471186871899249, + "ewc_loss": 0.006796188186854124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.796188245061785e-05, + "grad_norm": 3.483275890350342, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8571794033050537, + "num_tokens": 194166632.0, + "step": 5087 + }, + { + "epoch": 0.6472458974685155, + "ewc_loss": 0.0068796975538134575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.879697320982814e-05, + "grad_norm": 3.465596914291382, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8509871959686279, + "num_tokens": 194209176.0, + "step": 5088 + }, + { + "epoch": 0.647373107747106, + "ewc_loss": 0.006845466326922178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.845466123195365e-05, + "grad_norm": 3.524097204208374, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8537020087242126, + "num_tokens": 194245617.0, + "step": 5089 + }, + { + "epoch": 0.6475003180256965, + "ewc_loss": 0.006898613180965185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.898613355588168e-05, + "grad_norm": 3.455167055130005, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8715382814407349, + "num_tokens": 194285983.0, + "step": 5090 + }, + { + "epoch": 0.6476275283042869, + "ewc_loss": 
0.006836862303316593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.836862303316593e-05, + "grad_norm": 3.5614991188049316, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8612109422683716, + "num_tokens": 194317088.0, + "step": 5091 + }, + { + "epoch": 0.6477547385828775, + "ewc_loss": 0.006929637398570776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.929637311259285e-05, + "grad_norm": 3.499603271484375, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.869326651096344, + "num_tokens": 194354025.0, + "step": 5092 + }, + { + "epoch": 0.647881948861468, + "ewc_loss": 0.006857967469841242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.857967673568055e-05, + "grad_norm": 3.4724838733673096, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8677893877029419, + "num_tokens": 194388539.0, + "step": 5093 + }, + { + "epoch": 0.6480091591400585, + "ewc_loss": 0.006869226694107056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.869226490380242e-05, + "grad_norm": 3.48736572265625, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8667535781860352, + "num_tokens": 194424518.0, + "step": 5094 + }, + { + "epoch": 0.648136369418649, + "ewc_loss": 0.006890708114951849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.890708027640358e-05, + "grad_norm": 3.4701504707336426, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.860731840133667, + "num_tokens": 194463310.0, + "step": 5095 + }, + { + "epoch": 0.6482635796972396, + "ewc_loss": 0.006885244511067867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.885244511067867e-05, + "grad_norm": 3.4916961193084717, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8629262447357178, + "num_tokens": 194499911.0, + "step": 5096 + }, + { + "epoch": 0.64839078997583, + "ewc_loss": 0.006894586607813835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.894586840644479e-05, + "grad_norm": 
3.528907299041748, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8551149964332581, + "num_tokens": 194536731.0, + "step": 5097 + }, + { + "epoch": 0.6485180002544205, + "ewc_loss": 0.006920590531080961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.920590385561809e-05, + "grad_norm": 3.518672227859497, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8531689047813416, + "num_tokens": 194574014.0, + "step": 5098 + }, + { + "epoch": 0.648645210533011, + "ewc_loss": 0.006900493986904621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.900494190631434e-05, + "grad_norm": 3.4781081676483154, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8553814888000488, + "num_tokens": 194611944.0, + "step": 5099 + }, + { + "epoch": 0.6487724208116016, + "ewc_loss": 0.006902800872921944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.90280066919513e-05, + "grad_norm": 3.5057854652404785, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8724923729896545, + "num_tokens": 194649734.0, + "step": 5100 + }, + { + "epoch": 0.6488996310901921, + "ewc_loss": 0.0069267405197024345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.926740752533078e-05, + "grad_norm": 3.45450758934021, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8719568848609924, + "num_tokens": 194688344.0, + "step": 5101 + }, + { + "epoch": 0.6490268413687826, + "ewc_loss": 0.006881408393383026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.88140862621367e-05, + "grad_norm": 3.4817800521850586, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8497790098190308, + "num_tokens": 194728582.0, + "step": 5102 + }, + { + "epoch": 0.649154051647373, + "ewc_loss": 0.0069179898127913475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.9179899583105e-05, + "grad_norm": 3.494011640548706, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8625508546829224, + "num_tokens": 
194762753.0, + "step": 5103 + }, + { + "epoch": 0.6492812619259636, + "ewc_loss": 0.006922983564436436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.922983448021114e-05, + "grad_norm": 3.43769907951355, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8775091767311096, + "num_tokens": 194802325.0, + "step": 5104 + }, + { + "epoch": 0.6494084722045541, + "ewc_loss": 0.006889240350574255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.889240466989577e-05, + "grad_norm": 3.4495468139648438, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8625755310058594, + "num_tokens": 194846349.0, + "step": 5105 + }, + { + "epoch": 0.6495356824831446, + "ewc_loss": 0.006905050948262215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.905051122885197e-05, + "grad_norm": 3.5068767070770264, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8654814958572388, + "num_tokens": 194880334.0, + "step": 5106 + }, + { + "epoch": 0.6496628927617352, + "ewc_loss": 0.00694007333368063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.940073217265308e-05, + "grad_norm": 3.524724245071411, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8546786308288574, + "num_tokens": 194919343.0, + "step": 5107 + }, + { + "epoch": 0.6497901030403257, + "ewc_loss": 0.006917075254023075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.917075370438397e-05, + "grad_norm": 3.4627163410186768, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8643409013748169, + "num_tokens": 194958800.0, + "step": 5108 + }, + { + "epoch": 0.6499173133189162, + "ewc_loss": 0.0068808915093541145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.880891305627301e-05, + "grad_norm": 3.498887777328491, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8371107578277588, + "num_tokens": 194997249.0, + "step": 5109 + }, + { + "epoch": 0.6500445235975066, + "ewc_loss": 0.006926662288606167, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.926662172190845e-05, + "grad_norm": 3.5025599002838135, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8651208877563477, + "num_tokens": 195035879.0, + "step": 5110 + }, + { + "epoch": 0.6501717338760972, + "ewc_loss": 0.006885488983243704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.885488983243704e-05, + "grad_norm": 3.4290239810943604, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.859860897064209, + "num_tokens": 195075925.0, + "step": 5111 + }, + { + "epoch": 0.6502989441546877, + "ewc_loss": 0.006865228526294231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.865228351671249e-05, + "grad_norm": 3.520242691040039, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8662220239639282, + "num_tokens": 195110185.0, + "step": 5112 + }, + { + "epoch": 0.6504261544332782, + "ewc_loss": 0.0069467234425246716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.946723442524672e-05, + "grad_norm": 3.4233884811401367, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8553930521011353, + "num_tokens": 195154666.0, + "step": 5113 + }, + { + "epoch": 0.6505533647118688, + "ewc_loss": 0.006845945492386818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.84594560880214e-05, + "grad_norm": 3.516946792602539, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8745290637016296, + "num_tokens": 195188998.0, + "step": 5114 + }, + { + "epoch": 0.6506805749904593, + "ewc_loss": 0.006942762993276119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.942763138795272e-05, + "grad_norm": 3.4791128635406494, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.87004554271698, + "num_tokens": 195224124.0, + "step": 5115 + }, + { + "epoch": 0.6508077852690497, + "ewc_loss": 0.006892122328281403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.892122473800555e-05, + "grad_norm": 3.602311849594116, + "learning_rate": 
1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8639224767684937, + "num_tokens": 195253601.0, + "step": 5116 + }, + { + "epoch": 0.6509349955476402, + "ewc_loss": 0.0069642928428947926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.96429269737564e-05, + "grad_norm": 3.5031650066375732, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8549308776855469, + "num_tokens": 195289045.0, + "step": 5117 + }, + { + "epoch": 0.6510622058262308, + "ewc_loss": 0.006876267492771149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.876267434563488e-05, + "grad_norm": 3.469273090362549, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8650698065757751, + "num_tokens": 195326983.0, + "step": 5118 + }, + { + "epoch": 0.6511894161048213, + "ewc_loss": 0.0069025494158267975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.902549648657441e-05, + "grad_norm": 3.5177383422851562, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8755278587341309, + "num_tokens": 195360425.0, + "step": 5119 + }, + { + "epoch": 0.6513166263834118, + "ewc_loss": 0.0069296881556510925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.929688242962584e-05, + "grad_norm": 3.466031789779663, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8530306816101074, + "num_tokens": 195402648.0, + "step": 5120 + }, + { + "epoch": 0.6514438366620023, + "ewc_loss": 0.006874534301459789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.874534301459789e-05, + "grad_norm": 3.4653542041778564, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8807450532913208, + "num_tokens": 195440945.0, + "step": 5121 + }, + { + "epoch": 0.6515710469405928, + "ewc_loss": 0.006908802781254053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.90880260663107e-05, + "grad_norm": 3.44734263420105, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8653815388679504, + "num_tokens": 195481872.0, + "step": 5122 + }, 
+ { + "epoch": 0.6516982572191833, + "ewc_loss": 0.006883383262902498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.883383321110159e-05, + "grad_norm": 3.4396250247955322, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8678276538848877, + "num_tokens": 195525460.0, + "step": 5123 + }, + { + "epoch": 0.6518254674977738, + "ewc_loss": 0.006872713100165129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.872713129268959e-05, + "grad_norm": 3.477581262588501, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8662847280502319, + "num_tokens": 195563522.0, + "step": 5124 + }, + { + "epoch": 0.6519526777763643, + "ewc_loss": 0.0069032772444188595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.90327724441886e-05, + "grad_norm": 3.500138998031616, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8733831644058228, + "num_tokens": 195598114.0, + "step": 5125 + }, + { + "epoch": 0.6520798880549549, + "ewc_loss": 0.00689427275210619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.894272519275546e-05, + "grad_norm": 3.46317458152771, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8596859574317932, + "num_tokens": 195639826.0, + "step": 5126 + }, + { + "epoch": 0.6522070983335454, + "ewc_loss": 0.006856909953057766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.856909749330953e-05, + "grad_norm": 3.465179443359375, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8710471391677856, + "num_tokens": 195682047.0, + "step": 5127 + }, + { + "epoch": 0.6523343086121358, + "ewc_loss": 0.006880469154566526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.880469300085679e-05, + "grad_norm": 3.524655818939209, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8683533668518066, + "num_tokens": 195722453.0, + "step": 5128 + }, + { + "epoch": 0.6524615188907263, + "ewc_loss": 0.006905013229697943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.905013287905604e-05, + "grad_norm": 3.4727227687835693, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8597790002822876, + "num_tokens": 195759182.0, + "step": 5129 + }, + { + "epoch": 0.6525887291693169, + "ewc_loss": 0.006858508102595806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.858508277218789e-05, + "grad_norm": 3.48771333694458, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8662642240524292, + "num_tokens": 195795641.0, + "step": 5130 + }, + { + "epoch": 0.6527159394479074, + "ewc_loss": 0.006882750894874334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.882751040393487e-05, + "grad_norm": 3.45809006690979, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8742998838424683, + "num_tokens": 195831200.0, + "step": 5131 + }, + { + "epoch": 0.6528431497264979, + "ewc_loss": 0.006859976798295975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.859976565465331e-05, + "grad_norm": 3.4570677280426025, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.865760087966919, + "num_tokens": 195875110.0, + "step": 5132 + }, + { + "epoch": 0.6529703600050885, + "ewc_loss": 0.006869227159768343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.869227217976004e-05, + "grad_norm": 3.432807683944702, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8671143054962158, + "num_tokens": 195916724.0, + "step": 5133 + }, + { + "epoch": 0.6530975702836789, + "ewc_loss": 0.006851870566606522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.85187042108737e-05, + "grad_norm": 3.548417568206787, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8581020832061768, + "num_tokens": 195956219.0, + "step": 5134 + }, + { + "epoch": 0.6532247805622694, + "ewc_loss": 0.0069060479290783405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.90604792907834e-05, + "grad_norm": 3.4976911544799805, + "learning_rate": 1e-06, + "loss": 0.3937, + 
"mean_token_accuracy": 0.8669147491455078, + "num_tokens": 195991563.0, + "step": 5135 + }, + { + "epoch": 0.6533519908408599, + "ewc_loss": 0.006857593543827534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.857593689346686e-05, + "grad_norm": 3.479189395904541, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8708728551864624, + "num_tokens": 196031864.0, + "step": 5136 + }, + { + "epoch": 0.6534792011194505, + "ewc_loss": 0.006837260909378529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.83726102579385e-05, + "grad_norm": 3.5230281352996826, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8553884625434875, + "num_tokens": 196067208.0, + "step": 5137 + }, + { + "epoch": 0.653606411398041, + "ewc_loss": 0.006870071869343519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.87007195665501e-05, + "grad_norm": 3.530333995819092, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.867235541343689, + "num_tokens": 196103934.0, + "step": 5138 + }, + { + "epoch": 0.6537336216766315, + "ewc_loss": 0.006872186437249184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.872186349937692e-05, + "grad_norm": 3.5068867206573486, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8593268394470215, + "num_tokens": 196140886.0, + "step": 5139 + }, + { + "epoch": 0.6538608319552219, + "ewc_loss": 0.006852836813777685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.852836668258533e-05, + "grad_norm": 3.464308261871338, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8736480474472046, + "num_tokens": 196178940.0, + "step": 5140 + }, + { + "epoch": 0.6539880422338125, + "ewc_loss": 0.006846490781754255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.846490578027442e-05, + "grad_norm": 3.412419080734253, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8648450970649719, + "num_tokens": 196229461.0, + "step": 5141 + }, + { + "epoch": 
0.654115252512403, + "ewc_loss": 0.006822424009442329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.822423893027008e-05, + "grad_norm": 3.4789276123046875, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8768945932388306, + "num_tokens": 196265793.0, + "step": 5142 + }, + { + "epoch": 0.6542424627909935, + "ewc_loss": 0.006870971992611885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.870971992611885e-05, + "grad_norm": 3.503912925720215, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8724019527435303, + "num_tokens": 196304919.0, + "step": 5143 + }, + { + "epoch": 0.654369673069584, + "ewc_loss": 0.006843296345323324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.843296432634816e-05, + "grad_norm": 3.4221532344818115, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8794432878494263, + "num_tokens": 196344761.0, + "step": 5144 + }, + { + "epoch": 0.6544968833481746, + "ewc_loss": 0.006801516283303499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.801516428822652e-05, + "grad_norm": 3.510739803314209, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8797316551208496, + "num_tokens": 196382625.0, + "step": 5145 + }, + { + "epoch": 0.654624093626765, + "ewc_loss": 0.006879636086523533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.879636202938855e-05, + "grad_norm": 3.445223093032837, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8756052255630493, + "num_tokens": 196422279.0, + "step": 5146 + }, + { + "epoch": 0.6547513039053555, + "ewc_loss": 0.006794150453060865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.794150249334052e-05, + "grad_norm": 3.4773130416870117, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8611890077590942, + "num_tokens": 196463684.0, + "step": 5147 + }, + { + "epoch": 0.654878514183946, + "ewc_loss": 0.00684043625369668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.84043625369668e-05, + "grad_norm": 3.453575372695923, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8675837516784668, + "num_tokens": 196506319.0, + "step": 5148 + }, + { + "epoch": 0.6550057244625366, + "ewc_loss": 0.006800927687436342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.800927803851664e-05, + "grad_norm": 3.4919402599334717, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8579717874526978, + "num_tokens": 196546367.0, + "step": 5149 + }, + { + "epoch": 0.6551329347411271, + "ewc_loss": 0.0068366569466888905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.836657121311873e-05, + "grad_norm": 3.629901170730591, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8511616587638855, + "num_tokens": 196576598.0, + "step": 5150 + }, + { + "epoch": 0.6552601450197176, + "ewc_loss": 0.006904066074639559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.904065958224237e-05, + "grad_norm": 3.523495674133301, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8589823842048645, + "num_tokens": 196610633.0, + "step": 5151 + }, + { + "epoch": 0.655387355298308, + "ewc_loss": 0.006806603167206049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.806603050790727e-05, + "grad_norm": 3.5254485607147217, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8551757335662842, + "num_tokens": 196646183.0, + "step": 5152 + }, + { + "epoch": 0.6555145655768986, + "ewc_loss": 0.006843565497547388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.84356564306654e-05, + "grad_norm": 3.4019715785980225, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8736428022384644, + "num_tokens": 196689454.0, + "step": 5153 + }, + { + "epoch": 0.6556417758554891, + "ewc_loss": 0.006791092921048403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.791092891944572e-05, + "grad_norm": 3.482896327972412, + "learning_rate": 1e-06, + "loss": 0.4475, + 
"mean_token_accuracy": 0.8488611578941345, + "num_tokens": 196731820.0, + "step": 5154 + }, + { + "epoch": 0.6557689861340796, + "ewc_loss": 0.006880546920001507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.88054715283215e-05, + "grad_norm": 3.5856053829193115, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8565168976783752, + "num_tokens": 196768542.0, + "step": 5155 + }, + { + "epoch": 0.6558961964126702, + "ewc_loss": 0.006897918414324522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.897918501636013e-05, + "grad_norm": 3.4758894443511963, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8497258424758911, + "num_tokens": 196810300.0, + "step": 5156 + }, + { + "epoch": 0.6560234066912607, + "ewc_loss": 0.006797730922698975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.797730748075992e-05, + "grad_norm": 3.4887797832489014, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.847640335559845, + "num_tokens": 196847271.0, + "step": 5157 + }, + { + "epoch": 0.6561506169698512, + "ewc_loss": 0.006852140184491873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.852140359114856e-05, + "grad_norm": 3.5045504570007324, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8792530298233032, + "num_tokens": 196880439.0, + "step": 5158 + }, + { + "epoch": 0.6562778272484416, + "ewc_loss": 0.006855104584246874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.855104584246874e-05, + "grad_norm": 3.5337202548980713, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8565632104873657, + "num_tokens": 196914911.0, + "step": 5159 + }, + { + "epoch": 0.6564050375270322, + "ewc_loss": 0.006879695691168308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.879695865791291e-05, + "grad_norm": 3.5108559131622314, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8641173243522644, + "num_tokens": 196949419.0, + "step": 5160 + }, + { + "epoch": 
0.6565322478056227, + "ewc_loss": 0.006863931659609079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.863931776024401e-05, + "grad_norm": 3.5070793628692627, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8568926453590393, + "num_tokens": 196989401.0, + "step": 5161 + }, + { + "epoch": 0.6566594580842132, + "ewc_loss": 0.006873813923448324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.873813981655985e-05, + "grad_norm": 3.4766695499420166, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8579419851303101, + "num_tokens": 197028377.0, + "step": 5162 + }, + { + "epoch": 0.6567866683628037, + "ewc_loss": 0.006871193181723356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.871193181723356e-05, + "grad_norm": 3.5230557918548584, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.844849705696106, + "num_tokens": 197068334.0, + "step": 5163 + }, + { + "epoch": 0.6569138786413943, + "ewc_loss": 0.006907003466039896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.907003262313083e-05, + "grad_norm": 3.443512439727783, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8666527271270752, + "num_tokens": 197105978.0, + "step": 5164 + }, + { + "epoch": 0.6570410889199847, + "ewc_loss": 0.006844417657703161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.844417657703161e-05, + "grad_norm": 3.4287939071655273, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8482566475868225, + "num_tokens": 197148641.0, + "step": 5165 + }, + { + "epoch": 0.6571682991985752, + "ewc_loss": 0.006869306322187185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.869306525913998e-05, + "grad_norm": 3.4994587898254395, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8439979553222656, + "num_tokens": 197188820.0, + "step": 5166 + }, + { + "epoch": 0.6572955094771658, + "ewc_loss": 0.0069184270687401295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.918427243363112e-05, + "grad_norm": 3.5111076831817627, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8522651791572571, + "num_tokens": 197227109.0, + "step": 5167 + }, + { + "epoch": 0.6574227197557563, + "ewc_loss": 0.0069006746634840965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.900674634380266e-05, + "grad_norm": 3.499321937561035, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8685177564620972, + "num_tokens": 197264083.0, + "step": 5168 + }, + { + "epoch": 0.6575499300343468, + "ewc_loss": 0.006890862248837948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.890862277941778e-05, + "grad_norm": 3.4872188568115234, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8602596521377563, + "num_tokens": 197301349.0, + "step": 5169 + }, + { + "epoch": 0.6576771403129373, + "ewc_loss": 0.006903397850692272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.903398025315255e-05, + "grad_norm": 3.4423635005950928, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.853801965713501, + "num_tokens": 197348176.0, + "step": 5170 + }, + { + "epoch": 0.6578043505915278, + "ewc_loss": 0.006877364125102758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.877363921375945e-05, + "grad_norm": 3.565033435821533, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8567385673522949, + "num_tokens": 197380490.0, + "step": 5171 + }, + { + "epoch": 0.6579315608701183, + "ewc_loss": 0.0069778854958713055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.977885641390458e-05, + "grad_norm": 3.495530366897583, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8514240980148315, + "num_tokens": 197423075.0, + "step": 5172 + }, + { + "epoch": 0.6580587711487088, + "ewc_loss": 0.006883906666189432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.883906462462619e-05, + "grad_norm": 3.515589714050293, + "learning_rate": 1e-06, + "loss": 0.3975, + 
"mean_token_accuracy": 0.8683943152427673, + "num_tokens": 197461730.0, + "step": 5173 + }, + { + "epoch": 0.6581859814272993, + "ewc_loss": 0.006931816227734089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.931816460564733e-05, + "grad_norm": 3.4732768535614014, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8534644246101379, + "num_tokens": 197505801.0, + "step": 5174 + }, + { + "epoch": 0.6583131917058899, + "ewc_loss": 0.00689011299982667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.890112854307517e-05, + "grad_norm": 3.566270112991333, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.880871057510376, + "num_tokens": 197534643.0, + "step": 5175 + }, + { + "epoch": 0.6584404019844804, + "ewc_loss": 0.00697603914886713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.976039003347978e-05, + "grad_norm": 3.4474759101867676, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8689465522766113, + "num_tokens": 197576490.0, + "step": 5176 + }, + { + "epoch": 0.6585676122630708, + "ewc_loss": 0.006866051349788904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.866051262477413e-05, + "grad_norm": 3.493870496749878, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8779827952384949, + "num_tokens": 197612745.0, + "step": 5177 + }, + { + "epoch": 0.6586948225416613, + "ewc_loss": 0.006935946177691221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.935946294106543e-05, + "grad_norm": 3.5167834758758545, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8595613837242126, + "num_tokens": 197653756.0, + "step": 5178 + }, + { + "epoch": 0.6588220328202519, + "ewc_loss": 0.0069219437427818775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.921943713678047e-05, + "grad_norm": 3.461857557296753, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8876371383666992, + "num_tokens": 197694417.0, + "step": 5179 + }, + { + "epoch": 
0.6589492430988424, + "ewc_loss": 0.006884746253490448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.884746107971296e-05, + "grad_norm": 3.5283405780792236, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.858837366104126, + "num_tokens": 197732738.0, + "step": 5180 + }, + { + "epoch": 0.6590764533774329, + "ewc_loss": 0.00693309772759676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.93309775670059e-05, + "grad_norm": 3.6453990936279297, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.854973316192627, + "num_tokens": 197759408.0, + "step": 5181 + }, + { + "epoch": 0.6592036636560235, + "ewc_loss": 0.006985736545175314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.985736399656162e-05, + "grad_norm": 3.5165750980377197, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8662004470825195, + "num_tokens": 197794334.0, + "step": 5182 + }, + { + "epoch": 0.6593308739346139, + "ewc_loss": 0.006879638880491257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.8796391133219e-05, + "grad_norm": 3.4725985527038574, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8713808059692383, + "num_tokens": 197832497.0, + "step": 5183 + }, + { + "epoch": 0.6594580842132044, + "ewc_loss": 0.0069106463342905045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.910646334290504e-05, + "grad_norm": 3.5344159603118896, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.856566846370697, + "num_tokens": 197869021.0, + "step": 5184 + }, + { + "epoch": 0.6595852944917949, + "ewc_loss": 0.006962892133742571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.96289207553491e-05, + "grad_norm": 3.5234460830688477, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8500955104827881, + "num_tokens": 197906365.0, + "step": 5185 + }, + { + "epoch": 0.6597125047703855, + "ewc_loss": 0.006932035554200411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.93203546688892e-05, + "grad_norm": 3.411961555480957, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8660608530044556, + "num_tokens": 197950725.0, + "step": 5186 + }, + { + "epoch": 0.659839715048976, + "ewc_loss": 0.006884218659251928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.884218601044267e-05, + "grad_norm": 3.4864118099212646, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8709312677383423, + "num_tokens": 197991178.0, + "step": 5187 + }, + { + "epoch": 0.6599669253275665, + "ewc_loss": 0.006958645302802324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.958645099075511e-05, + "grad_norm": 3.476062536239624, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8561202883720398, + "num_tokens": 198030176.0, + "step": 5188 + }, + { + "epoch": 0.6600941356061569, + "ewc_loss": 0.006934862118214369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.93486217642203e-05, + "grad_norm": 3.5237882137298584, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8594709634780884, + "num_tokens": 198067169.0, + "step": 5189 + }, + { + "epoch": 0.6602213458847475, + "ewc_loss": 0.006942331790924072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.94233167450875e-05, + "grad_norm": 3.4899773597717285, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8544265031814575, + "num_tokens": 198104994.0, + "step": 5190 + }, + { + "epoch": 0.660348556163338, + "ewc_loss": 0.00692748511210084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.927485082997009e-05, + "grad_norm": 3.5029821395874023, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8682761192321777, + "num_tokens": 198142791.0, + "step": 5191 + }, + { + "epoch": 0.6604757664419285, + "ewc_loss": 0.006954291369765997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.954291166039184e-05, + "grad_norm": 3.5045745372772217, + "learning_rate": 1e-06, + "loss": 0.406, + 
"mean_token_accuracy": 0.8615754246711731, + "num_tokens": 198179300.0, + "step": 5192 + }, + { + "epoch": 0.660602976720519, + "ewc_loss": 0.006924671586602926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.924671470187604e-05, + "grad_norm": 3.4698266983032227, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8772687315940857, + "num_tokens": 198220230.0, + "step": 5193 + }, + { + "epoch": 0.6607301869991096, + "ewc_loss": 0.006920505780726671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.920505984453484e-05, + "grad_norm": 3.715073585510254, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8520188927650452, + "num_tokens": 198251067.0, + "step": 5194 + }, + { + "epoch": 0.6608573972777, + "ewc_loss": 0.007060654927045107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.060654752422124e-05, + "grad_norm": 3.53391695022583, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8731174468994141, + "num_tokens": 198282130.0, + "step": 5195 + }, + { + "epoch": 0.6609846075562905, + "ewc_loss": 0.006879823282361031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.879823195049539e-05, + "grad_norm": 3.415649652481079, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8735603094100952, + "num_tokens": 198325609.0, + "step": 5196 + }, + { + "epoch": 0.661111817834881, + "ewc_loss": 0.006881919223815203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.881919398438185e-05, + "grad_norm": 3.493126630783081, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8675048351287842, + "num_tokens": 198365116.0, + "step": 5197 + }, + { + "epoch": 0.6612390281134716, + "ewc_loss": 0.006962676532566547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.96267670718953e-05, + "grad_norm": 3.4988996982574463, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8646746873855591, + "num_tokens": 198403467.0, + "step": 5198 + }, + { + "epoch": 0.6613662383920621, 
+ "ewc_loss": 0.006919943727552891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.919943552929908e-05, + "grad_norm": 3.4540181159973145, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8708032965660095, + "num_tokens": 198440676.0, + "step": 5199 + }, + { + "epoch": 0.6614934486706526, + "ewc_loss": 0.0069063580594956875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.906357884872705e-05, + "grad_norm": 3.492741107940674, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8642346858978271, + "num_tokens": 198479082.0, + "step": 5200 + }, + { + "epoch": 0.661620658949243, + "ewc_loss": 0.006932512857019901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.93251276970841e-05, + "grad_norm": 3.4564425945281982, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8609835505485535, + "num_tokens": 198521741.0, + "step": 5201 + }, + { + "epoch": 0.6617478692278336, + "ewc_loss": 0.006905706599354744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.905706686666235e-05, + "grad_norm": 3.530353307723999, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8583083152770996, + "num_tokens": 198562177.0, + "step": 5202 + }, + { + "epoch": 0.6618750795064241, + "ewc_loss": 0.0069467658177018166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.946765643078834e-05, + "grad_norm": 3.50901460647583, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8651699423789978, + "num_tokens": 198600726.0, + "step": 5203 + }, + { + "epoch": 0.6620022897850146, + "ewc_loss": 0.006907779723405838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.907779606990516e-05, + "grad_norm": 3.607362985610962, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8350526690483093, + "num_tokens": 198638156.0, + "step": 5204 + }, + { + "epoch": 0.6621295000636052, + "ewc_loss": 0.0069668907672166824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.966890941839665e-05, + 
"grad_norm": 3.5116567611694336, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8455097675323486, + "num_tokens": 198680312.0, + "step": 5205 + }, + { + "epoch": 0.6622567103421957, + "ewc_loss": 0.006867536809295416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.867537013022229e-05, + "grad_norm": 3.4512546062469482, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8816054463386536, + "num_tokens": 198723886.0, + "step": 5206 + }, + { + "epoch": 0.6623839206207861, + "ewc_loss": 0.006864593829959631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.864593888167292e-05, + "grad_norm": 3.4811763763427734, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8647972345352173, + "num_tokens": 198765331.0, + "step": 5207 + }, + { + "epoch": 0.6625111308993766, + "ewc_loss": 0.006904421839863062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.90442175255157e-05, + "grad_norm": 3.492427110671997, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8686984777450562, + "num_tokens": 198802421.0, + "step": 5208 + }, + { + "epoch": 0.6626383411779672, + "ewc_loss": 0.006884453818202019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.884453614475206e-05, + "grad_norm": 3.4453203678131104, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8737632632255554, + "num_tokens": 198843335.0, + "step": 5209 + }, + { + "epoch": 0.6627655514565577, + "ewc_loss": 0.006846404168754816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.846403994131833e-05, + "grad_norm": 3.4929537773132324, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8699937462806702, + "num_tokens": 198880925.0, + "step": 5210 + }, + { + "epoch": 0.6628927617351482, + "ewc_loss": 0.006899768020957708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.899768050061539e-05, + "grad_norm": 3.5726590156555176, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 
0.8642769455909729, + "num_tokens": 198915448.0, + "step": 5211 + }, + { + "epoch": 0.6630199720137387, + "ewc_loss": 0.006909884512424469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.9098845415283e-05, + "grad_norm": 3.460371971130371, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8734575510025024, + "num_tokens": 198952254.0, + "step": 5212 + }, + { + "epoch": 0.6631471822923293, + "ewc_loss": 0.006825303193181753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.82530298945494e-05, + "grad_norm": 3.541130781173706, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8687517642974854, + "num_tokens": 198990841.0, + "step": 5213 + }, + { + "epoch": 0.6632743925709197, + "ewc_loss": 0.006901905871927738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.901905726408586e-05, + "grad_norm": 3.4756786823272705, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8670557141304016, + "num_tokens": 199029383.0, + "step": 5214 + }, + { + "epoch": 0.6634016028495102, + "ewc_loss": 0.006846750155091286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.846750329714268e-05, + "grad_norm": 3.545811414718628, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8618907928466797, + "num_tokens": 199066731.0, + "step": 5215 + }, + { + "epoch": 0.6635288131281007, + "ewc_loss": 0.0069000814110040665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.90008164383471e-05, + "grad_norm": 3.4765849113464355, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.875786542892456, + "num_tokens": 199108752.0, + "step": 5216 + }, + { + "epoch": 0.6636560234066913, + "ewc_loss": 0.006839219015091658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.839218985987827e-05, + "grad_norm": 3.5249783992767334, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8681045174598694, + "num_tokens": 199146438.0, + "step": 5217 + }, + { + "epoch": 0.6637832336852818, + "ewc_loss": 
0.006882890127599239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.882890011183918e-05, + "grad_norm": 3.536956548690796, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.865861177444458, + "num_tokens": 199186367.0, + "step": 5218 + }, + { + "epoch": 0.6639104439638723, + "ewc_loss": 0.0068745482712984085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.874548125779256e-05, + "grad_norm": 3.493393659591675, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8780443072319031, + "num_tokens": 199225785.0, + "step": 5219 + }, + { + "epoch": 0.6640376542424627, + "ewc_loss": 0.0068326531909406185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.832653161836788e-05, + "grad_norm": 3.4868924617767334, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8680406808853149, + "num_tokens": 199262574.0, + "step": 5220 + }, + { + "epoch": 0.6641648645210533, + "ewc_loss": 0.006847475655376911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.847475742688403e-05, + "grad_norm": 3.4987096786499023, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8699758052825928, + "num_tokens": 199299093.0, + "step": 5221 + }, + { + "epoch": 0.6642920747996438, + "ewc_loss": 0.006855782121419907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.855781975900754e-05, + "grad_norm": 3.5021812915802, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8695917129516602, + "num_tokens": 199337446.0, + "step": 5222 + }, + { + "epoch": 0.6644192850782343, + "ewc_loss": 0.006849842611700296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.849842611700296e-05, + "grad_norm": 3.4702718257904053, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8669355511665344, + "num_tokens": 199377321.0, + "step": 5223 + }, + { + "epoch": 0.6645464953568249, + "ewc_loss": 0.006827165838330984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.827165634604171e-05, + "grad_norm": 
3.4970970153808594, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8754650354385376, + "num_tokens": 199413057.0, + "step": 5224 + }, + { + "epoch": 0.6646737056354154, + "ewc_loss": 0.006858102045953274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.858102278783917e-05, + "grad_norm": 3.504413604736328, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8718211650848389, + "num_tokens": 199452730.0, + "step": 5225 + }, + { + "epoch": 0.6648009159140058, + "ewc_loss": 0.006856495048850775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.856495019746944e-05, + "grad_norm": 3.4990921020507812, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8748552203178406, + "num_tokens": 199487860.0, + "step": 5226 + }, + { + "epoch": 0.6649281261925963, + "ewc_loss": 0.006850836332887411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.850836507510394e-05, + "grad_norm": 3.4483354091644287, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.871849536895752, + "num_tokens": 199532790.0, + "step": 5227 + }, + { + "epoch": 0.6650553364711869, + "ewc_loss": 0.006804972421377897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.804972508689389e-05, + "grad_norm": 3.5519604682922363, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8836860656738281, + "num_tokens": 199561072.0, + "step": 5228 + }, + { + "epoch": 0.6651825467497774, + "ewc_loss": 0.006898053921759129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.898053834447637e-05, + "grad_norm": 3.521912097930908, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8706913590431213, + "num_tokens": 199595820.0, + "step": 5229 + }, + { + "epoch": 0.6653097570283679, + "ewc_loss": 0.006850889418274164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.850889622000977e-05, + "grad_norm": 3.4596903324127197, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8525949716567993, + 
"num_tokens": 199640101.0, + "step": 5230 + }, + { + "epoch": 0.6654369673069584, + "ewc_loss": 0.006824973504990339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.824973388575017e-05, + "grad_norm": 3.509427547454834, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8608799576759338, + "num_tokens": 199680287.0, + "step": 5231 + }, + { + "epoch": 0.6655641775855489, + "ewc_loss": 0.00687076011672616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.870760262245312e-05, + "grad_norm": 3.4913477897644043, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8645261526107788, + "num_tokens": 199721958.0, + "step": 5232 + }, + { + "epoch": 0.6656913878641394, + "ewc_loss": 0.006836381275206804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.836381362518296e-05, + "grad_norm": 3.516676902770996, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8722785115242004, + "num_tokens": 199756021.0, + "step": 5233 + }, + { + "epoch": 0.6658185981427299, + "ewc_loss": 0.006865887437015772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.865887553431094e-05, + "grad_norm": 3.520965099334717, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8694740533828735, + "num_tokens": 199793586.0, + "step": 5234 + }, + { + "epoch": 0.6659458084213205, + "ewc_loss": 0.006853827740997076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.853827653685585e-05, + "grad_norm": 3.484769582748413, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8517231345176697, + "num_tokens": 199835728.0, + "step": 5235 + }, + { + "epoch": 0.666073018699911, + "ewc_loss": 0.006839935667812824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.839935667812824e-05, + "grad_norm": 3.4716856479644775, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.874627947807312, + "num_tokens": 199873348.0, + "step": 5236 + }, + { + "epoch": 0.6662002289785015, + "ewc_loss": 0.006837519351392984, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.837519322289154e-05, + "grad_norm": 3.5450432300567627, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8646721839904785, + "num_tokens": 199908223.0, + "step": 5237 + }, + { + "epoch": 0.6663274392570919, + "ewc_loss": 0.006887916941195726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.887916970299557e-05, + "grad_norm": 3.4945290088653564, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8623218536376953, + "num_tokens": 199947017.0, + "step": 5238 + }, + { + "epoch": 0.6664546495356825, + "ewc_loss": 0.006823638454079628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.823638250352815e-05, + "grad_norm": 3.52862811088562, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8733530640602112, + "num_tokens": 199984806.0, + "step": 5239 + }, + { + "epoch": 0.666581859814273, + "ewc_loss": 0.006865373346954584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.865373143227771e-05, + "grad_norm": 3.514939785003662, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8597647547721863, + "num_tokens": 200022600.0, + "step": 5240 + }, + { + "epoch": 0.6667090700928635, + "ewc_loss": 0.006853115279227495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.853115337435156e-05, + "grad_norm": 3.50429368019104, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8763520121574402, + "num_tokens": 200058407.0, + "step": 5241 + }, + { + "epoch": 0.666836280371454, + "ewc_loss": 0.006840869318693876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.840869173174724e-05, + "grad_norm": 3.54447603225708, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8378723859786987, + "num_tokens": 200101871.0, + "step": 5242 + }, + { + "epoch": 0.6669634906500446, + "ewc_loss": 0.006873413920402527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.873413803987205e-05, + "grad_norm": 3.5192461013793945, + "learning_rate": 
1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8661303520202637, + "num_tokens": 200138501.0, + "step": 5243 + }, + { + "epoch": 0.667090700928635, + "ewc_loss": 0.0068485853262245655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.848585326224566e-05, + "grad_norm": 3.4978668689727783, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8633745312690735, + "num_tokens": 200176639.0, + "step": 5244 + }, + { + "epoch": 0.6672179112072255, + "ewc_loss": 0.0068389358930289745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.838935951236635e-05, + "grad_norm": 3.5505502223968506, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.860199511051178, + "num_tokens": 200211873.0, + "step": 5245 + }, + { + "epoch": 0.667345121485816, + "ewc_loss": 0.0069012208841741085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.901221058797091e-05, + "grad_norm": 3.590930938720703, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8614640235900879, + "num_tokens": 200246596.0, + "step": 5246 + }, + { + "epoch": 0.6674723317644066, + "ewc_loss": 0.006902819033712149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.902818859089166e-05, + "grad_norm": 3.491481304168701, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8813644647598267, + "num_tokens": 200282761.0, + "step": 5247 + }, + { + "epoch": 0.6675995420429971, + "ewc_loss": 0.006851085461676121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.851085345260799e-05, + "grad_norm": 3.5125160217285156, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8508058786392212, + "num_tokens": 200321821.0, + "step": 5248 + }, + { + "epoch": 0.6677267523215876, + "ewc_loss": 0.006918127648532391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.918127473909408e-05, + "grad_norm": 3.4865670204162598, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8487728238105774, + "num_tokens": 200369066.0, + "step": 5249 + }, 
+ { + "epoch": 0.667853962600178, + "ewc_loss": 0.006884088274091482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.884088361402974e-05, + "grad_norm": 3.498765230178833, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8588970303535461, + "num_tokens": 200408067.0, + "step": 5250 + }, + { + "epoch": 0.6679811728787686, + "ewc_loss": 0.006895575672388077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.895575643284246e-05, + "grad_norm": 3.466388702392578, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8822859525680542, + "num_tokens": 200446693.0, + "step": 5251 + }, + { + "epoch": 0.6681083831573591, + "ewc_loss": 0.006884739734232426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.884739559609443e-05, + "grad_norm": 3.5337252616882324, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.84627366065979, + "num_tokens": 200485340.0, + "step": 5252 + }, + { + "epoch": 0.6682355934359496, + "ewc_loss": 0.006940687075257301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.940687308087945e-05, + "grad_norm": 3.5318686962127686, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8615430593490601, + "num_tokens": 200521978.0, + "step": 5253 + }, + { + "epoch": 0.6683628037145402, + "ewc_loss": 0.006923323031514883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.923323235241696e-05, + "grad_norm": 3.5056166648864746, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8674556016921997, + "num_tokens": 200560604.0, + "step": 5254 + }, + { + "epoch": 0.6684900139931307, + "ewc_loss": 0.006917949300259352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.91794921294786e-05, + "grad_norm": 3.4852068424224854, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8604280948638916, + "num_tokens": 200601718.0, + "step": 5255 + }, + { + "epoch": 0.6686172242717211, + "ewc_loss": 0.006913715973496437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.913716060807928e-05, + "grad_norm": 3.467926263809204, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8398505449295044, + "num_tokens": 200646186.0, + "step": 5256 + }, + { + "epoch": 0.6687444345503116, + "ewc_loss": 0.006920278072357178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.92027824698016e-05, + "grad_norm": 3.5004563331604004, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8677511811256409, + "num_tokens": 200684849.0, + "step": 5257 + }, + { + "epoch": 0.6688716448289022, + "ewc_loss": 0.0069443779066205025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.944377673789859e-05, + "grad_norm": 3.5432686805725098, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.870954692363739, + "num_tokens": 200719542.0, + "step": 5258 + }, + { + "epoch": 0.6689988551074927, + "ewc_loss": 0.006940385326743126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.940385355846956e-05, + "grad_norm": 3.472053289413452, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.865446925163269, + "num_tokens": 200756212.0, + "step": 5259 + }, + { + "epoch": 0.6691260653860832, + "ewc_loss": 0.006890638265758753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.890638178447261e-05, + "grad_norm": 3.5289509296417236, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8678704500198364, + "num_tokens": 200791184.0, + "step": 5260 + }, + { + "epoch": 0.6692532756646737, + "ewc_loss": 0.006960287224501371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.960287282709032e-05, + "grad_norm": 3.5782508850097656, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8478825092315674, + "num_tokens": 200823966.0, + "step": 5261 + }, + { + "epoch": 0.6693804859432643, + "ewc_loss": 0.006963872350752354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.96387214702554e-05, + "grad_norm": 3.5015666484832764, + "learning_rate": 1e-06, + "loss": 0.393, + 
"mean_token_accuracy": 0.8692725300788879, + "num_tokens": 200862718.0, + "step": 5262 + }, + { + "epoch": 0.6695076962218547, + "ewc_loss": 0.006907949224114418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.907949136802927e-05, + "grad_norm": 3.4897735118865967, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8597038388252258, + "num_tokens": 200901755.0, + "step": 5263 + }, + { + "epoch": 0.6696349065004452, + "ewc_loss": 0.006933425087481737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.933425174793229e-05, + "grad_norm": 3.503192186355591, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8760644197463989, + "num_tokens": 200935788.0, + "step": 5264 + }, + { + "epoch": 0.6697621167790357, + "ewc_loss": 0.006939398590475321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.939398735994473e-05, + "grad_norm": 3.475181818008423, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8793061971664429, + "num_tokens": 200975159.0, + "step": 5265 + }, + { + "epoch": 0.6698893270576263, + "ewc_loss": 0.00691957026720047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.9195702963043e-05, + "grad_norm": 3.53271222114563, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8756605982780457, + "num_tokens": 201010813.0, + "step": 5266 + }, + { + "epoch": 0.6700165373362168, + "ewc_loss": 0.006966643501073122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.966643559280783e-05, + "grad_norm": 3.510751724243164, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8627327084541321, + "num_tokens": 201051937.0, + "step": 5267 + }, + { + "epoch": 0.6701437476148073, + "ewc_loss": 0.006923459470272064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.923459295649081e-05, + "grad_norm": 3.4688210487365723, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8704087734222412, + "num_tokens": 201090926.0, + "step": 5268 + }, + { + "epoch": 0.6702709578933977, 
+ "ewc_loss": 0.006900092586874962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.900092557771131e-05, + "grad_norm": 3.5772643089294434, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8459302186965942, + "num_tokens": 201126086.0, + "step": 5269 + }, + { + "epoch": 0.6703981681719883, + "ewc_loss": 0.0069849565625190735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.984956416999921e-05, + "grad_norm": 3.515127420425415, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8544042110443115, + "num_tokens": 201165933.0, + "step": 5270 + }, + { + "epoch": 0.6705253784505788, + "ewc_loss": 0.006903066299855709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.903066241648048e-05, + "grad_norm": 3.5298357009887695, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8677010536193848, + "num_tokens": 201198794.0, + "step": 5271 + }, + { + "epoch": 0.6706525887291693, + "ewc_loss": 0.006938762962818146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.938762817298993e-05, + "grad_norm": 3.5420916080474854, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8563541173934937, + "num_tokens": 201234081.0, + "step": 5272 + }, + { + "epoch": 0.6707797990077599, + "ewc_loss": 0.006935033947229385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.935033889021724e-05, + "grad_norm": 3.5311717987060547, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8610105514526367, + "num_tokens": 201271796.0, + "step": 5273 + }, + { + "epoch": 0.6709070092863504, + "ewc_loss": 0.006919563747942448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.919563747942448e-05, + "grad_norm": 3.53940749168396, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8463650941848755, + "num_tokens": 201309526.0, + "step": 5274 + }, + { + "epoch": 0.6710342195649408, + "ewc_loss": 0.0069456398487091064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.94564005243592e-05, + 
"grad_norm": 3.4997239112854004, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8709602952003479, + "num_tokens": 201344015.0, + "step": 5275 + }, + { + "epoch": 0.6711614298435313, + "ewc_loss": 0.006928196642547846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.928196671651676e-05, + "grad_norm": 3.448117256164551, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8659201860427856, + "num_tokens": 201385201.0, + "step": 5276 + }, + { + "epoch": 0.6712886401221219, + "ewc_loss": 0.006913826800882816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.913826655363664e-05, + "grad_norm": 3.5084035396575928, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8697322607040405, + "num_tokens": 201422370.0, + "step": 5277 + }, + { + "epoch": 0.6714158504007124, + "ewc_loss": 0.006963746156543493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.963746272958815e-05, + "grad_norm": 3.481182098388672, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8497357964515686, + "num_tokens": 201467085.0, + "step": 5278 + }, + { + "epoch": 0.6715430606793029, + "ewc_loss": 0.006922885309904814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.922885222593322e-05, + "grad_norm": 3.50801944732666, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8799806833267212, + "num_tokens": 201503432.0, + "step": 5279 + }, + { + "epoch": 0.6716702709578934, + "ewc_loss": 0.006948706693947315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.948706868570298e-05, + "grad_norm": 3.481935977935791, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8727911114692688, + "num_tokens": 201541358.0, + "step": 5280 + }, + { + "epoch": 0.6717974812364839, + "ewc_loss": 0.006917064543813467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.917064456501976e-05, + "grad_norm": 3.493678092956543, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8616318106651306, + 
"num_tokens": 201580235.0, + "step": 5281 + }, + { + "epoch": 0.6719246915150744, + "ewc_loss": 0.006919709965586662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.919709994690493e-05, + "grad_norm": 3.4159810543060303, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8643370866775513, + "num_tokens": 201627703.0, + "step": 5282 + }, + { + "epoch": 0.6720519017936649, + "ewc_loss": 0.0068849362432956696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.884936010465026e-05, + "grad_norm": 3.5215134620666504, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8680930137634277, + "num_tokens": 201667182.0, + "step": 5283 + }, + { + "epoch": 0.6721791120722554, + "ewc_loss": 0.006968287285417318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.968287198105827e-05, + "grad_norm": 3.542869806289673, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8664393424987793, + "num_tokens": 201699876.0, + "step": 5284 + }, + { + "epoch": 0.672306322350846, + "ewc_loss": 0.006919604726135731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.919604493305087e-05, + "grad_norm": 3.603337526321411, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8558807373046875, + "num_tokens": 201732770.0, + "step": 5285 + }, + { + "epoch": 0.6724335326294365, + "ewc_loss": 0.006960144732147455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.960144673939794e-05, + "grad_norm": 3.4725124835968018, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8674888014793396, + "num_tokens": 201771388.0, + "step": 5286 + }, + { + "epoch": 0.6725607429080269, + "ewc_loss": 0.006875303573906422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.875303370179608e-05, + "grad_norm": 3.4993295669555664, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8599095940589905, + "num_tokens": 201809283.0, + "step": 5287 + }, + { + "epoch": 0.6726879531866174, + "ewc_loss": 0.006937704049050808, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.93770416546613e-05, + "grad_norm": 3.439192771911621, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8716152906417847, + "num_tokens": 201854950.0, + "step": 5288 + }, + { + "epoch": 0.672815163465208, + "ewc_loss": 0.006887813098728657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.887812924105674e-05, + "grad_norm": 3.482611894607544, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8678340911865234, + "num_tokens": 201895638.0, + "step": 5289 + }, + { + "epoch": 0.6729423737437985, + "ewc_loss": 0.0069269901141524315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.926990317879245e-05, + "grad_norm": 3.5690653324127197, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8517600893974304, + "num_tokens": 201928243.0, + "step": 5290 + }, + { + "epoch": 0.673069584022389, + "ewc_loss": 0.006976383738219738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.97638388373889e-05, + "grad_norm": 3.5353164672851562, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8650004267692566, + "num_tokens": 201964687.0, + "step": 5291 + }, + { + "epoch": 0.6731967943009796, + "ewc_loss": 0.006925900932401419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.925901107024401e-05, + "grad_norm": 3.490061044692993, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8783713579177856, + "num_tokens": 201997166.0, + "step": 5292 + }, + { + "epoch": 0.67332400457957, + "ewc_loss": 0.006913414224982262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.91341410856694e-05, + "grad_norm": 3.552466630935669, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8662216067314148, + "num_tokens": 202030771.0, + "step": 5293 + }, + { + "epoch": 0.6734512148581605, + "ewc_loss": 0.0069706495851278305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.970649701543152e-05, + "grad_norm": 3.446251153945923, + "learning_rate": 
1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8563601970672607, + "num_tokens": 202077280.0, + "step": 5294 + }, + { + "epoch": 0.673578425136751, + "ewc_loss": 0.0069035557098686695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.903555913595483e-05, + "grad_norm": 3.550252914428711, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8564547896385193, + "num_tokens": 202115273.0, + "step": 5295 + }, + { + "epoch": 0.6737056354153416, + "ewc_loss": 0.006996041163802147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.99604133842513e-05, + "grad_norm": 3.590118885040283, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8740493059158325, + "num_tokens": 202145347.0, + "step": 5296 + }, + { + "epoch": 0.6738328456939321, + "ewc_loss": 0.006987297907471657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.987297820160165e-05, + "grad_norm": 3.479802370071411, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.869209349155426, + "num_tokens": 202184621.0, + "step": 5297 + }, + { + "epoch": 0.6739600559725226, + "ewc_loss": 0.006908542010933161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.908542127348483e-05, + "grad_norm": 3.41521954536438, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8675469756126404, + "num_tokens": 202230099.0, + "step": 5298 + }, + { + "epoch": 0.674087266251113, + "ewc_loss": 0.006901984103024006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.901984306750819e-05, + "grad_norm": 3.518455982208252, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8592678308486938, + "num_tokens": 202267872.0, + "step": 5299 + }, + { + "epoch": 0.6742144765297036, + "ewc_loss": 0.006991440895944834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.991440750425681e-05, + "grad_norm": 3.4712302684783936, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.863062858581543, + "num_tokens": 202305517.0, + "step": 5300 + }, + { + 
"epoch": 0.6743416868082941, + "ewc_loss": 0.006910600699484348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.910600495757535e-05, + "grad_norm": 3.547797203063965, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8508341312408447, + "num_tokens": 202340671.0, + "step": 5301 + }, + { + "epoch": 0.6744688970868846, + "ewc_loss": 0.006972765550017357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.972765550017357e-05, + "grad_norm": 3.5170531272888184, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8722769021987915, + "num_tokens": 202379552.0, + "step": 5302 + }, + { + "epoch": 0.6745961073654752, + "ewc_loss": 0.006921899970620871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.921900057932362e-05, + "grad_norm": 3.4482367038726807, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8601679801940918, + "num_tokens": 202421404.0, + "step": 5303 + }, + { + "epoch": 0.6747233176440657, + "ewc_loss": 0.006904349196702242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.904348992975429e-05, + "grad_norm": 3.524902105331421, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8505412340164185, + "num_tokens": 202463765.0, + "step": 5304 + }, + { + "epoch": 0.6748505279226561, + "ewc_loss": 0.0069584413431584835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.958441372262314e-05, + "grad_norm": 3.454294443130493, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8599413633346558, + "num_tokens": 202505872.0, + "step": 5305 + }, + { + "epoch": 0.6749777382012466, + "ewc_loss": 0.006895560305565596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.895560363773257e-05, + "grad_norm": 3.475632429122925, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8627156615257263, + "num_tokens": 202544085.0, + "step": 5306 + }, + { + "epoch": 0.6751049484798372, + "ewc_loss": 0.006928333546966314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.928333459654823e-05, + "grad_norm": 3.477888345718384, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8596448302268982, + "num_tokens": 202588237.0, + "step": 5307 + }, + { + "epoch": 0.6752321587584277, + "ewc_loss": 0.006915419362485409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.915419362485409e-05, + "grad_norm": 3.4973199367523193, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8576943874359131, + "num_tokens": 202628462.0, + "step": 5308 + }, + { + "epoch": 0.6753593690370182, + "ewc_loss": 0.006920620799064636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.920620944583789e-05, + "grad_norm": 3.4792325496673584, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8665359020233154, + "num_tokens": 202668149.0, + "step": 5309 + }, + { + "epoch": 0.6754865793156087, + "ewc_loss": 0.006905538495630026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.905538612045348e-05, + "grad_norm": 3.4388155937194824, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.880138635635376, + "num_tokens": 202708651.0, + "step": 5310 + }, + { + "epoch": 0.6756137895941993, + "ewc_loss": 0.006873491685837507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.873491656733677e-05, + "grad_norm": 3.5784714221954346, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8696372509002686, + "num_tokens": 202738613.0, + "step": 5311 + }, + { + "epoch": 0.6757409998727897, + "ewc_loss": 0.006962708663195372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.962708721403033e-05, + "grad_norm": 3.4920668601989746, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8606921434402466, + "num_tokens": 202779279.0, + "step": 5312 + }, + { + "epoch": 0.6758682101513802, + "ewc_loss": 0.006873169913887978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.87317005940713e-05, + "grad_norm": 3.5610649585723877, + "learning_rate": 1e-06, + "loss": 0.3868, + 
"mean_token_accuracy": 0.8673459887504578, + "num_tokens": 202812095.0, + "step": 5313 + }, + { + "epoch": 0.6759954204299707, + "ewc_loss": 0.006966391112655401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.966391083551571e-05, + "grad_norm": 3.460134506225586, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8602402210235596, + "num_tokens": 202855705.0, + "step": 5314 + }, + { + "epoch": 0.6761226307085613, + "ewc_loss": 0.006873208098113537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.873207894386724e-05, + "grad_norm": 3.5211548805236816, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8489745855331421, + "num_tokens": 202894180.0, + "step": 5315 + }, + { + "epoch": 0.6762498409871518, + "ewc_loss": 0.006935684941709042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.935685087228194e-05, + "grad_norm": 3.5739586353302, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8721528053283691, + "num_tokens": 202925842.0, + "step": 5316 + }, + { + "epoch": 0.6763770512657423, + "ewc_loss": 0.006947984918951988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.947985093574971e-05, + "grad_norm": 3.5719614028930664, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.869591236114502, + "num_tokens": 202959103.0, + "step": 5317 + }, + { + "epoch": 0.6765042615443327, + "ewc_loss": 0.006925011053681374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.925011257408187e-05, + "grad_norm": 3.451986312866211, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8642299175262451, + "num_tokens": 203000393.0, + "step": 5318 + }, + { + "epoch": 0.6766314718229233, + "ewc_loss": 0.006873472593724728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.87347273924388e-05, + "grad_norm": 3.495209217071533, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8646188378334045, + "num_tokens": 203041613.0, + "step": 5319 + }, + { + "epoch": 
0.6767586821015138, + "ewc_loss": 0.00693882629275322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.938826118130237e-05, + "grad_norm": 3.5119524002075195, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8800356388092041, + "num_tokens": 203073744.0, + "step": 5320 + }, + { + "epoch": 0.6768858923801043, + "ewc_loss": 0.006934859324246645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.934859266038984e-05, + "grad_norm": 3.5305142402648926, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8507310152053833, + "num_tokens": 203108350.0, + "step": 5321 + }, + { + "epoch": 0.6770131026586949, + "ewc_loss": 0.006943243555724621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.943243351997808e-05, + "grad_norm": 3.5866734981536865, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.868277370929718, + "num_tokens": 203139149.0, + "step": 5322 + }, + { + "epoch": 0.6771403129372854, + "ewc_loss": 0.006977397482842207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.977397424634546e-05, + "grad_norm": 3.558910608291626, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8483489155769348, + "num_tokens": 203173953.0, + "step": 5323 + }, + { + "epoch": 0.6772675232158758, + "ewc_loss": 0.006947145331650972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.947145448066294e-05, + "grad_norm": 3.5536749362945557, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8754376769065857, + "num_tokens": 203208915.0, + "step": 5324 + }, + { + "epoch": 0.6773947334944663, + "ewc_loss": 0.006960025988519192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.960026075830683e-05, + "grad_norm": 3.5036826133728027, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8529258966445923, + "num_tokens": 203251906.0, + "step": 5325 + }, + { + "epoch": 0.6775219437730569, + "ewc_loss": 0.006942402105778456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.942402251297608e-05, + "grad_norm": 3.517159938812256, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8633036017417908, + "num_tokens": 203289460.0, + "step": 5326 + }, + { + "epoch": 0.6776491540516474, + "ewc_loss": 0.006963157095015049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.963156920392066e-05, + "grad_norm": 3.474897623062134, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.873679518699646, + "num_tokens": 203325647.0, + "step": 5327 + }, + { + "epoch": 0.6777763643302379, + "ewc_loss": 0.006942312233150005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.942312029423192e-05, + "grad_norm": 3.473318099975586, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8602526187896729, + "num_tokens": 203364972.0, + "step": 5328 + }, + { + "epoch": 0.6779035746088284, + "ewc_loss": 0.006942258216440678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.942258187336847e-05, + "grad_norm": 3.471112012863159, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8653641939163208, + "num_tokens": 203407778.0, + "step": 5329 + }, + { + "epoch": 0.6780307848874189, + "ewc_loss": 0.006943935994058847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.943936023162678e-05, + "grad_norm": 3.498199224472046, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8652136921882629, + "num_tokens": 203443820.0, + "step": 5330 + }, + { + "epoch": 0.6781579951660094, + "ewc_loss": 0.006952324416488409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.95232447469607e-05, + "grad_norm": 3.5498316287994385, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.870500385761261, + "num_tokens": 203475742.0, + "step": 5331 + }, + { + "epoch": 0.6782852054445999, + "ewc_loss": 0.0069764843210577965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.976484291953966e-05, + "grad_norm": 3.5433053970336914, + "learning_rate": 1e-06, + "loss": 0.4839, + 
"mean_token_accuracy": 0.8391349911689758, + "num_tokens": 203512617.0, + "step": 5332 + }, + { + "epoch": 0.6784124157231904, + "ewc_loss": 0.00696696899831295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.966968794586137e-05, + "grad_norm": 3.549271583557129, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.849958062171936, + "num_tokens": 203546778.0, + "step": 5333 + }, + { + "epoch": 0.678539626001781, + "ewc_loss": 0.006974020507186651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.974020652705804e-05, + "grad_norm": 3.507880687713623, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8667402267456055, + "num_tokens": 203585222.0, + "step": 5334 + }, + { + "epoch": 0.6786668362803715, + "ewc_loss": 0.006955140270292759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.955140270292759e-05, + "grad_norm": 3.4966843128204346, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8605265617370605, + "num_tokens": 203627014.0, + "step": 5335 + }, + { + "epoch": 0.6787940465589619, + "ewc_loss": 0.006945411674678326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.945411587366834e-05, + "grad_norm": 3.532151937484741, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.870434045791626, + "num_tokens": 203659189.0, + "step": 5336 + }, + { + "epoch": 0.6789212568375524, + "ewc_loss": 0.006975357886403799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.97535797371529e-05, + "grad_norm": 3.522587299346924, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8759160041809082, + "num_tokens": 203695355.0, + "step": 5337 + }, + { + "epoch": 0.679048467116143, + "ewc_loss": 0.006968556437641382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.968556408537552e-05, + "grad_norm": 3.4929254055023193, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8728066086769104, + "num_tokens": 203733446.0, + "step": 5338 + }, + { + "epoch": 
0.6791756773947335, + "ewc_loss": 0.006951625924557447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.951625982765108e-05, + "grad_norm": 3.5018155574798584, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8474001884460449, + "num_tokens": 203775387.0, + "step": 5339 + }, + { + "epoch": 0.679302887673324, + "ewc_loss": 0.006976012606173754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.976012809900567e-05, + "grad_norm": 3.525372266769409, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8604462742805481, + "num_tokens": 203810713.0, + "step": 5340 + }, + { + "epoch": 0.6794300979519146, + "ewc_loss": 0.006965854205191135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.965854117879644e-05, + "grad_norm": 3.4993367195129395, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8776133060455322, + "num_tokens": 203847144.0, + "step": 5341 + }, + { + "epoch": 0.679557308230505, + "ewc_loss": 0.006945834960788488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.94583504809998e-05, + "grad_norm": 3.4486329555511475, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.869743824005127, + "num_tokens": 203889716.0, + "step": 5342 + }, + { + "epoch": 0.6796845185090955, + "ewc_loss": 0.006934645585715771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.934645352885127e-05, + "grad_norm": 3.529341697692871, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8517762422561646, + "num_tokens": 203927692.0, + "step": 5343 + }, + { + "epoch": 0.679811728787686, + "ewc_loss": 0.00698093930259347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.98093936080113e-05, + "grad_norm": 3.566833734512329, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8526057004928589, + "num_tokens": 203960180.0, + "step": 5344 + }, + { + "epoch": 0.6799389390662766, + "ewc_loss": 0.006982111372053623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.982111517572775e-05, 
+ "grad_norm": 3.474797248840332, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8752224445343018, + "num_tokens": 203997375.0, + "step": 5345 + }, + { + "epoch": 0.6800661493448671, + "ewc_loss": 0.006926005240529776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.926005153218284e-05, + "grad_norm": 3.5116140842437744, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.860776424407959, + "num_tokens": 204037588.0, + "step": 5346 + }, + { + "epoch": 0.6801933596234576, + "ewc_loss": 0.006982062943279743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.98206276865676e-05, + "grad_norm": 3.463094472885132, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8579056262969971, + "num_tokens": 204080993.0, + "step": 5347 + }, + { + "epoch": 0.680320569902048, + "ewc_loss": 0.006926706526428461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.926706555532292e-05, + "grad_norm": 3.5466370582580566, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8673280477523804, + "num_tokens": 204116863.0, + "step": 5348 + }, + { + "epoch": 0.6804477801806386, + "ewc_loss": 0.006982150953263044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.982150807743892e-05, + "grad_norm": 3.500852108001709, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8594318628311157, + "num_tokens": 204156551.0, + "step": 5349 + }, + { + "epoch": 0.6805749904592291, + "ewc_loss": 0.0069172740913927555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.917274004081264e-05, + "grad_norm": 3.505722761154175, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8578965663909912, + "num_tokens": 204196210.0, + "step": 5350 + }, + { + "epoch": 0.6807022007378196, + "ewc_loss": 0.0069454386830329895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.945438508410007e-05, + "grad_norm": 3.5387117862701416, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 
0.8631996512413025, + "num_tokens": 204234791.0, + "step": 5351 + }, + { + "epoch": 0.6808294110164101, + "ewc_loss": 0.006953426171094179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.953426054678857e-05, + "grad_norm": 3.51029634475708, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8690609931945801, + "num_tokens": 204271631.0, + "step": 5352 + }, + { + "epoch": 0.6809566212950007, + "ewc_loss": 0.006913797464221716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.913797551533207e-05, + "grad_norm": 3.4657347202301025, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8747680187225342, + "num_tokens": 204311879.0, + "step": 5353 + }, + { + "epoch": 0.6810838315735911, + "ewc_loss": 0.006899690721184015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.899690924910828e-05, + "grad_norm": 3.473027229309082, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.867691159248352, + "num_tokens": 204357123.0, + "step": 5354 + }, + { + "epoch": 0.6812110418521816, + "ewc_loss": 0.0069177402183413506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.917740392964333e-05, + "grad_norm": 3.5141665935516357, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8625801801681519, + "num_tokens": 204397196.0, + "step": 5355 + }, + { + "epoch": 0.6813382521307721, + "ewc_loss": 0.006910914089530706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.910914089530706e-05, + "grad_norm": 3.551999568939209, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.856391429901123, + "num_tokens": 204434654.0, + "step": 5356 + }, + { + "epoch": 0.6814654624093627, + "ewc_loss": 0.006906955968588591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.90695596858859e-05, + "grad_norm": 3.5737357139587402, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8864438533782959, + "num_tokens": 204466581.0, + "step": 5357 + }, + { + "epoch": 0.6815926726879532, + "ewc_loss": 
0.006907439790666103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.907439819769934e-05, + "grad_norm": 3.512183666229248, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8771849274635315, + "num_tokens": 204503890.0, + "step": 5358 + }, + { + "epoch": 0.6817198829665437, + "ewc_loss": 0.006858771666884422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.858771666884422e-05, + "grad_norm": 3.5762410163879395, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8603912591934204, + "num_tokens": 204537160.0, + "step": 5359 + }, + { + "epoch": 0.6818470932451343, + "ewc_loss": 0.006927381735295057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.927381764398888e-05, + "grad_norm": 3.5364317893981934, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8758236169815063, + "num_tokens": 204570077.0, + "step": 5360 + }, + { + "epoch": 0.6819743035237247, + "ewc_loss": 0.006888458505272865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.888458301546052e-05, + "grad_norm": 3.506391763687134, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8803256154060364, + "num_tokens": 204603638.0, + "step": 5361 + }, + { + "epoch": 0.6821015138023152, + "ewc_loss": 0.006898112595081329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.898112769704312e-05, + "grad_norm": 3.50179386138916, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8612188696861267, + "num_tokens": 204644584.0, + "step": 5362 + }, + { + "epoch": 0.6822287240809057, + "ewc_loss": 0.006899471394717693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.89947119099088e-05, + "grad_norm": 3.5590858459472656, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8527319431304932, + "num_tokens": 204678459.0, + "step": 5363 + }, + { + "epoch": 0.6823559343594963, + "ewc_loss": 0.006959962658584118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.95996277499944e-05, + "grad_norm": 
3.572707414627075, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8620566129684448, + "num_tokens": 204712923.0, + "step": 5364 + }, + { + "epoch": 0.6824831446380868, + "ewc_loss": 0.006946887820959091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.946887879166752e-05, + "grad_norm": 3.513583183288574, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8484830856323242, + "num_tokens": 204751303.0, + "step": 5365 + }, + { + "epoch": 0.6826103549166773, + "ewc_loss": 0.00692196749150753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.921967724338174e-05, + "grad_norm": 3.4883525371551514, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8647388219833374, + "num_tokens": 204791685.0, + "step": 5366 + }, + { + "epoch": 0.6827375651952677, + "ewc_loss": 0.006930152419954538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.930152449058369e-05, + "grad_norm": 3.4838273525238037, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8664940595626831, + "num_tokens": 204827748.0, + "step": 5367 + }, + { + "epoch": 0.6828647754738583, + "ewc_loss": 0.006942528299987316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.942528125364333e-05, + "grad_norm": 3.4739303588867188, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8804730772972107, + "num_tokens": 204866568.0, + "step": 5368 + }, + { + "epoch": 0.6829919857524488, + "ewc_loss": 0.0069424984976649284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.942498293938115e-05, + "grad_norm": 3.5431675910949707, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8616964817047119, + "num_tokens": 204908334.0, + "step": 5369 + }, + { + "epoch": 0.6831191960310393, + "ewc_loss": 0.006994466297328472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.994466093601659e-05, + "grad_norm": 3.5056347846984863, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8680343627929688, + 
"num_tokens": 204949929.0, + "step": 5370 + }, + { + "epoch": 0.6832464063096299, + "ewc_loss": 0.006928222719579935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.928222865099087e-05, + "grad_norm": 3.4976589679718018, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8550668954849243, + "num_tokens": 204991435.0, + "step": 5371 + }, + { + "epoch": 0.6833736165882204, + "ewc_loss": 0.006945247761905193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.945247878320515e-05, + "grad_norm": 3.5677542686462402, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8713093996047974, + "num_tokens": 205028434.0, + "step": 5372 + }, + { + "epoch": 0.6835008268668108, + "ewc_loss": 0.006972855888307095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.972855771891773e-05, + "grad_norm": 3.504936933517456, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.860451340675354, + "num_tokens": 205064138.0, + "step": 5373 + }, + { + "epoch": 0.6836280371454013, + "ewc_loss": 0.00691647594794631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.916475831530988e-05, + "grad_norm": 3.512364149093628, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8594975471496582, + "num_tokens": 205104058.0, + "step": 5374 + }, + { + "epoch": 0.6837552474239919, + "ewc_loss": 0.006945667788386345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.945667701074854e-05, + "grad_norm": 3.5133373737335205, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8630396127700806, + "num_tokens": 205142609.0, + "step": 5375 + }, + { + "epoch": 0.6838824577025824, + "ewc_loss": 0.006932539865374565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.932539690751582e-05, + "grad_norm": 3.5083231925964355, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8750066757202148, + "num_tokens": 205180407.0, + "step": 5376 + }, + { + "epoch": 0.6840096679811729, + "ewc_loss": 0.006929878145456314, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.929878145456314e-05, + "grad_norm": 3.4873015880584717, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.886284351348877, + "num_tokens": 205217340.0, + "step": 5377 + }, + { + "epoch": 0.6841368782597634, + "ewc_loss": 0.006917028222233057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.917028076713905e-05, + "grad_norm": 3.484360694885254, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8530123233795166, + "num_tokens": 205263671.0, + "step": 5378 + }, + { + "epoch": 0.6842640885383539, + "ewc_loss": 0.00691286101937294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.912861135788262e-05, + "grad_norm": 3.5004031658172607, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.86324143409729, + "num_tokens": 205302970.0, + "step": 5379 + }, + { + "epoch": 0.6843912988169444, + "ewc_loss": 0.00691642053425312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.91642053425312e-05, + "grad_norm": 3.5265815258026123, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8695847988128662, + "num_tokens": 205342247.0, + "step": 5380 + }, + { + "epoch": 0.6845185090955349, + "ewc_loss": 0.006925602909177542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.92560279276222e-05, + "grad_norm": 3.609116554260254, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8509552478790283, + "num_tokens": 205374897.0, + "step": 5381 + }, + { + "epoch": 0.6846457193741254, + "ewc_loss": 0.006969478912651539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.96947899996303e-05, + "grad_norm": 3.577453851699829, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8617263436317444, + "num_tokens": 205411645.0, + "step": 5382 + }, + { + "epoch": 0.684772929652716, + "ewc_loss": 0.006911550182849169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.911550008226186e-05, + "grad_norm": 3.5164473056793213, + "learning_rate": 1e-06, 
+ "loss": 0.3955, + "mean_token_accuracy": 0.8643293380737305, + "num_tokens": 205450972.0, + "step": 5383 + }, + { + "epoch": 0.6849001399313065, + "ewc_loss": 0.0069067515432834625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.906751514179632e-05, + "grad_norm": 3.4594695568084717, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8665695190429688, + "num_tokens": 205494314.0, + "step": 5384 + }, + { + "epoch": 0.6850273502098969, + "ewc_loss": 0.006888052448630333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.88805230311118e-05, + "grad_norm": 3.488804340362549, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8521392345428467, + "num_tokens": 205534696.0, + "step": 5385 + }, + { + "epoch": 0.6851545604884874, + "ewc_loss": 0.006920524872839451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.920524901943281e-05, + "grad_norm": 3.5340206623077393, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.876166820526123, + "num_tokens": 205575579.0, + "step": 5386 + }, + { + "epoch": 0.685281770767078, + "ewc_loss": 0.006918583530932665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.918583676451817e-05, + "grad_norm": 3.4970269203186035, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8691765666007996, + "num_tokens": 205609249.0, + "step": 5387 + }, + { + "epoch": 0.6854089810456685, + "ewc_loss": 0.006884204689413309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.8842047767248e-05, + "grad_norm": 3.549071788787842, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8617449998855591, + "num_tokens": 205644135.0, + "step": 5388 + }, + { + "epoch": 0.685536191324259, + "ewc_loss": 0.006937320809811354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.937320722499862e-05, + "grad_norm": 3.515023708343506, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8584225177764893, + "num_tokens": 205685312.0, + "step": 5389 + }, + { + 
"epoch": 0.6856634016028496, + "ewc_loss": 0.006903557572513819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.903557368787006e-05, + "grad_norm": 3.5657622814178467, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8593704104423523, + "num_tokens": 205720525.0, + "step": 5390 + }, + { + "epoch": 0.68579061188144, + "ewc_loss": 0.006952251773327589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.952251715119928e-05, + "grad_norm": 3.4920296669006348, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8456981778144836, + "num_tokens": 205763358.0, + "step": 5391 + }, + { + "epoch": 0.6859178221600305, + "ewc_loss": 0.006886007729917765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.886007759021595e-05, + "grad_norm": 3.514221429824829, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.868725061416626, + "num_tokens": 205801413.0, + "step": 5392 + }, + { + "epoch": 0.686045032438621, + "ewc_loss": 0.006952227558940649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.952227704459801e-05, + "grad_norm": 3.512017250061035, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8658758401870728, + "num_tokens": 205840442.0, + "step": 5393 + }, + { + "epoch": 0.6861722427172116, + "ewc_loss": 0.006943211425095797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.943211337784305e-05, + "grad_norm": 3.505319595336914, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8773258924484253, + "num_tokens": 205877319.0, + "step": 5394 + }, + { + "epoch": 0.6862994529958021, + "ewc_loss": 0.006938356906175613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.938356818864122e-05, + "grad_norm": 3.5172500610351562, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8614602088928223, + "num_tokens": 205916186.0, + "step": 5395 + }, + { + "epoch": 0.6864266632743926, + "ewc_loss": 0.006951976101845503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.95197595632635e-05, + "grad_norm": 3.462557554244995, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8553686141967773, + "num_tokens": 205956336.0, + "step": 5396 + }, + { + "epoch": 0.686553873552983, + "ewc_loss": 0.006916633807122707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.916633719811216e-05, + "grad_norm": 3.5710067749023438, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8634200096130371, + "num_tokens": 205989570.0, + "step": 5397 + }, + { + "epoch": 0.6866810838315736, + "ewc_loss": 0.006994159892201424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.994159775786102e-05, + "grad_norm": 3.503859519958496, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8572138547897339, + "num_tokens": 206026139.0, + "step": 5398 + }, + { + "epoch": 0.6868082941101641, + "ewc_loss": 0.006921322550624609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.921322346897796e-05, + "grad_norm": 3.5015945434570312, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8596971035003662, + "num_tokens": 206070059.0, + "step": 5399 + }, + { + "epoch": 0.6869355043887546, + "ewc_loss": 0.006952395662665367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.952395779080689e-05, + "grad_norm": 3.5522384643554688, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8681423664093018, + "num_tokens": 206104645.0, + "step": 5400 + }, + { + "epoch": 0.6870627146673451, + "ewc_loss": 0.006977525539696217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.977525481488556e-05, + "grad_norm": 3.893735885620117, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8621137738227844, + "num_tokens": 206141220.0, + "step": 5401 + }, + { + "epoch": 0.6871899249459357, + "ewc_loss": 0.0071602631360292435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.160263339756057e-05, + "grad_norm": 3.489579200744629, + "learning_rate": 1e-06, + "loss": 0.4161, + 
"mean_token_accuracy": 0.8616529703140259, + "num_tokens": 206177917.0, + "step": 5402 + }, + { + "epoch": 0.6873171352245261, + "ewc_loss": 0.006835225038230419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.835225212853402e-05, + "grad_norm": 3.5099666118621826, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8630689978599548, + "num_tokens": 206213419.0, + "step": 5403 + }, + { + "epoch": 0.6874443455031166, + "ewc_loss": 0.007003021892160177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.003021892160177e-05, + "grad_norm": 3.561478614807129, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8482699990272522, + "num_tokens": 206252421.0, + "step": 5404 + }, + { + "epoch": 0.6875715557817071, + "ewc_loss": 0.007001799531280994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.001799531280994e-05, + "grad_norm": 3.5170528888702393, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8687633275985718, + "num_tokens": 206287195.0, + "step": 5405 + }, + { + "epoch": 0.6876987660602977, + "ewc_loss": 0.006949287373572588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.94928748998791e-05, + "grad_norm": 3.532477378845215, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.844323456287384, + "num_tokens": 206328339.0, + "step": 5406 + }, + { + "epoch": 0.6878259763388882, + "ewc_loss": 0.0069936164654791355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.993616261752322e-05, + "grad_norm": 3.529871702194214, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8502353429794312, + "num_tokens": 206369523.0, + "step": 5407 + }, + { + "epoch": 0.6879531866174787, + "ewc_loss": 0.006979642901569605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.979642785154283e-05, + "grad_norm": 3.505232095718384, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8836060762405396, + "num_tokens": 206402864.0, + "step": 5408 + }, + { + "epoch": 
0.6880803968960693, + "ewc_loss": 0.006979746278375387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.979746103752404e-05, + "grad_norm": 3.554825782775879, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8535383939743042, + "num_tokens": 206439004.0, + "step": 5409 + }, + { + "epoch": 0.6882076071746597, + "ewc_loss": 0.007005780003964901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.005780207691714e-05, + "grad_norm": 3.5416462421417236, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8650864362716675, + "num_tokens": 206472320.0, + "step": 5410 + }, + { + "epoch": 0.6883348174532502, + "ewc_loss": 0.006979134399443865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.979134195717052e-05, + "grad_norm": 3.4616150856018066, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8557596802711487, + "num_tokens": 206518649.0, + "step": 5411 + }, + { + "epoch": 0.6884620277318407, + "ewc_loss": 0.006948405876755714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.948405643925071e-05, + "grad_norm": 3.6291322708129883, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.878516674041748, + "num_tokens": 206551318.0, + "step": 5412 + }, + { + "epoch": 0.6885892380104313, + "ewc_loss": 0.007066503167152405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.066503167152405e-05, + "grad_norm": 3.504072427749634, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8524640798568726, + "num_tokens": 206588173.0, + "step": 5413 + }, + { + "epoch": 0.6887164482890218, + "ewc_loss": 0.006930622272193432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.930622475920245e-05, + "grad_norm": 3.566801071166992, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8639222383499146, + "num_tokens": 206624013.0, + "step": 5414 + }, + { + "epoch": 0.6888436585676123, + "ewc_loss": 0.007010193075984716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.010193075984716e-05, + "grad_norm": 3.4663803577423096, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8710592985153198, + "num_tokens": 206668180.0, + "step": 5415 + }, + { + "epoch": 0.6889708688462027, + "ewc_loss": 0.00692763552069664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.927635695319623e-05, + "grad_norm": 3.5202786922454834, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8596477508544922, + "num_tokens": 206710619.0, + "step": 5416 + }, + { + "epoch": 0.6890980791247933, + "ewc_loss": 0.006994114723056555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.994114664848894e-05, + "grad_norm": 3.4981021881103516, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8682581782341003, + "num_tokens": 206750121.0, + "step": 5417 + }, + { + "epoch": 0.6892252894033838, + "ewc_loss": 0.006941211875528097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.941211904631928e-05, + "grad_norm": 3.4873135089874268, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8768424391746521, + "num_tokens": 206789538.0, + "step": 5418 + }, + { + "epoch": 0.6893524996819743, + "ewc_loss": 0.006950714159756899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.950714305276051e-05, + "grad_norm": 3.5118274688720703, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8645596504211426, + "num_tokens": 206827513.0, + "step": 5419 + }, + { + "epoch": 0.6894797099605648, + "ewc_loss": 0.0069432384334504604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.943238258827478e-05, + "grad_norm": 3.535978317260742, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8711000680923462, + "num_tokens": 206866563.0, + "step": 5420 + }, + { + "epoch": 0.6896069202391554, + "ewc_loss": 0.006949517410248518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.949517410248518e-05, + "grad_norm": 3.498220920562744, + "learning_rate": 1e-06, + "loss": 0.3677, + 
"mean_token_accuracy": 0.8742122054100037, + "num_tokens": 206906281.0, + "step": 5421 + }, + { + "epoch": 0.6897341305177458, + "ewc_loss": 0.006920294836163521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.920294981682673e-05, + "grad_norm": 3.5801260471343994, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8457757234573364, + "num_tokens": 206944055.0, + "step": 5422 + }, + { + "epoch": 0.6898613407963363, + "ewc_loss": 0.006966037675738335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.966037472011521e-05, + "grad_norm": 3.5168356895446777, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8649387955665588, + "num_tokens": 206984421.0, + "step": 5423 + }, + { + "epoch": 0.6899885510749268, + "ewc_loss": 0.006906041409820318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.906041380716488e-05, + "grad_norm": 3.556302785873413, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8534458875656128, + "num_tokens": 207020319.0, + "step": 5424 + }, + { + "epoch": 0.6901157613535174, + "ewc_loss": 0.0069470154121518135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.947015208425e-05, + "grad_norm": 3.502307891845703, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8684567809104919, + "num_tokens": 207061487.0, + "step": 5425 + }, + { + "epoch": 0.6902429716321079, + "ewc_loss": 0.006904995068907738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.904995098011568e-05, + "grad_norm": 3.5167441368103027, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8661282062530518, + "num_tokens": 207098343.0, + "step": 5426 + }, + { + "epoch": 0.6903701819106984, + "ewc_loss": 0.006923612207174301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.923612090758979e-05, + "grad_norm": 3.48992919921875, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8599008917808533, + "num_tokens": 207140381.0, + "step": 5427 + }, + { + "epoch": 
0.6904973921892888, + "ewc_loss": 0.006906409747898579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.906409544171765e-05, + "grad_norm": 3.5419254302978516, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8771467804908752, + "num_tokens": 207173923.0, + "step": 5428 + }, + { + "epoch": 0.6906246024678794, + "ewc_loss": 0.006951008457690477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.951008253963664e-05, + "grad_norm": 3.4911482334136963, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8723649978637695, + "num_tokens": 207214840.0, + "step": 5429 + }, + { + "epoch": 0.6907518127464699, + "ewc_loss": 0.0068784914910793304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.878491694806144e-05, + "grad_norm": 3.560730457305908, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8634214401245117, + "num_tokens": 207247850.0, + "step": 5430 + }, + { + "epoch": 0.6908790230250604, + "ewc_loss": 0.006966335698962212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.966335786273703e-05, + "grad_norm": 3.5044729709625244, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.86449134349823, + "num_tokens": 207286698.0, + "step": 5431 + }, + { + "epoch": 0.691006233303651, + "ewc_loss": 0.006885961163789034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.885961192892864e-05, + "grad_norm": 3.520613431930542, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.860546350479126, + "num_tokens": 207322470.0, + "step": 5432 + }, + { + "epoch": 0.6911334435822415, + "ewc_loss": 0.006926341913640499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.926342030055821e-05, + "grad_norm": 3.50523042678833, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8512923717498779, + "num_tokens": 207361517.0, + "step": 5433 + }, + { + "epoch": 0.6912606538608319, + "ewc_loss": 0.006911520846188068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.911520904395729e-05, + "grad_norm": 3.5764994621276855, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8726388216018677, + "num_tokens": 207394504.0, + "step": 5434 + }, + { + "epoch": 0.6913878641394224, + "ewc_loss": 0.006973570212721825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.973570270929486e-05, + "grad_norm": 3.5645081996917725, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8475750684738159, + "num_tokens": 207430148.0, + "step": 5435 + }, + { + "epoch": 0.691515074418013, + "ewc_loss": 0.0069284988567233086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.928498623892665e-05, + "grad_norm": 3.5906732082366943, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8628517389297485, + "num_tokens": 207461094.0, + "step": 5436 + }, + { + "epoch": 0.6916422846966035, + "ewc_loss": 0.0069653005339205265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.965300417505205e-05, + "grad_norm": 3.4883499145507812, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8710086345672607, + "num_tokens": 207500572.0, + "step": 5437 + }, + { + "epoch": 0.691769494975194, + "ewc_loss": 0.006914966739714146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.914966797921807e-05, + "grad_norm": 3.5819196701049805, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8566213846206665, + "num_tokens": 207534977.0, + "step": 5438 + }, + { + "epoch": 0.6918967052537845, + "ewc_loss": 0.007012514863163233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.012514834059402e-05, + "grad_norm": 3.5192601680755615, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8696592450141907, + "num_tokens": 207569809.0, + "step": 5439 + }, + { + "epoch": 0.692023915532375, + "ewc_loss": 0.006936008110642433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.936008139746264e-05, + "grad_norm": 3.542902708053589, + "learning_rate": 1e-06, + "loss": 0.3926, + 
"mean_token_accuracy": 0.8653709292411804, + "num_tokens": 207606785.0, + "step": 5440 + }, + { + "epoch": 0.6921511258109655, + "ewc_loss": 0.0069916523061692715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.991652480792254e-05, + "grad_norm": 3.5158865451812744, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.862433671951294, + "num_tokens": 207648344.0, + "step": 5441 + }, + { + "epoch": 0.692278336089556, + "ewc_loss": 0.006973279174417257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.973279232624918e-05, + "grad_norm": 3.5377120971679688, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8735243082046509, + "num_tokens": 207680962.0, + "step": 5442 + }, + { + "epoch": 0.6924055463681466, + "ewc_loss": 0.007008088752627373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.008088869042695e-05, + "grad_norm": 3.5487470626831055, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.897268533706665, + "num_tokens": 207709162.0, + "step": 5443 + }, + { + "epoch": 0.6925327566467371, + "ewc_loss": 0.007006351370364428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.006351370364428e-05, + "grad_norm": 3.5478696823120117, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.860849142074585, + "num_tokens": 207746089.0, + "step": 5444 + }, + { + "epoch": 0.6926599669253276, + "ewc_loss": 0.007013087626546621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.013087451923639e-05, + "grad_norm": 3.489851474761963, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8749349117279053, + "num_tokens": 207791235.0, + "step": 5445 + }, + { + "epoch": 0.692787177203918, + "ewc_loss": 0.006970384158194065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.970384129090235e-05, + "grad_norm": 3.6301894187927246, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8556537628173828, + "num_tokens": 207824527.0, + "step": 5446 + }, + { + "epoch": 
0.6929143874825086, + "ewc_loss": 0.007072188425809145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.072188600432128e-05, + "grad_norm": 3.513835906982422, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8742179870605469, + "num_tokens": 207863535.0, + "step": 5447 + }, + { + "epoch": 0.6930415977610991, + "ewc_loss": 0.00695710489526391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.957104778848588e-05, + "grad_norm": 3.5327816009521484, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8588407039642334, + "num_tokens": 207899694.0, + "step": 5448 + }, + { + "epoch": 0.6931688080396896, + "ewc_loss": 0.007028199732303619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.028199615888298e-05, + "grad_norm": 3.5704870223999023, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8549646139144897, + "num_tokens": 207931440.0, + "step": 5449 + }, + { + "epoch": 0.6932960183182801, + "ewc_loss": 0.0070287855342030525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.02878533047624e-05, + "grad_norm": 3.5406312942504883, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8633813858032227, + "num_tokens": 207963758.0, + "step": 5450 + }, + { + "epoch": 0.6934232285968707, + "ewc_loss": 0.00700346939265728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.00346936355345e-05, + "grad_norm": 3.491312026977539, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8683017492294312, + "num_tokens": 208002383.0, + "step": 5451 + }, + { + "epoch": 0.6935504388754611, + "ewc_loss": 0.007008696440607309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.008696411503479e-05, + "grad_norm": 3.5250511169433594, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.869025468826294, + "num_tokens": 208038723.0, + "step": 5452 + }, + { + "epoch": 0.6936776491540516, + "ewc_loss": 0.0070339045487344265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.033904694253579e-05, + "grad_norm": 3.5346522331237793, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.869909405708313, + "num_tokens": 208071851.0, + "step": 5453 + }, + { + "epoch": 0.6938048594326421, + "ewc_loss": 0.007024987135082483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.024987280601636e-05, + "grad_norm": 3.556004762649536, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8468306064605713, + "num_tokens": 208106916.0, + "step": 5454 + }, + { + "epoch": 0.6939320697112327, + "ewc_loss": 0.007039211690425873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.039211777737364e-05, + "grad_norm": 3.4957289695739746, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.874830961227417, + "num_tokens": 208146883.0, + "step": 5455 + }, + { + "epoch": 0.6940592799898232, + "ewc_loss": 0.006993826013058424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.993825809331611e-05, + "grad_norm": 3.523367404937744, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.867128849029541, + "num_tokens": 208187364.0, + "step": 5456 + }, + { + "epoch": 0.6941864902684137, + "ewc_loss": 0.007018572650849819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.018572796368971e-05, + "grad_norm": 3.514202356338501, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8719483613967896, + "num_tokens": 208226087.0, + "step": 5457 + }, + { + "epoch": 0.6943137005470043, + "ewc_loss": 0.006995826028287411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.99582597007975e-05, + "grad_norm": 3.5215625762939453, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.862636148929596, + "num_tokens": 208265193.0, + "step": 5458 + }, + { + "epoch": 0.6944409108255947, + "ewc_loss": 0.006993257440626621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.993257557041943e-05, + "grad_norm": 3.469583034515381, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 
0.8786535263061523, + "num_tokens": 208305163.0, + "step": 5459 + }, + { + "epoch": 0.6945681211041852, + "ewc_loss": 0.006964385975152254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.964385829633102e-05, + "grad_norm": 3.558088779449463, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8534637689590454, + "num_tokens": 208342229.0, + "step": 5460 + }, + { + "epoch": 0.6946953313827757, + "ewc_loss": 0.007030731067061424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.030730921542272e-05, + "grad_norm": 3.4966344833374023, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8552919030189514, + "num_tokens": 208384606.0, + "step": 5461 + }, + { + "epoch": 0.6948225416613663, + "ewc_loss": 0.006953035481274128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.953035335754976e-05, + "grad_norm": 3.5522279739379883, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8831520080566406, + "num_tokens": 208416777.0, + "step": 5462 + }, + { + "epoch": 0.6949497519399568, + "ewc_loss": 0.006993804592639208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.99380470905453e-05, + "grad_norm": 3.521923780441284, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8495621681213379, + "num_tokens": 208456101.0, + "step": 5463 + }, + { + "epoch": 0.6950769622185473, + "ewc_loss": 0.006955737713724375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.955737626412883e-05, + "grad_norm": 3.5884580612182617, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8746829628944397, + "num_tokens": 208489201.0, + "step": 5464 + }, + { + "epoch": 0.6952041724971377, + "ewc_loss": 0.007001670077443123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.001670019235462e-05, + "grad_norm": 3.495016574859619, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8794611692428589, + "num_tokens": 208525006.0, + "step": 5465 + }, + { + "epoch": 0.6953313827757283, + "ewc_loss": 
0.00692648533731699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.92648536642082e-05, + "grad_norm": 3.4621503353118896, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8625319600105286, + "num_tokens": 208568997.0, + "step": 5466 + }, + { + "epoch": 0.6954585930543188, + "ewc_loss": 0.006944021675735712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.944021879462525e-05, + "grad_norm": 3.576172351837158, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8567994832992554, + "num_tokens": 208605925.0, + "step": 5467 + }, + { + "epoch": 0.6955858033329093, + "ewc_loss": 0.006998362485319376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.998362368904054e-05, + "grad_norm": 3.5463383197784424, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8394544124603271, + "num_tokens": 208646290.0, + "step": 5468 + }, + { + "epoch": 0.6957130136114998, + "ewc_loss": 0.006940936669707298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.940936873434111e-05, + "grad_norm": 3.5004165172576904, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.865929901599884, + "num_tokens": 208686424.0, + "step": 5469 + }, + { + "epoch": 0.6958402238900904, + "ewc_loss": 0.00692778779193759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.927787762833759e-05, + "grad_norm": 3.5826549530029297, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8667947053909302, + "num_tokens": 208719643.0, + "step": 5470 + }, + { + "epoch": 0.6959674341686808, + "ewc_loss": 0.006991347763687372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.99134761816822e-05, + "grad_norm": 3.53899884223938, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8487116098403931, + "num_tokens": 208760804.0, + "step": 5471 + }, + { + "epoch": 0.6960946444472713, + "ewc_loss": 0.006926883943378925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.926884088898078e-05, + "grad_norm": 
3.476302146911621, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.859350860118866, + "num_tokens": 208803022.0, + "step": 5472 + }, + { + "epoch": 0.6962218547258618, + "ewc_loss": 0.006919811945408583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.919811858097091e-05, + "grad_norm": 3.5067715644836426, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8644105195999146, + "num_tokens": 208844205.0, + "step": 5473 + }, + { + "epoch": 0.6963490650044524, + "ewc_loss": 0.006934212986379862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.934213161002845e-05, + "grad_norm": 3.451700448989868, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8750272393226624, + "num_tokens": 208886882.0, + "step": 5474 + }, + { + "epoch": 0.6964762752830429, + "ewc_loss": 0.006887851282954216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.887851486681029e-05, + "grad_norm": 3.534576654434204, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.86122727394104, + "num_tokens": 208926649.0, + "step": 5475 + }, + { + "epoch": 0.6966034855616334, + "ewc_loss": 0.006969323381781578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.969323294470087e-05, + "grad_norm": 3.504770517349243, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8590571284294128, + "num_tokens": 208963403.0, + "step": 5476 + }, + { + "epoch": 0.6967306958402238, + "ewc_loss": 0.006922262255102396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.922262400621548e-05, + "grad_norm": 3.5468084812164307, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.85736483335495, + "num_tokens": 209002475.0, + "step": 5477 + }, + { + "epoch": 0.6968579061188144, + "ewc_loss": 0.006950289476662874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.950289389351383e-05, + "grad_norm": 3.4581995010375977, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.865390419960022, + "num_tokens": 
209044216.0, + "step": 5478 + }, + { + "epoch": 0.6969851163974049, + "ewc_loss": 0.006887094117701054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.887094059493393e-05, + "grad_norm": 3.5429158210754395, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8559365272521973, + "num_tokens": 209080774.0, + "step": 5479 + }, + { + "epoch": 0.6971123266759954, + "ewc_loss": 0.006978107616305351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.97810755809769e-05, + "grad_norm": 3.531536340713501, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8699108362197876, + "num_tokens": 209117167.0, + "step": 5480 + }, + { + "epoch": 0.697239536954586, + "ewc_loss": 0.006932801101356745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.932800897629932e-05, + "grad_norm": 3.6509087085723877, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8599230647087097, + "num_tokens": 209148872.0, + "step": 5481 + }, + { + "epoch": 0.6973667472331765, + "ewc_loss": 0.007017417345196009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.017417374299839e-05, + "grad_norm": 3.504119396209717, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8672939538955688, + "num_tokens": 209187912.0, + "step": 5482 + }, + { + "epoch": 0.6974939575117669, + "ewc_loss": 0.006894318386912346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.894318357808515e-05, + "grad_norm": 3.49660325050354, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8710086345672607, + "num_tokens": 209227055.0, + "step": 5483 + }, + { + "epoch": 0.6976211677903574, + "ewc_loss": 0.0069515882059931755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.951588147785515e-05, + "grad_norm": 3.5770838260650635, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8787167072296143, + "num_tokens": 209255343.0, + "step": 5484 + }, + { + "epoch": 0.697748378068948, + "ewc_loss": 0.006994439288973808, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 6.994439172558486e-05, + "grad_norm": 3.524656295776367, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8579451441764832, + "num_tokens": 209294621.0, + "step": 5485 + }, + { + "epoch": 0.6978755883475385, + "ewc_loss": 0.006948759313672781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.94875925546512e-05, + "grad_norm": 3.468479633331299, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.880207359790802, + "num_tokens": 209336519.0, + "step": 5486 + }, + { + "epoch": 0.698002798626129, + "ewc_loss": 0.006946431007236242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.946430949028581e-05, + "grad_norm": 3.5626089572906494, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8622642755508423, + "num_tokens": 209372913.0, + "step": 5487 + }, + { + "epoch": 0.6981300089047195, + "ewc_loss": 0.007016325369477272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.01632525306195e-05, + "grad_norm": 3.500995397567749, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8563560843467712, + "num_tokens": 209417768.0, + "step": 5488 + }, + { + "epoch": 0.69825721918331, + "ewc_loss": 0.006947120185941458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.947119982214645e-05, + "grad_norm": 3.577497959136963, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8690708875656128, + "num_tokens": 209451221.0, + "step": 5489 + }, + { + "epoch": 0.6983844294619005, + "ewc_loss": 0.007021159399300814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.021159399300814e-05, + "grad_norm": 3.584069013595581, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8621687889099121, + "num_tokens": 209488710.0, + "step": 5490 + }, + { + "epoch": 0.698511639740491, + "ewc_loss": 0.007001944351941347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.001944322837517e-05, + "grad_norm": 3.545569658279419, + "learning_rate": 1e-06, + "loss": 
0.3969, + "mean_token_accuracy": 0.8615568280220032, + "num_tokens": 209525166.0, + "step": 5491 + }, + { + "epoch": 0.6986388500190815, + "ewc_loss": 0.006975919473916292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.975919677643105e-05, + "grad_norm": 3.4806320667266846, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8662959337234497, + "num_tokens": 209564804.0, + "step": 5492 + }, + { + "epoch": 0.6987660602976721, + "ewc_loss": 0.006958582438528538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.958582525840029e-05, + "grad_norm": 3.503016471862793, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8678909540176392, + "num_tokens": 209607155.0, + "step": 5493 + }, + { + "epoch": 0.6988932705762626, + "ewc_loss": 0.006981069687753916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.981069600442424e-05, + "grad_norm": 3.570706367492676, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8533197641372681, + "num_tokens": 209641226.0, + "step": 5494 + }, + { + "epoch": 0.699020480854853, + "ewc_loss": 0.007009439170360565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.009439286775887e-05, + "grad_norm": 3.5015814304351807, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.86326003074646, + "num_tokens": 209679180.0, + "step": 5495 + }, + { + "epoch": 0.6991476911334435, + "ewc_loss": 0.00696039292961359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.960392784094438e-05, + "grad_norm": 3.553539514541626, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8564202785491943, + "num_tokens": 209716412.0, + "step": 5496 + }, + { + "epoch": 0.6992749014120341, + "ewc_loss": 0.007010207511484623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.010207627899945e-05, + "grad_norm": 3.541607618331909, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.856227695941925, + "num_tokens": 209755697.0, + "step": 5497 + }, + { + "epoch": 
0.6994021116906246, + "ewc_loss": 0.00700340885668993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.003408973105252e-05, + "grad_norm": 3.5009422302246094, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8719231486320496, + "num_tokens": 209793654.0, + "step": 5498 + }, + { + "epoch": 0.6995293219692151, + "ewc_loss": 0.006971392780542374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.97139257681556e-05, + "grad_norm": 3.5380821228027344, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8634434938430786, + "num_tokens": 209830703.0, + "step": 5499 + }, + { + "epoch": 0.6996565322478057, + "ewc_loss": 0.007016576826572418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.016577001195401e-05, + "grad_norm": 3.531832456588745, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8648254871368408, + "num_tokens": 209868994.0, + "step": 5500 + }, + { + "epoch": 0.6997837425263961, + "ewc_loss": 0.006983362138271332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.983362254686654e-05, + "grad_norm": 3.507042646408081, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8736706972122192, + "num_tokens": 209909605.0, + "step": 5501 + }, + { + "epoch": 0.6999109528049866, + "ewc_loss": 0.0069709522649645805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.970952381379902e-05, + "grad_norm": 3.5354044437408447, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8714863061904907, + "num_tokens": 209945196.0, + "step": 5502 + }, + { + "epoch": 0.7000381630835771, + "ewc_loss": 0.00697117717936635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.97117720847018e-05, + "grad_norm": 3.4825615882873535, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8643395900726318, + "num_tokens": 209988585.0, + "step": 5503 + }, + { + "epoch": 0.7001653733621677, + "ewc_loss": 0.006942931097000837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.942931213416159e-05, + "grad_norm": 3.6332952976226807, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8664218783378601, + "num_tokens": 210032084.0, + "step": 5504 + }, + { + "epoch": 0.7002925836407582, + "ewc_loss": 0.00704922853037715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.04922858858481e-05, + "grad_norm": 3.6020538806915283, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8539268970489502, + "num_tokens": 210070639.0, + "step": 5505 + }, + { + "epoch": 0.7004197939193487, + "ewc_loss": 0.006946885492652655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.946885696379468e-05, + "grad_norm": 3.460355520248413, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8727626800537109, + "num_tokens": 210113793.0, + "step": 5506 + }, + { + "epoch": 0.7005470041979391, + "ewc_loss": 0.006885491777211428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.88549189362675e-05, + "grad_norm": 3.5212395191192627, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8568617105484009, + "num_tokens": 210153517.0, + "step": 5507 + }, + { + "epoch": 0.7006742144765297, + "ewc_loss": 0.006968384142965078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.968383968342096e-05, + "grad_norm": 3.5802876949310303, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8604959845542908, + "num_tokens": 210186818.0, + "step": 5508 + }, + { + "epoch": 0.7008014247551202, + "ewc_loss": 0.006974132731556892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.974132702453062e-05, + "grad_norm": 3.52519154548645, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8578065633773804, + "num_tokens": 210224283.0, + "step": 5509 + }, + { + "epoch": 0.7009286350337107, + "ewc_loss": 0.006934336852282286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.934336852282286e-05, + "grad_norm": 3.497147560119629, + "learning_rate": 1e-06, + "loss": 0.4103, + 
"mean_token_accuracy": 0.8606986999511719, + "num_tokens": 210263439.0, + "step": 5510 + }, + { + "epoch": 0.7010558453123013, + "ewc_loss": 0.00696016289293766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.96016286383383e-05, + "grad_norm": 3.489717483520508, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8664094805717468, + "num_tokens": 210305887.0, + "step": 5511 + }, + { + "epoch": 0.7011830555908918, + "ewc_loss": 0.006956664379686117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.95666458341293e-05, + "grad_norm": 3.551845073699951, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8643255233764648, + "num_tokens": 210340118.0, + "step": 5512 + }, + { + "epoch": 0.7013102658694823, + "ewc_loss": 0.006981323007494211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.981322803767398e-05, + "grad_norm": 3.5250861644744873, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8660959005355835, + "num_tokens": 210376098.0, + "step": 5513 + }, + { + "epoch": 0.7014374761480727, + "ewc_loss": 0.006962615065276623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.96261486154981e-05, + "grad_norm": 3.514305591583252, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8686119318008423, + "num_tokens": 210418801.0, + "step": 5514 + }, + { + "epoch": 0.7015646864266633, + "ewc_loss": 0.006952847819775343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.95284761604853e-05, + "grad_norm": 3.5669021606445312, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8544616103172302, + "num_tokens": 210452392.0, + "step": 5515 + }, + { + "epoch": 0.7016918967052538, + "ewc_loss": 0.006989018060266972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.989017856540158e-05, + "grad_norm": 3.511776924133301, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8721193075180054, + "num_tokens": 210489697.0, + "step": 5516 + }, + { + "epoch": 
0.7018191069838443, + "ewc_loss": 0.0069327098317444324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.932709948159754e-05, + "grad_norm": 3.4896316528320312, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8498117923736572, + "num_tokens": 210530212.0, + "step": 5517 + }, + { + "epoch": 0.7019463172624348, + "ewc_loss": 0.006955847609788179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.955847493372858e-05, + "grad_norm": 3.561314344406128, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8713062405586243, + "num_tokens": 210564974.0, + "step": 5518 + }, + { + "epoch": 0.7020735275410254, + "ewc_loss": 0.006999938283115625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.999938341323286e-05, + "grad_norm": 3.5705718994140625, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8651852011680603, + "num_tokens": 210601272.0, + "step": 5519 + }, + { + "epoch": 0.7022007378196158, + "ewc_loss": 0.006987508852034807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.987508822930977e-05, + "grad_norm": 3.548152446746826, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8673555850982666, + "num_tokens": 210636908.0, + "step": 5520 + }, + { + "epoch": 0.7023279480982063, + "ewc_loss": 0.006977487821131945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.977487646508962e-05, + "grad_norm": 3.5317776203155518, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8613799810409546, + "num_tokens": 210672629.0, + "step": 5521 + }, + { + "epoch": 0.7024551583767968, + "ewc_loss": 0.00698451790958643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.984517676755786e-05, + "grad_norm": 3.58186674118042, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8789033889770508, + "num_tokens": 210703531.0, + "step": 5522 + }, + { + "epoch": 0.7025823686553874, + "ewc_loss": 0.007014158647507429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.014158472884446e-05, + "grad_norm": 3.4682505130767822, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8780329823493958, + "num_tokens": 210742946.0, + "step": 5523 + }, + { + "epoch": 0.7027095789339779, + "ewc_loss": 0.00693766213953495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.937661964911968e-05, + "grad_norm": 3.5160739421844482, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.873245894908905, + "num_tokens": 210778345.0, + "step": 5524 + }, + { + "epoch": 0.7028367892125684, + "ewc_loss": 0.007002384401857853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.002384518273175e-05, + "grad_norm": 3.4835760593414307, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8699723482131958, + "num_tokens": 210821544.0, + "step": 5525 + }, + { + "epoch": 0.7029639994911588, + "ewc_loss": 0.006959393620491028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.95939379511401e-05, + "grad_norm": 3.5318641662597656, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8686696290969849, + "num_tokens": 210858060.0, + "step": 5526 + }, + { + "epoch": 0.7030912097697494, + "ewc_loss": 0.0070039089769124985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.003908831393346e-05, + "grad_norm": 3.5126922130584717, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8591892123222351, + "num_tokens": 210900365.0, + "step": 5527 + }, + { + "epoch": 0.7032184200483399, + "ewc_loss": 0.006973204202950001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.973204290261492e-05, + "grad_norm": 3.4851551055908203, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.860076367855072, + "num_tokens": 210941118.0, + "step": 5528 + }, + { + "epoch": 0.7033456303269304, + "ewc_loss": 0.006979059893637896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.979059980949387e-05, + "grad_norm": 3.498894214630127, + "learning_rate": 1e-06, + "loss": 0.3819, + 
"mean_token_accuracy": 0.871658980846405, + "num_tokens": 210979865.0, + "step": 5529 + }, + { + "epoch": 0.703472840605521, + "ewc_loss": 0.006977890618145466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.977890734560788e-05, + "grad_norm": 3.501514196395874, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8656017780303955, + "num_tokens": 211016385.0, + "step": 5530 + }, + { + "epoch": 0.7036000508841115, + "ewc_loss": 0.0069730645045638084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.9730645918753e-05, + "grad_norm": 3.5205013751983643, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8531625866889954, + "num_tokens": 211059071.0, + "step": 5531 + }, + { + "epoch": 0.7037272611627019, + "ewc_loss": 0.006977489218115807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.977489101700485e-05, + "grad_norm": 3.549619197845459, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8669215440750122, + "num_tokens": 211094189.0, + "step": 5532 + }, + { + "epoch": 0.7038544714412924, + "ewc_loss": 0.006988678127527237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.988678069319576e-05, + "grad_norm": 3.567080020904541, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8795799016952515, + "num_tokens": 211130524.0, + "step": 5533 + }, + { + "epoch": 0.703981681719883, + "ewc_loss": 0.006986378692090511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.986378866713494e-05, + "grad_norm": 3.496361255645752, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8584307432174683, + "num_tokens": 211172335.0, + "step": 5534 + }, + { + "epoch": 0.7041088919984735, + "ewc_loss": 0.006940540857613087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.9405410613399e-05, + "grad_norm": 3.5768158435821533, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.867636501789093, + "num_tokens": 211205406.0, + "step": 5535 + }, + { + "epoch": 0.704236102277064, + 
"ewc_loss": 0.0070117139257490635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.011713751126081e-05, + "grad_norm": 3.4926934242248535, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8595397472381592, + "num_tokens": 211246443.0, + "step": 5536 + }, + { + "epoch": 0.7043633125556545, + "ewc_loss": 0.006927256006747484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.927255890332162e-05, + "grad_norm": 3.5019917488098145, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8796917796134949, + "num_tokens": 211286707.0, + "step": 5537 + }, + { + "epoch": 0.704490522834245, + "ewc_loss": 0.006965987384319305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.965987267903984e-05, + "grad_norm": 3.5282480716705322, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8776922821998596, + "num_tokens": 211319448.0, + "step": 5538 + }, + { + "epoch": 0.7046177331128355, + "ewc_loss": 0.006967123132199049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.967123044887558e-05, + "grad_norm": 3.4907896518707275, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8604782819747925, + "num_tokens": 211363747.0, + "step": 5539 + }, + { + "epoch": 0.704744943391426, + "ewc_loss": 0.006948962341994047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.948962254682556e-05, + "grad_norm": 3.515666961669922, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8712151050567627, + "num_tokens": 211408080.0, + "step": 5540 + }, + { + "epoch": 0.7048721536700165, + "ewc_loss": 0.006955541204661131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.9555411755573e-05, + "grad_norm": 3.529827117919922, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8549144268035889, + "num_tokens": 211445656.0, + "step": 5541 + }, + { + "epoch": 0.7049993639486071, + "ewc_loss": 0.006955684628337622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.9556845119223e-05, + "grad_norm": 
3.4869863986968994, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8740530610084534, + "num_tokens": 211484342.0, + "step": 5542 + }, + { + "epoch": 0.7051265742271976, + "ewc_loss": 0.006908249109983444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.908248906256631e-05, + "grad_norm": 3.5435240268707275, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8606404662132263, + "num_tokens": 211518535.0, + "step": 5543 + }, + { + "epoch": 0.705253784505788, + "ewc_loss": 0.006975224707275629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.975224823690951e-05, + "grad_norm": 3.511864423751831, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8583844900131226, + "num_tokens": 211557236.0, + "step": 5544 + }, + { + "epoch": 0.7053809947843785, + "ewc_loss": 0.0069254799745976925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.92547982907854e-05, + "grad_norm": 3.4724535942077637, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.876375138759613, + "num_tokens": 211599216.0, + "step": 5545 + }, + { + "epoch": 0.7055082050629691, + "ewc_loss": 0.006921470630913973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.921470776433125e-05, + "grad_norm": 3.536339521408081, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8469735383987427, + "num_tokens": 211638930.0, + "step": 5546 + }, + { + "epoch": 0.7056354153415596, + "ewc_loss": 0.0069733369164168835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.97333671269007e-05, + "grad_norm": 3.50950288772583, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8579641580581665, + "num_tokens": 211680765.0, + "step": 5547 + }, + { + "epoch": 0.7057626256201501, + "ewc_loss": 0.0069169108755886555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.916910933796316e-05, + "grad_norm": 3.4973204135894775, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8778722286224365, + 
"num_tokens": 211719037.0, + "step": 5548 + }, + { + "epoch": 0.7058898358987407, + "ewc_loss": 0.0069202580489218235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.920257874298841e-05, + "grad_norm": 3.5514254570007324, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8599333763122559, + "num_tokens": 211759655.0, + "step": 5549 + }, + { + "epoch": 0.7060170461773311, + "ewc_loss": 0.006955295335501432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.955295248189941e-05, + "grad_norm": 3.5361340045928955, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8487651944160461, + "num_tokens": 211800405.0, + "step": 5550 + }, + { + "epoch": 0.7061442564559216, + "ewc_loss": 0.0069222538731992245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.922253669472411e-05, + "grad_norm": 3.5251951217651367, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8475354313850403, + "num_tokens": 211846057.0, + "step": 5551 + }, + { + "epoch": 0.7062714667345121, + "ewc_loss": 0.00692497193813324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.92497196723707e-05, + "grad_norm": 3.545454263687134, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8628267645835876, + "num_tokens": 211885944.0, + "step": 5552 + }, + { + "epoch": 0.7063986770131027, + "ewc_loss": 0.006939721759408712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.939721788512543e-05, + "grad_norm": 3.465101718902588, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8697434067726135, + "num_tokens": 211928727.0, + "step": 5553 + }, + { + "epoch": 0.7065258872916932, + "ewc_loss": 0.006875978782773018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.875978579046205e-05, + "grad_norm": 3.5527305603027344, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8558986186981201, + "num_tokens": 211967593.0, + "step": 5554 + }, + { + "epoch": 0.7066530975702837, + "ewc_loss": 0.00695966137573123, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.959661550354213e-05, + "grad_norm": 3.565188407897949, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8602426052093506, + "num_tokens": 212001023.0, + "step": 5555 + }, + { + "epoch": 0.7067803078488741, + "ewc_loss": 0.006940504536032677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.940504681551829e-05, + "grad_norm": 3.5444657802581787, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8560115694999695, + "num_tokens": 212045611.0, + "step": 5556 + }, + { + "epoch": 0.7069075181274647, + "ewc_loss": 0.006927003618329763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.92700341460295e-05, + "grad_norm": 3.5012381076812744, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8602323532104492, + "num_tokens": 212087167.0, + "step": 5557 + }, + { + "epoch": 0.7070347284060552, + "ewc_loss": 0.006923490669578314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.923490582266822e-05, + "grad_norm": 3.5232622623443604, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.864743173122406, + "num_tokens": 212124254.0, + "step": 5558 + }, + { + "epoch": 0.7071619386846457, + "ewc_loss": 0.006948208436369896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.948208465473726e-05, + "grad_norm": 3.5423641204833984, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8573751449584961, + "num_tokens": 212162706.0, + "step": 5559 + }, + { + "epoch": 0.7072891489632362, + "ewc_loss": 0.0069468035362660885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.946803478058428e-05, + "grad_norm": 3.5281219482421875, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8642423152923584, + "num_tokens": 212197359.0, + "step": 5560 + }, + { + "epoch": 0.7074163592418268, + "ewc_loss": 0.006934128701686859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.93412875989452e-05, + "grad_norm": 3.507066249847412, + 
"learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8715604543685913, + "num_tokens": 212237300.0, + "step": 5561 + }, + { + "epoch": 0.7075435695204173, + "ewc_loss": 0.006937119178473949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.93711917847395e-05, + "grad_norm": 3.4827747344970703, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8450995683670044, + "num_tokens": 212283472.0, + "step": 5562 + }, + { + "epoch": 0.7076707797990077, + "ewc_loss": 0.006930043455213308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.930043309694156e-05, + "grad_norm": 3.5369551181793213, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8588143587112427, + "num_tokens": 212321433.0, + "step": 5563 + }, + { + "epoch": 0.7077979900775982, + "ewc_loss": 0.006980509497225285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.980509351706132e-05, + "grad_norm": 3.508702516555786, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8625099658966064, + "num_tokens": 212362234.0, + "step": 5564 + }, + { + "epoch": 0.7079252003561888, + "ewc_loss": 0.00693101529031992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.931015377631411e-05, + "grad_norm": 3.5801663398742676, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.854414701461792, + "num_tokens": 212399259.0, + "step": 5565 + }, + { + "epoch": 0.7080524106347793, + "ewc_loss": 0.006979311816394329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.979311729082838e-05, + "grad_norm": 3.4994304180145264, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8642416000366211, + "num_tokens": 212438489.0, + "step": 5566 + }, + { + "epoch": 0.7081796209133698, + "ewc_loss": 0.006916443817317486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.916443817317486e-05, + "grad_norm": 3.5060017108917236, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8552833199501038, + "num_tokens": 212482283.0, + 
"step": 5567 + }, + { + "epoch": 0.7083068311919604, + "ewc_loss": 0.0069446563720703125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.944656342966482e-05, + "grad_norm": 3.52687406539917, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8583088517189026, + "num_tokens": 212522749.0, + "step": 5568 + }, + { + "epoch": 0.7084340414705508, + "ewc_loss": 0.0069420537911355495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.942053732927889e-05, + "grad_norm": 3.545722246170044, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8548129200935364, + "num_tokens": 212559937.0, + "step": 5569 + }, + { + "epoch": 0.7085612517491413, + "ewc_loss": 0.006950457580387592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.95045746397227e-05, + "grad_norm": 3.5606346130371094, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8691962957382202, + "num_tokens": 212595578.0, + "step": 5570 + }, + { + "epoch": 0.7086884620277318, + "ewc_loss": 0.006946119479835033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.946119538042694e-05, + "grad_norm": 3.5218427181243896, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8751339912414551, + "num_tokens": 212634085.0, + "step": 5571 + }, + { + "epoch": 0.7088156723063224, + "ewc_loss": 0.006925287190824747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.925287016201764e-05, + "grad_norm": 3.5683071613311768, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8535085320472717, + "num_tokens": 212673557.0, + "step": 5572 + }, + { + "epoch": 0.7089428825849129, + "ewc_loss": 0.006958900485187769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.958900485187769e-05, + "grad_norm": 3.5142414569854736, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8729081153869629, + "num_tokens": 212711142.0, + "step": 5573 + }, + { + "epoch": 0.7090700928635034, + "ewc_loss": 0.006921787280589342, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 6.921787280589342e-05, + "grad_norm": 3.5984811782836914, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8662445545196533, + "num_tokens": 212745570.0, + "step": 5574 + }, + { + "epoch": 0.7091973031420938, + "ewc_loss": 0.0069894567131996155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.989456596784294e-05, + "grad_norm": 3.5302348136901855, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8671108484268188, + "num_tokens": 212783048.0, + "step": 5575 + }, + { + "epoch": 0.7093245134206844, + "ewc_loss": 0.006903151515871286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.903151370352134e-05, + "grad_norm": 3.5041987895965576, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8816767334938049, + "num_tokens": 212821231.0, + "step": 5576 + }, + { + "epoch": 0.7094517236992749, + "ewc_loss": 0.006938244216144085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.938244041521102e-05, + "grad_norm": 3.516303300857544, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8625070452690125, + "num_tokens": 212861715.0, + "step": 5577 + }, + { + "epoch": 0.7095789339778654, + "ewc_loss": 0.006949907168745995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.949907401576638e-05, + "grad_norm": 3.5346736907958984, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8543546795845032, + "num_tokens": 212900655.0, + "step": 5578 + }, + { + "epoch": 0.709706144256456, + "ewc_loss": 0.006956009194254875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.956009019631892e-05, + "grad_norm": 3.5611932277679443, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8554983139038086, + "num_tokens": 212939308.0, + "step": 5579 + }, + { + "epoch": 0.7098333545350465, + "ewc_loss": 0.006962142884731293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.962142651900649e-05, + "grad_norm": 3.454122543334961, + "learning_rate": 1e-06, + "loss": 
0.3831, + "mean_token_accuracy": 0.8674210906028748, + "num_tokens": 212981174.0, + "step": 5580 + }, + { + "epoch": 0.7099605648136369, + "ewc_loss": 0.006897399201989174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.897398998262361e-05, + "grad_norm": 3.532224178314209, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8730507493019104, + "num_tokens": 213016401.0, + "step": 5581 + }, + { + "epoch": 0.7100877750922274, + "ewc_loss": 0.0069749257527291775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.974925781833008e-05, + "grad_norm": 3.5151569843292236, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8528057336807251, + "num_tokens": 213058480.0, + "step": 5582 + }, + { + "epoch": 0.710214985370818, + "ewc_loss": 0.006952277384698391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.952277180971578e-05, + "grad_norm": 3.5650031566619873, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8755360245704651, + "num_tokens": 213093527.0, + "step": 5583 + }, + { + "epoch": 0.7103421956494085, + "ewc_loss": 0.006971354596316814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.971354741835967e-05, + "grad_norm": 3.5659031867980957, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8606662750244141, + "num_tokens": 213126778.0, + "step": 5584 + }, + { + "epoch": 0.710469405927999, + "ewc_loss": 0.006955608259886503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.955608114367351e-05, + "grad_norm": 3.4835011959075928, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8655664920806885, + "num_tokens": 213170863.0, + "step": 5585 + }, + { + "epoch": 0.7105966162065895, + "ewc_loss": 0.00692698685452342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.926986679900438e-05, + "grad_norm": 3.540011405944824, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8404783010482788, + "num_tokens": 213211336.0, + "step": 5586 + }, + { + "epoch": 
0.71072382648518, + "ewc_loss": 0.006975837517529726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.975837459322065e-05, + "grad_norm": 3.533113718032837, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8708884716033936, + "num_tokens": 213245262.0, + "step": 5587 + }, + { + "epoch": 0.7108510367637705, + "ewc_loss": 0.006959288381040096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.959288293728605e-05, + "grad_norm": 3.568199634552002, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8545039892196655, + "num_tokens": 213282150.0, + "step": 5588 + }, + { + "epoch": 0.710978247042361, + "ewc_loss": 0.006975875236093998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.975875294301659e-05, + "grad_norm": 3.553621292114258, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8696473836898804, + "num_tokens": 213325573.0, + "step": 5589 + }, + { + "epoch": 0.7111054573209515, + "ewc_loss": 0.00696027185767889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.960272003198043e-05, + "grad_norm": 3.504612922668457, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8722928762435913, + "num_tokens": 213363738.0, + "step": 5590 + }, + { + "epoch": 0.7112326675995421, + "ewc_loss": 0.006943217944353819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.943217886146158e-05, + "grad_norm": 3.4742941856384277, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8699005246162415, + "num_tokens": 213404796.0, + "step": 5591 + }, + { + "epoch": 0.7113598778781326, + "ewc_loss": 0.0069312360137701035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.931235839147121e-05, + "grad_norm": 3.5902512073516846, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8556973934173584, + "num_tokens": 213436711.0, + "step": 5592 + }, + { + "epoch": 0.711487088156723, + "ewc_loss": 0.007019311189651489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.019311306066811e-05, 
+ "grad_norm": 3.535384178161621, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8675501942634583, + "num_tokens": 213473458.0, + "step": 5593 + }, + { + "epoch": 0.7116142984353135, + "ewc_loss": 0.006940172053873539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.940172170288861e-05, + "grad_norm": 3.5199644565582275, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8736649751663208, + "num_tokens": 213507355.0, + "step": 5594 + }, + { + "epoch": 0.7117415087139041, + "ewc_loss": 0.006966249085962772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.966249202378094e-05, + "grad_norm": 3.599188804626465, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8464758992195129, + "num_tokens": 213543771.0, + "step": 5595 + }, + { + "epoch": 0.7118687189924946, + "ewc_loss": 0.007025459315627813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.025459490250796e-05, + "grad_norm": 3.4964730739593506, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8619715571403503, + "num_tokens": 213578317.0, + "step": 5596 + }, + { + "epoch": 0.7119959292710851, + "ewc_loss": 0.006957856472581625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.957856385270134e-05, + "grad_norm": 3.5326883792877197, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8594467043876648, + "num_tokens": 213615795.0, + "step": 5597 + }, + { + "epoch": 0.7121231395496757, + "ewc_loss": 0.007012574002146721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.012573769316077e-05, + "grad_norm": 3.4955153465270996, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8739407062530518, + "num_tokens": 213655452.0, + "step": 5598 + }, + { + "epoch": 0.7122503498282661, + "ewc_loss": 0.006985542364418507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.985542131587863e-05, + "grad_norm": 3.5921125411987305, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 
0.8554856777191162, + "num_tokens": 213688079.0, + "step": 5599 + }, + { + "epoch": 0.7123775601068566, + "ewc_loss": 0.007085494231432676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.085494144121185e-05, + "grad_norm": 3.4804131984710693, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8801313638687134, + "num_tokens": 213725310.0, + "step": 5600 + }, + { + "epoch": 0.7125047703854471, + "ewc_loss": 0.006971906870603561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.971906987018883e-05, + "grad_norm": 3.578033924102783, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8586786985397339, + "num_tokens": 213757950.0, + "step": 5601 + }, + { + "epoch": 0.7126319806640377, + "ewc_loss": 0.0070844958536326885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.084495882736519e-05, + "grad_norm": 3.5475142002105713, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.852530837059021, + "num_tokens": 213794146.0, + "step": 5602 + }, + { + "epoch": 0.7127591909426282, + "ewc_loss": 0.0070374575443565845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.037457544356585e-05, + "grad_norm": 3.4889979362487793, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8678528666496277, + "num_tokens": 213834922.0, + "step": 5603 + }, + { + "epoch": 0.7128864012212187, + "ewc_loss": 0.007014627102762461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0146270445548e-05, + "grad_norm": 3.4662277698516846, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8823862075805664, + "num_tokens": 213879651.0, + "step": 5604 + }, + { + "epoch": 0.7130136114998091, + "ewc_loss": 0.007032344583421946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.032344728941098e-05, + "grad_norm": 3.5208821296691895, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8632675409317017, + "num_tokens": 213921833.0, + "step": 5605 + }, + { + "epoch": 0.7131408217783997, + "ewc_loss": 
0.0070360382087528706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.036038005026057e-05, + "grad_norm": 3.4743311405181885, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8700923919677734, + "num_tokens": 213962909.0, + "step": 5606 + }, + { + "epoch": 0.7132680320569902, + "ewc_loss": 0.006983151193708181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.983151251915842e-05, + "grad_norm": 3.4822747707366943, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8664109706878662, + "num_tokens": 214002030.0, + "step": 5607 + }, + { + "epoch": 0.7133952423355807, + "ewc_loss": 0.006997629068791866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.997628952376544e-05, + "grad_norm": 3.5558815002441406, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8585458993911743, + "num_tokens": 214037094.0, + "step": 5608 + }, + { + "epoch": 0.7135224526141712, + "ewc_loss": 0.0070403520949184895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.040351920295507e-05, + "grad_norm": 3.5207200050354004, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8677540421485901, + "num_tokens": 214076649.0, + "step": 5609 + }, + { + "epoch": 0.7136496628927618, + "ewc_loss": 0.006973654497414827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.97365467203781e-05, + "grad_norm": 3.535352945327759, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8591495156288147, + "num_tokens": 214114194.0, + "step": 5610 + }, + { + "epoch": 0.7137768731713523, + "ewc_loss": 0.006996392272412777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.996392039582133e-05, + "grad_norm": 3.5669331550598145, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8669683933258057, + "num_tokens": 214149652.0, + "step": 5611 + }, + { + "epoch": 0.7139040834499427, + "ewc_loss": 0.006993691902607679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.99369193171151e-05, + "grad_norm": 
3.4764671325683594, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8734414577484131, + "num_tokens": 214189068.0, + "step": 5612 + }, + { + "epoch": 0.7140312937285332, + "ewc_loss": 0.006910266354680061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.910266529303044e-05, + "grad_norm": 3.633056640625, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8524928689002991, + "num_tokens": 214220251.0, + "step": 5613 + }, + { + "epoch": 0.7141585040071238, + "ewc_loss": 0.00704768393188715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.04768390278332e-05, + "grad_norm": 3.5999934673309326, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8622639179229736, + "num_tokens": 214256705.0, + "step": 5614 + }, + { + "epoch": 0.7142857142857143, + "ewc_loss": 0.006977362558245659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.977362500037998e-05, + "grad_norm": 3.4940176010131836, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8779184818267822, + "num_tokens": 214292563.0, + "step": 5615 + }, + { + "epoch": 0.7144129245643048, + "ewc_loss": 0.006934244651347399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.934244447620586e-05, + "grad_norm": 3.4945483207702637, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8592325448989868, + "num_tokens": 214333843.0, + "step": 5616 + }, + { + "epoch": 0.7145401348428954, + "ewc_loss": 0.006985927000641823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.985927029745653e-05, + "grad_norm": 3.475209951400757, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8755499124526978, + "num_tokens": 214372272.0, + "step": 5617 + }, + { + "epoch": 0.7146673451214858, + "ewc_loss": 0.006967557594180107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.967557419557124e-05, + "grad_norm": 3.5301804542541504, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8698283433914185, + "num_tokens": 
214416364.0, + "step": 5618 + }, + { + "epoch": 0.7147945554000763, + "ewc_loss": 0.007005654275417328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.005654333624989e-05, + "grad_norm": 3.5153048038482666, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8631646037101746, + "num_tokens": 214453102.0, + "step": 5619 + }, + { + "epoch": 0.7149217656786668, + "ewc_loss": 0.006973310373723507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.973310519242659e-05, + "grad_norm": 3.484802007675171, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8683276176452637, + "num_tokens": 214492374.0, + "step": 5620 + }, + { + "epoch": 0.7150489759572574, + "ewc_loss": 0.006976279895752668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.976279837545007e-05, + "grad_norm": 3.510530948638916, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8654065728187561, + "num_tokens": 214530918.0, + "step": 5621 + }, + { + "epoch": 0.7151761862358479, + "ewc_loss": 0.007005730643868446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.005730731179938e-05, + "grad_norm": 3.5202889442443848, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.880617618560791, + "num_tokens": 214570227.0, + "step": 5622 + }, + { + "epoch": 0.7153033965144384, + "ewc_loss": 0.0069880676455795765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.988067616475746e-05, + "grad_norm": 3.496537923812866, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.868004322052002, + "num_tokens": 214609519.0, + "step": 5623 + }, + { + "epoch": 0.7154306067930288, + "ewc_loss": 0.00698234885931015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.982348713790998e-05, + "grad_norm": 3.5725290775299072, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8609476089477539, + "num_tokens": 214643564.0, + "step": 5624 + }, + { + "epoch": 0.7155578170716194, + "ewc_loss": 0.007014998700469732, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.014998845988885e-05, + "grad_norm": 3.545161724090576, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8732753992080688, + "num_tokens": 214677632.0, + "step": 5625 + }, + { + "epoch": 0.7156850273502099, + "ewc_loss": 0.00698191998526454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.981920159887522e-05, + "grad_norm": 3.5492756366729736, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8664082884788513, + "num_tokens": 214710363.0, + "step": 5626 + }, + { + "epoch": 0.7158122376288004, + "ewc_loss": 0.006998089607805014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.998089520493522e-05, + "grad_norm": 3.5537424087524414, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8716830611228943, + "num_tokens": 214746982.0, + "step": 5627 + }, + { + "epoch": 0.715939447907391, + "ewc_loss": 0.006988058798015118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.988058885326609e-05, + "grad_norm": 3.52978515625, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8559407591819763, + "num_tokens": 214786447.0, + "step": 5628 + }, + { + "epoch": 0.7160666581859815, + "ewc_loss": 0.006971973925828934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.971973925828934e-05, + "grad_norm": 3.5118112564086914, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8733783960342407, + "num_tokens": 214824627.0, + "step": 5629 + }, + { + "epoch": 0.7161938684645719, + "ewc_loss": 0.006972792558372021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.97279247106053e-05, + "grad_norm": 3.575887441635132, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8465556502342224, + "num_tokens": 214863051.0, + "step": 5630 + }, + { + "epoch": 0.7163210787431624, + "ewc_loss": 0.007006755098700523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.006755186012015e-05, + "grad_norm": 3.628857374191284, + "learning_rate": 
1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8516262173652649, + "num_tokens": 214900524.0, + "step": 5631 + }, + { + "epoch": 0.716448289021753, + "ewc_loss": 0.0070357550866901875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.035754970274866e-05, + "grad_norm": 3.4887783527374268, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8708103895187378, + "num_tokens": 214943949.0, + "step": 5632 + }, + { + "epoch": 0.7165754993003435, + "ewc_loss": 0.006946131121367216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.946131179574877e-05, + "grad_norm": 3.4929356575012207, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8693696856498718, + "num_tokens": 214979126.0, + "step": 5633 + }, + { + "epoch": 0.716702709578934, + "ewc_loss": 0.0070045278407633305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.004528015386313e-05, + "grad_norm": 3.529574394226074, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8655691146850586, + "num_tokens": 215017740.0, + "step": 5634 + }, + { + "epoch": 0.7168299198575245, + "ewc_loss": 0.007016026880592108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.016026938799769e-05, + "grad_norm": 3.5229742527008057, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8738799095153809, + "num_tokens": 215054472.0, + "step": 5635 + }, + { + "epoch": 0.716957130136115, + "ewc_loss": 0.006992565002292395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.992564885877073e-05, + "grad_norm": 3.5835437774658203, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8679971694946289, + "num_tokens": 215086894.0, + "step": 5636 + }, + { + "epoch": 0.7170843404147055, + "ewc_loss": 0.00703980028629303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.039800402708352e-05, + "grad_norm": 3.4665229320526123, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8664454221725464, + "num_tokens": 215128157.0, + "step": 5637 + }, 
+ { + "epoch": 0.717211550693296, + "ewc_loss": 0.006945921573787928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.945921631995589e-05, + "grad_norm": 3.5479507446289062, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8724704384803772, + "num_tokens": 215161008.0, + "step": 5638 + }, + { + "epoch": 0.7173387609718865, + "ewc_loss": 0.007049567066133022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.04956692061387e-05, + "grad_norm": 3.5830631256103516, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8471686244010925, + "num_tokens": 215197157.0, + "step": 5639 + }, + { + "epoch": 0.7174659712504771, + "ewc_loss": 0.007053263019770384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.053263107081875e-05, + "grad_norm": 3.4671382904052734, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8714404106140137, + "num_tokens": 215240270.0, + "step": 5640 + }, + { + "epoch": 0.7175931815290676, + "ewc_loss": 0.006973653100430965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.973653216846287e-05, + "grad_norm": 3.476560115814209, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8728697896003723, + "num_tokens": 215279828.0, + "step": 5641 + }, + { + "epoch": 0.717720391807658, + "ewc_loss": 0.007025973871350288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.025973900454119e-05, + "grad_norm": 3.5477359294891357, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8599309921264648, + "num_tokens": 215315271.0, + "step": 5642 + }, + { + "epoch": 0.7178476020862485, + "ewc_loss": 0.007056645583361387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.056645699776709e-05, + "grad_norm": 3.513145923614502, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8667703866958618, + "num_tokens": 215352621.0, + "step": 5643 + }, + { + "epoch": 0.7179748123648391, + "ewc_loss": 0.007010387256741524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.010387344053015e-05, + "grad_norm": 3.519341468811035, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8686717748641968, + "num_tokens": 215390231.0, + "step": 5644 + }, + { + "epoch": 0.7181020226434296, + "ewc_loss": 0.007028775289654732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.02877514413558e-05, + "grad_norm": 3.518923759460449, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8790168762207031, + "num_tokens": 215426812.0, + "step": 5645 + }, + { + "epoch": 0.7182292329220201, + "ewc_loss": 0.007012306712567806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.012306741671637e-05, + "grad_norm": 3.5253665447235107, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8552553057670593, + "num_tokens": 215467551.0, + "step": 5646 + }, + { + "epoch": 0.7183564432006107, + "ewc_loss": 0.007033399771898985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.033399742795154e-05, + "grad_norm": 3.4945907592773438, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8738235235214233, + "num_tokens": 215508559.0, + "step": 5647 + }, + { + "epoch": 0.7184836534792011, + "ewc_loss": 0.006995227187871933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.995227158768103e-05, + "grad_norm": 3.5332963466644287, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8588670492172241, + "num_tokens": 215548709.0, + "step": 5648 + }, + { + "epoch": 0.7186108637577916, + "ewc_loss": 0.007014443166553974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.014442962827161e-05, + "grad_norm": 3.477775812149048, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8675627112388611, + "num_tokens": 215593844.0, + "step": 5649 + }, + { + "epoch": 0.7187380740363821, + "ewc_loss": 0.006969926413148642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.969926471356302e-05, + "grad_norm": 3.5382254123687744, + "learning_rate": 1e-06, + "loss": 0.3802, + 
"mean_token_accuracy": 0.8713911771774292, + "num_tokens": 215633853.0, + "step": 5650 + }, + { + "epoch": 0.7188652843149727, + "ewc_loss": 0.007022955920547247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.022955833235756e-05, + "grad_norm": 3.605158567428589, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8642029166221619, + "num_tokens": 215667272.0, + "step": 5651 + }, + { + "epoch": 0.7189924945935632, + "ewc_loss": 0.007026818580925465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.026818639133126e-05, + "grad_norm": 3.5590837001800537, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8486665487289429, + "num_tokens": 215705889.0, + "step": 5652 + }, + { + "epoch": 0.7191197048721537, + "ewc_loss": 0.0069864182732999325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.98641815688461e-05, + "grad_norm": 3.5118765830993652, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8742645382881165, + "num_tokens": 215749143.0, + "step": 5653 + }, + { + "epoch": 0.7192469151507441, + "ewc_loss": 0.006970056798309088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.970056710997596e-05, + "grad_norm": 3.526731252670288, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8599435687065125, + "num_tokens": 215787583.0, + "step": 5654 + }, + { + "epoch": 0.7193741254293347, + "ewc_loss": 0.0069929687306284904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.99296870152466e-05, + "grad_norm": 3.5464484691619873, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8654129505157471, + "num_tokens": 215825552.0, + "step": 5655 + }, + { + "epoch": 0.7195013357079252, + "ewc_loss": 0.0069847628474235535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.984762876527384e-05, + "grad_norm": 3.5297791957855225, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8746004700660706, + "num_tokens": 215858310.0, + "step": 5656 + }, + { + "epoch": 
0.7196285459865157, + "ewc_loss": 0.0069918218068778515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.991822010604665e-05, + "grad_norm": 3.573503255844116, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8541790246963501, + "num_tokens": 215898403.0, + "step": 5657 + }, + { + "epoch": 0.7197557562651062, + "ewc_loss": 0.0070061469450592995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.006146915955469e-05, + "grad_norm": 3.5562992095947266, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8417907953262329, + "num_tokens": 215935930.0, + "step": 5658 + }, + { + "epoch": 0.7198829665436968, + "ewc_loss": 0.0069763315841555595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.976331496844068e-05, + "grad_norm": 3.570641040802002, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8605440258979797, + "num_tokens": 215971509.0, + "step": 5659 + }, + { + "epoch": 0.7200101768222873, + "ewc_loss": 0.007001346442848444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.001346239121631e-05, + "grad_norm": 3.476243019104004, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8720635175704956, + "num_tokens": 216008696.0, + "step": 5660 + }, + { + "epoch": 0.7201373871008777, + "ewc_loss": 0.00694128917530179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.941289029782638e-05, + "grad_norm": 3.5341317653656006, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8647077083587646, + "num_tokens": 216045861.0, + "step": 5661 + }, + { + "epoch": 0.7202645973794682, + "ewc_loss": 0.0070101008750498295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.010100671323016e-05, + "grad_norm": 3.5269222259521484, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8746187090873718, + "num_tokens": 216081307.0, + "step": 5662 + }, + { + "epoch": 0.7203918076580588, + "ewc_loss": 0.007010181434452534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.010181434452534e-05, + "grad_norm": 3.5158472061157227, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.873090386390686, + "num_tokens": 216118506.0, + "step": 5663 + }, + { + "epoch": 0.7205190179366493, + "ewc_loss": 0.006980411242693663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.980411126278341e-05, + "grad_norm": 3.515178918838501, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8569899797439575, + "num_tokens": 216159000.0, + "step": 5664 + }, + { + "epoch": 0.7206462282152398, + "ewc_loss": 0.006999714765697718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.999714969424531e-05, + "grad_norm": 3.489649772644043, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8591151237487793, + "num_tokens": 216204712.0, + "step": 5665 + }, + { + "epoch": 0.7207734384938304, + "ewc_loss": 0.00698000006377697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.98000003467314e-05, + "grad_norm": 3.614302158355713, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8580882549285889, + "num_tokens": 216235001.0, + "step": 5666 + }, + { + "epoch": 0.7209006487724208, + "ewc_loss": 0.007069174200296402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.069174171192572e-05, + "grad_norm": 3.5752182006835938, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8588645458221436, + "num_tokens": 216271383.0, + "step": 5667 + }, + { + "epoch": 0.7210278590510113, + "ewc_loss": 0.007013130933046341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.013131107669324e-05, + "grad_norm": 3.538454055786133, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.86552894115448, + "num_tokens": 216307923.0, + "step": 5668 + }, + { + "epoch": 0.7211550693296018, + "ewc_loss": 0.007014586124569178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.01458629919216e-05, + "grad_norm": 3.491607189178467, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 
0.8839976787567139, + "num_tokens": 216345147.0, + "step": 5669 + }, + { + "epoch": 0.7212822796081924, + "ewc_loss": 0.006991121452301741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.991121335886419e-05, + "grad_norm": 3.5027284622192383, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8581454157829285, + "num_tokens": 216386991.0, + "step": 5670 + }, + { + "epoch": 0.7214094898867829, + "ewc_loss": 0.007025734521448612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.025734521448612e-05, + "grad_norm": 3.526479482650757, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8673418760299683, + "num_tokens": 216425534.0, + "step": 5671 + }, + { + "epoch": 0.7215367001653734, + "ewc_loss": 0.007024098187685013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.024098158581182e-05, + "grad_norm": 3.5103423595428467, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8652594089508057, + "num_tokens": 216466686.0, + "step": 5672 + }, + { + "epoch": 0.7216639104439638, + "ewc_loss": 0.0070027681067585945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.002767961239442e-05, + "grad_norm": 3.524513006210327, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8771195411682129, + "num_tokens": 216505245.0, + "step": 5673 + }, + { + "epoch": 0.7217911207225544, + "ewc_loss": 0.00702497037127614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.024970545899123e-05, + "grad_norm": 3.597257614135742, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8629895448684692, + "num_tokens": 216538024.0, + "step": 5674 + }, + { + "epoch": 0.7219183310011449, + "ewc_loss": 0.007057002279907465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.057002221699804e-05, + "grad_norm": 3.5558340549468994, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8654321432113647, + "num_tokens": 216573722.0, + "step": 5675 + }, + { + "epoch": 0.7220455412797354, + "ewc_loss": 
0.00700739910826087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.00739910826087e-05, + "grad_norm": 3.509176015853882, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8721822500228882, + "num_tokens": 216610362.0, + "step": 5676 + }, + { + "epoch": 0.7221727515583259, + "ewc_loss": 0.007014529779553413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.01452954672277e-05, + "grad_norm": 3.5690338611602783, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8574703931808472, + "num_tokens": 216648732.0, + "step": 5677 + }, + { + "epoch": 0.7222999618369165, + "ewc_loss": 0.0070495689287781715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.049569103401154e-05, + "grad_norm": 3.5599985122680664, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.87516188621521, + "num_tokens": 216680324.0, + "step": 5678 + }, + { + "epoch": 0.7224271721155069, + "ewc_loss": 0.0070383595302701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.038359763100743e-05, + "grad_norm": 3.5567922592163086, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8679682612419128, + "num_tokens": 216712773.0, + "step": 5679 + }, + { + "epoch": 0.7225543823940974, + "ewc_loss": 0.007036400958895683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.036401075311005e-05, + "grad_norm": 3.5520503520965576, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8574123978614807, + "num_tokens": 216752538.0, + "step": 5680 + }, + { + "epoch": 0.722681592672688, + "ewc_loss": 0.007046258542686701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.046258542686701e-05, + "grad_norm": 3.579735517501831, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8682605028152466, + "num_tokens": 216791795.0, + "step": 5681 + }, + { + "epoch": 0.7228088029512785, + "ewc_loss": 0.007067038677632809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.067038677632809e-05, + "grad_norm": 
3.5125198364257812, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8619616627693176, + "num_tokens": 216831729.0, + "step": 5682 + }, + { + "epoch": 0.722936013229869, + "ewc_loss": 0.007012778427451849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.012778223725036e-05, + "grad_norm": 3.522162675857544, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8661463260650635, + "num_tokens": 216869494.0, + "step": 5683 + }, + { + "epoch": 0.7230632235084595, + "ewc_loss": 0.007043111603707075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.043111691018566e-05, + "grad_norm": 3.5301125049591064, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8465632200241089, + "num_tokens": 216906672.0, + "step": 5684 + }, + { + "epoch": 0.72319043378705, + "ewc_loss": 0.007050899788737297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.050899876048788e-05, + "grad_norm": 3.544379234313965, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8721076846122742, + "num_tokens": 216941587.0, + "step": 5685 + }, + { + "epoch": 0.7233176440656405, + "ewc_loss": 0.007067561149597168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.067561091389507e-05, + "grad_norm": 3.486117362976074, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8587360978126526, + "num_tokens": 216981812.0, + "step": 5686 + }, + { + "epoch": 0.723444854344231, + "ewc_loss": 0.007033600006252527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.033599831629544e-05, + "grad_norm": 3.5040290355682373, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8755945563316345, + "num_tokens": 217024137.0, + "step": 5687 + }, + { + "epoch": 0.7235720646228215, + "ewc_loss": 0.007065361365675926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.06536156940274e-05, + "grad_norm": 3.5011825561523438, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8777514100074768, + 
"num_tokens": 217063753.0, + "step": 5688 + }, + { + "epoch": 0.7236992749014121, + "ewc_loss": 0.007027382031083107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.027381798252463e-05, + "grad_norm": 3.5494117736816406, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8643356561660767, + "num_tokens": 217105611.0, + "step": 5689 + }, + { + "epoch": 0.7238264851800026, + "ewc_loss": 0.007054441142827272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.054441084619612e-05, + "grad_norm": 3.5621466636657715, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8467026948928833, + "num_tokens": 217145989.0, + "step": 5690 + }, + { + "epoch": 0.723953695458593, + "ewc_loss": 0.007047174032777548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.047173858154565e-05, + "grad_norm": 3.519327402114868, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8828121423721313, + "num_tokens": 217183726.0, + "step": 5691 + }, + { + "epoch": 0.7240809057371835, + "ewc_loss": 0.007008285261690617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.008285319898278e-05, + "grad_norm": 3.5020487308502197, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.863369345664978, + "num_tokens": 217226644.0, + "step": 5692 + }, + { + "epoch": 0.7242081160157741, + "ewc_loss": 0.006997268181294203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.997268064878881e-05, + "grad_norm": 3.565241813659668, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8516455888748169, + "num_tokens": 217263327.0, + "step": 5693 + }, + { + "epoch": 0.7243353262943646, + "ewc_loss": 0.007031485438346863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.031485438346863e-05, + "grad_norm": 3.604811906814575, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8688778281211853, + "num_tokens": 217300595.0, + "step": 5694 + }, + { + "epoch": 0.7244625365729551, + "ewc_loss": 0.007014276459813118, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.014276343397796e-05, + "grad_norm": 3.5823593139648438, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8522147536277771, + "num_tokens": 217336385.0, + "step": 5695 + }, + { + "epoch": 0.7245897468515456, + "ewc_loss": 0.007004712242633104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.004712097113952e-05, + "grad_norm": 3.511124610900879, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8733609318733215, + "num_tokens": 217374309.0, + "step": 5696 + }, + { + "epoch": 0.7247169571301361, + "ewc_loss": 0.006976789329200983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.976789154578e-05, + "grad_norm": 3.556431531906128, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8572407960891724, + "num_tokens": 217408386.0, + "step": 5697 + }, + { + "epoch": 0.7248441674087266, + "ewc_loss": 0.007021194323897362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.021194323897362e-05, + "grad_norm": 3.5581088066101074, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8533071279525757, + "num_tokens": 217442855.0, + "step": 5698 + }, + { + "epoch": 0.7249713776873171, + "ewc_loss": 0.007007409818470478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.007410022197291e-05, + "grad_norm": 3.53460431098938, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8695472478866577, + "num_tokens": 217476892.0, + "step": 5699 + }, + { + "epoch": 0.7250985879659076, + "ewc_loss": 0.0070211682468652725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.021168130449951e-05, + "grad_norm": 3.573392868041992, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8632344007492065, + "num_tokens": 217513569.0, + "step": 5700 + }, + { + "epoch": 0.7252257982444982, + "ewc_loss": 0.0070346868596971035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.034686859697104e-05, + "grad_norm": 3.622682571411133, + "learning_rate": 
1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8617880344390869, + "num_tokens": 217545585.0, + "step": 5701 + }, + { + "epoch": 0.7253530085230887, + "ewc_loss": 0.007085984572768211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.085984543664381e-05, + "grad_norm": 3.530733346939087, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8683227896690369, + "num_tokens": 217583156.0, + "step": 5702 + }, + { + "epoch": 0.7254802188016791, + "ewc_loss": 0.007001530844718218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.001531048445031e-05, + "grad_norm": 3.5368926525115967, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8667875528335571, + "num_tokens": 217619630.0, + "step": 5703 + }, + { + "epoch": 0.7256074290802697, + "ewc_loss": 0.007040663156658411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.040663331281394e-05, + "grad_norm": 3.562602996826172, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8759205341339111, + "num_tokens": 217653616.0, + "step": 5704 + }, + { + "epoch": 0.7257346393588602, + "ewc_loss": 0.007056592032313347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.056591857690364e-05, + "grad_norm": 3.5467913150787354, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8607802391052246, + "num_tokens": 217692758.0, + "step": 5705 + }, + { + "epoch": 0.7258618496374507, + "ewc_loss": 0.007036236580461264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.036236638668925e-05, + "grad_norm": 3.5186984539031982, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8512744307518005, + "num_tokens": 217734175.0, + "step": 5706 + }, + { + "epoch": 0.7259890599160412, + "ewc_loss": 0.007030347362160683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.030347478576005e-05, + "grad_norm": 3.586594581604004, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8427343368530273, + "num_tokens": 217773372.0, + "step": 5707 + }, + 
{ + "epoch": 0.7261162701946318, + "ewc_loss": 0.0070739444345235825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.07394428900443e-05, + "grad_norm": 3.546431303024292, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8753432035446167, + "num_tokens": 217809174.0, + "step": 5708 + }, + { + "epoch": 0.7262434804732223, + "ewc_loss": 0.007024733349680901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0247333496809e-05, + "grad_norm": 3.551957368850708, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.857635498046875, + "num_tokens": 217845882.0, + "step": 5709 + }, + { + "epoch": 0.7263706907518127, + "ewc_loss": 0.007052235770970583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.052235741866753e-05, + "grad_norm": 3.518038749694824, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.861260712146759, + "num_tokens": 217885798.0, + "step": 5710 + }, + { + "epoch": 0.7264979010304032, + "ewc_loss": 0.007012289948761463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.012290006969124e-05, + "grad_norm": 3.5434539318084717, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8615107536315918, + "num_tokens": 217924967.0, + "step": 5711 + }, + { + "epoch": 0.7266251113089938, + "ewc_loss": 0.007051877211779356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.051877037156373e-05, + "grad_norm": 3.514801502227783, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8760992288589478, + "num_tokens": 217967381.0, + "step": 5712 + }, + { + "epoch": 0.7267523215875843, + "ewc_loss": 0.007004431914538145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.004431972745806e-05, + "grad_norm": 3.5630030632019043, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8712085485458374, + "num_tokens": 218001097.0, + "step": 5713 + }, + { + "epoch": 0.7268795318661748, + "ewc_loss": 0.007048574276268482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.048574479995295e-05, + "grad_norm": 3.523286819458008, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8670231699943542, + "num_tokens": 218040705.0, + "step": 5714 + }, + { + "epoch": 0.7270067421447653, + "ewc_loss": 0.006983751431107521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.983751518419012e-05, + "grad_norm": 3.5701019763946533, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8655306100845337, + "num_tokens": 218077549.0, + "step": 5715 + }, + { + "epoch": 0.7271339524233558, + "ewc_loss": 0.007038178388029337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.03817859175615e-05, + "grad_norm": 3.540686845779419, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8640425801277161, + "num_tokens": 218116251.0, + "step": 5716 + }, + { + "epoch": 0.7272611627019463, + "ewc_loss": 0.00700573343783617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.005733641562983e-05, + "grad_norm": 3.6514713764190674, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8386160135269165, + "num_tokens": 218155798.0, + "step": 5717 + }, + { + "epoch": 0.7273883729805368, + "ewc_loss": 0.007073516491800547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.073516462696716e-05, + "grad_norm": 3.5531978607177734, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8716107606887817, + "num_tokens": 218191277.0, + "step": 5718 + }, + { + "epoch": 0.7275155832591274, + "ewc_loss": 0.006975833792239428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.975833821343258e-05, + "grad_norm": 3.5206174850463867, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8702412843704224, + "num_tokens": 218233165.0, + "step": 5719 + }, + { + "epoch": 0.7276427935377179, + "ewc_loss": 0.006989843677729368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.989843677729368e-05, + "grad_norm": 3.5537760257720947, + "learning_rate": 1e-06, + "loss": 0.4673, + 
"mean_token_accuracy": 0.8452519178390503, + "num_tokens": 218271263.0, + "step": 5720 + }, + { + "epoch": 0.7277700038163084, + "ewc_loss": 0.0070324488915503025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.032448775134981e-05, + "grad_norm": 3.5949294567108154, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8666009902954102, + "num_tokens": 218309081.0, + "step": 5721 + }, + { + "epoch": 0.7278972140948988, + "ewc_loss": 0.0070177847519516945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.017784810159355e-05, + "grad_norm": 3.485379934310913, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8604283332824707, + "num_tokens": 218348314.0, + "step": 5722 + }, + { + "epoch": 0.7280244243734894, + "ewc_loss": 0.006950883660465479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.950883835088462e-05, + "grad_norm": 3.539438247680664, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8692328333854675, + "num_tokens": 218387556.0, + "step": 5723 + }, + { + "epoch": 0.7281516346520799, + "ewc_loss": 0.007015853188931942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.01585304341279e-05, + "grad_norm": 3.5479161739349365, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8580198287963867, + "num_tokens": 218423130.0, + "step": 5724 + }, + { + "epoch": 0.7282788449306704, + "ewc_loss": 0.0070034051313996315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.003405335126445e-05, + "grad_norm": 3.5511715412139893, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8478755950927734, + "num_tokens": 218464079.0, + "step": 5725 + }, + { + "epoch": 0.7284060552092609, + "ewc_loss": 0.0070074163377285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.007416570559144e-05, + "grad_norm": 3.5254929065704346, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8743686676025391, + "num_tokens": 218501307.0, + "step": 5726 + }, + { + "epoch": 
0.7285332654878515, + "ewc_loss": 0.006993558723479509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.99355878168717e-05, + "grad_norm": 3.580824375152588, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8713964223861694, + "num_tokens": 218537710.0, + "step": 5727 + }, + { + "epoch": 0.7286604757664419, + "ewc_loss": 0.007029663771390915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.029663538560271e-05, + "grad_norm": 3.5589234828948975, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8606984615325928, + "num_tokens": 218576026.0, + "step": 5728 + }, + { + "epoch": 0.7287876860450324, + "ewc_loss": 0.006999331526458263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.999331526458263e-05, + "grad_norm": 3.5484390258789062, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.861042320728302, + "num_tokens": 218612491.0, + "step": 5729 + }, + { + "epoch": 0.7289148963236229, + "ewc_loss": 0.007001091726124287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.001091580605134e-05, + "grad_norm": 3.799574375152588, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8734556436538696, + "num_tokens": 218642541.0, + "step": 5730 + }, + { + "epoch": 0.7290421066022135, + "ewc_loss": 0.0071624270640313625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.162427209550515e-05, + "grad_norm": 3.530651330947876, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8529925346374512, + "num_tokens": 218678514.0, + "step": 5731 + }, + { + "epoch": 0.729169316880804, + "ewc_loss": 0.006938822567462921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.93882248015143e-05, + "grad_norm": 3.5298969745635986, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.869695782661438, + "num_tokens": 218715415.0, + "step": 5732 + }, + { + "epoch": 0.7292965271593945, + "ewc_loss": 0.007029999513179064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.029999687802047e-05, + "grad_norm": 3.5179831981658936, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8722246885299683, + "num_tokens": 218754422.0, + "step": 5733 + }, + { + "epoch": 0.7294237374379849, + "ewc_loss": 0.007013327442109585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.013327558524907e-05, + "grad_norm": 3.6415178775787354, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8561429977416992, + "num_tokens": 218787141.0, + "step": 5734 + }, + { + "epoch": 0.7295509477165755, + "ewc_loss": 0.007100320886820555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.100321090547368e-05, + "grad_norm": 3.497387647628784, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8818541169166565, + "num_tokens": 218824764.0, + "step": 5735 + }, + { + "epoch": 0.729678157995166, + "ewc_loss": 0.006983150728046894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.983150524320081e-05, + "grad_norm": 3.535688638687134, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8643014430999756, + "num_tokens": 218866142.0, + "step": 5736 + }, + { + "epoch": 0.7298053682737565, + "ewc_loss": 0.007067089434713125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.067089609336108e-05, + "grad_norm": 3.4956939220428467, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8604930639266968, + "num_tokens": 218909208.0, + "step": 5737 + }, + { + "epoch": 0.7299325785523471, + "ewc_loss": 0.0070188697427511215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.01886965543963e-05, + "grad_norm": 3.563894271850586, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8732207417488098, + "num_tokens": 218942469.0, + "step": 5738 + }, + { + "epoch": 0.7300597888309376, + "ewc_loss": 0.007075558882206678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.075558823999017e-05, + "grad_norm": 3.554255247116089, + "learning_rate": 1e-06, + "loss": 0.4325, + 
"mean_token_accuracy": 0.8592965602874756, + "num_tokens": 218982934.0, + "step": 5739 + }, + { + "epoch": 0.730186999109528, + "ewc_loss": 0.007051724009215832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.051724242046475e-05, + "grad_norm": 3.5166850090026855, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8633652925491333, + "num_tokens": 219027055.0, + "step": 5740 + }, + { + "epoch": 0.7303142093881185, + "ewc_loss": 0.007033344358205795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.033344445517287e-05, + "grad_norm": 3.621133327484131, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8642115592956543, + "num_tokens": 219069639.0, + "step": 5741 + }, + { + "epoch": 0.7304414196667091, + "ewc_loss": 0.007098390255123377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.098390051396564e-05, + "grad_norm": 3.5752508640289307, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8586499691009521, + "num_tokens": 219111774.0, + "step": 5742 + }, + { + "epoch": 0.7305686299452996, + "ewc_loss": 0.007017443422228098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.01744356774725e-05, + "grad_norm": 3.4948341846466064, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8521414995193481, + "num_tokens": 219156962.0, + "step": 5743 + }, + { + "epoch": 0.7306958402238901, + "ewc_loss": 0.007005170453339815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.005170482443646e-05, + "grad_norm": 3.542010545730591, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8741806149482727, + "num_tokens": 219193352.0, + "step": 5744 + }, + { + "epoch": 0.7308230505024806, + "ewc_loss": 0.007046943996101618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.046943937893957e-05, + "grad_norm": 3.620424747467041, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8512297868728638, + "num_tokens": 219230041.0, + "step": 5745 + }, + { + "epoch": 
0.7309502607810711, + "ewc_loss": 0.007061098702251911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.06109858583659e-05, + "grad_norm": 3.6110177040100098, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8432601690292358, + "num_tokens": 219262748.0, + "step": 5746 + }, + { + "epoch": 0.7310774710596616, + "ewc_loss": 0.007049729116261005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.049729174468666e-05, + "grad_norm": 3.525563955307007, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.876507580280304, + "num_tokens": 219296556.0, + "step": 5747 + }, + { + "epoch": 0.7312046813382521, + "ewc_loss": 0.007009490393102169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.009490218479186e-05, + "grad_norm": 3.5872256755828857, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8513885140419006, + "num_tokens": 219330801.0, + "step": 5748 + }, + { + "epoch": 0.7313318916168426, + "ewc_loss": 0.007070679683238268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.070679566822946e-05, + "grad_norm": 3.5076963901519775, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8659865856170654, + "num_tokens": 219367470.0, + "step": 5749 + }, + { + "epoch": 0.7314591018954332, + "ewc_loss": 0.007029026746749878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.02902689226903e-05, + "grad_norm": 3.5426700115203857, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.85188227891922, + "num_tokens": 219405291.0, + "step": 5750 + }, + { + "epoch": 0.7315863121740237, + "ewc_loss": 0.0070810262113809586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.081025978550315e-05, + "grad_norm": 3.531351327896118, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8670709133148193, + "num_tokens": 219443365.0, + "step": 5751 + }, + { + "epoch": 0.7317135224526141, + "ewc_loss": 0.007062251213937998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.062251097522676e-05, + "grad_norm": 3.556281805038452, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8667824268341064, + "num_tokens": 219477524.0, + "step": 5752 + }, + { + "epoch": 0.7318407327312046, + "ewc_loss": 0.007094974629580975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.094974716892466e-05, + "grad_norm": 3.5956907272338867, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8575887680053711, + "num_tokens": 219514576.0, + "step": 5753 + }, + { + "epoch": 0.7319679430097952, + "ewc_loss": 0.007102898322045803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.102898234734312e-05, + "grad_norm": 3.514462947845459, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.864978015422821, + "num_tokens": 219558253.0, + "step": 5754 + }, + { + "epoch": 0.7320951532883857, + "ewc_loss": 0.007044795900583267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.04479607520625e-05, + "grad_norm": 3.501206636428833, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8608201742172241, + "num_tokens": 219601156.0, + "step": 5755 + }, + { + "epoch": 0.7322223635669762, + "ewc_loss": 0.00707212695851922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.072126754792407e-05, + "grad_norm": 3.6088554859161377, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8544512987136841, + "num_tokens": 219638241.0, + "step": 5756 + }, + { + "epoch": 0.7323495738455668, + "ewc_loss": 0.007124500349164009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.124500552890822e-05, + "grad_norm": 3.478783369064331, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8649882674217224, + "num_tokens": 219678168.0, + "step": 5757 + }, + { + "epoch": 0.7324767841241572, + "ewc_loss": 0.007012973073869944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.012973219389096e-05, + "grad_norm": 3.5417685508728027, + "learning_rate": 1e-06, + "loss": 0.4241, + 
"mean_token_accuracy": 0.8613609075546265, + "num_tokens": 219718115.0, + "step": 5758 + }, + { + "epoch": 0.7326039944027477, + "ewc_loss": 0.007091605104506016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.091605220921338e-05, + "grad_norm": 3.5544533729553223, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.865043044090271, + "num_tokens": 219754221.0, + "step": 5759 + }, + { + "epoch": 0.7327312046813382, + "ewc_loss": 0.007067454047501087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.067454134812579e-05, + "grad_norm": 3.5826449394226074, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8695602416992188, + "num_tokens": 219786880.0, + "step": 5760 + }, + { + "epoch": 0.7328584149599288, + "ewc_loss": 0.007086513563990593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.086513505782932e-05, + "grad_norm": 3.5039875507354736, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8680285811424255, + "num_tokens": 219830585.0, + "step": 5761 + }, + { + "epoch": 0.7329856252385193, + "ewc_loss": 0.007014717906713486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.014717994024977e-05, + "grad_norm": 3.5278899669647217, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8684083819389343, + "num_tokens": 219867249.0, + "step": 5762 + }, + { + "epoch": 0.7331128355171098, + "ewc_loss": 0.0070648193359375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.064819510560483e-05, + "grad_norm": 3.5792770385742188, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8548569679260254, + "num_tokens": 219906046.0, + "step": 5763 + }, + { + "epoch": 0.7332400457957003, + "ewc_loss": 0.00705484626814723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.054846355458722e-05, + "grad_norm": 3.531883478164673, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8558504581451416, + "num_tokens": 219947466.0, + "step": 5764 + }, + { + "epoch": 
0.7333672560742908, + "ewc_loss": 0.006997625343501568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.997625314397737e-05, + "grad_norm": 3.5260963439941406, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8665642738342285, + "num_tokens": 219985698.0, + "step": 5765 + }, + { + "epoch": 0.7334944663528813, + "ewc_loss": 0.007010943256318569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.010943227214739e-05, + "grad_norm": 3.481128692626953, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8669373989105225, + "num_tokens": 220026158.0, + "step": 5766 + }, + { + "epoch": 0.7336216766314718, + "ewc_loss": 0.006975685246288776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.975685391807929e-05, + "grad_norm": 3.5036814212799072, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8617779016494751, + "num_tokens": 220071849.0, + "step": 5767 + }, + { + "epoch": 0.7337488869100623, + "ewc_loss": 0.00700010359287262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.000103505561128e-05, + "grad_norm": 3.5224990844726562, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.870708703994751, + "num_tokens": 220110255.0, + "step": 5768 + }, + { + "epoch": 0.7338760971886529, + "ewc_loss": 0.006998977158218622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.998977187322453e-05, + "grad_norm": 3.6232550144195557, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8493680357933044, + "num_tokens": 220144962.0, + "step": 5769 + }, + { + "epoch": 0.7340033074672434, + "ewc_loss": 0.007037509232759476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.037509203655645e-05, + "grad_norm": 3.5688998699188232, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8556833267211914, + "num_tokens": 220182118.0, + "step": 5770 + }, + { + "epoch": 0.7341305177458338, + "ewc_loss": 0.006974278017878532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.974278221605346e-05, + "grad_norm": 3.572692632675171, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8559296131134033, + "num_tokens": 220218954.0, + "step": 5771 + }, + { + "epoch": 0.7342577280244243, + "ewc_loss": 0.006995731964707375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.995732110226527e-05, + "grad_norm": 3.512770891189575, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8614801168441772, + "num_tokens": 220259116.0, + "step": 5772 + }, + { + "epoch": 0.7343849383030149, + "ewc_loss": 0.006962774321436882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.96277420502156e-05, + "grad_norm": 3.539123773574829, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8629266023635864, + "num_tokens": 220301742.0, + "step": 5773 + }, + { + "epoch": 0.7345121485816054, + "ewc_loss": 0.006998076569288969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.998076423769817e-05, + "grad_norm": 3.488750696182251, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8690972328186035, + "num_tokens": 220342572.0, + "step": 5774 + }, + { + "epoch": 0.7346393588601959, + "ewc_loss": 0.006958316080272198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.95831622579135e-05, + "grad_norm": 3.576059579849243, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8710170984268188, + "num_tokens": 220376373.0, + "step": 5775 + }, + { + "epoch": 0.7347665691387865, + "ewc_loss": 0.007031980901956558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.031980931060389e-05, + "grad_norm": 3.4743428230285645, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8800036907196045, + "num_tokens": 220413290.0, + "step": 5776 + }, + { + "epoch": 0.7348937794173769, + "ewc_loss": 0.006931176874786615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.931176903890446e-05, + "grad_norm": 3.516960382461548, + "learning_rate": 1e-06, + "loss": 0.4103, + 
"mean_token_accuracy": 0.8601568937301636, + "num_tokens": 220453219.0, + "step": 5777 + }, + { + "epoch": 0.7350209896959674, + "ewc_loss": 0.007002121768891811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.002121856203303e-05, + "grad_norm": 3.5858652591705322, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8649451732635498, + "num_tokens": 220488369.0, + "step": 5778 + }, + { + "epoch": 0.7351481999745579, + "ewc_loss": 0.007013531867414713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.013532012933865e-05, + "grad_norm": 3.5188333988189697, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8681496977806091, + "num_tokens": 220527798.0, + "step": 5779 + }, + { + "epoch": 0.7352754102531485, + "ewc_loss": 0.006942883133888245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.942883192095906e-05, + "grad_norm": 3.4978039264678955, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8826015591621399, + "num_tokens": 220565398.0, + "step": 5780 + }, + { + "epoch": 0.735402620531739, + "ewc_loss": 0.0069561125710606575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.956112338230014e-05, + "grad_norm": 3.5208582878112793, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8749912977218628, + "num_tokens": 220603903.0, + "step": 5781 + }, + { + "epoch": 0.7355298308103295, + "ewc_loss": 0.006971916649490595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.971916445763782e-05, + "grad_norm": 3.555774450302124, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8461567163467407, + "num_tokens": 220644168.0, + "step": 5782 + }, + { + "epoch": 0.7356570410889199, + "ewc_loss": 0.0069718677550554276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.971867696847767e-05, + "grad_norm": 3.5876095294952393, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.877143383026123, + "num_tokens": 220679709.0, + "step": 5783 + }, + { + "epoch": 
0.7357842513675105, + "ewc_loss": 0.006975299678742886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.975299766054377e-05, + "grad_norm": 3.5227575302124023, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8573644161224365, + "num_tokens": 220721184.0, + "step": 5784 + }, + { + "epoch": 0.735911461646101, + "ewc_loss": 0.006948217749595642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.948217924218625e-05, + "grad_norm": 3.562622547149658, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8704462647438049, + "num_tokens": 220755027.0, + "step": 5785 + }, + { + "epoch": 0.7360386719246915, + "ewc_loss": 0.0069718859158456326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.971885886741802e-05, + "grad_norm": 3.601929187774658, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8415880799293518, + "num_tokens": 220793597.0, + "step": 5786 + }, + { + "epoch": 0.736165882203282, + "ewc_loss": 0.006998727098107338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.998726894380525e-05, + "grad_norm": 3.5436465740203857, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8640027046203613, + "num_tokens": 220833549.0, + "step": 5787 + }, + { + "epoch": 0.7362930924818726, + "ewc_loss": 0.00694111967459321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.941119499970227e-05, + "grad_norm": 3.5673329830169678, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8708611726760864, + "num_tokens": 220865551.0, + "step": 5788 + }, + { + "epoch": 0.736420302760463, + "ewc_loss": 0.00698778685182333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.987786764511839e-05, + "grad_norm": 3.4732284545898438, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8762716054916382, + "num_tokens": 220911626.0, + "step": 5789 + }, + { + "epoch": 0.7365475130390535, + "ewc_loss": 0.006923855748027563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.923855835339054e-05, + "grad_norm": 3.59543514251709, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8660714030265808, + "num_tokens": 220949839.0, + "step": 5790 + }, + { + "epoch": 0.736674723317644, + "ewc_loss": 0.007027042098343372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.027042011031881e-05, + "grad_norm": 3.595139503479004, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8494524955749512, + "num_tokens": 220984069.0, + "step": 5791 + }, + { + "epoch": 0.7368019335962346, + "ewc_loss": 0.006979548372328281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.979548197705299e-05, + "grad_norm": 3.4896557331085205, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8841222524642944, + "num_tokens": 221020999.0, + "step": 5792 + }, + { + "epoch": 0.7369291438748251, + "ewc_loss": 0.006931541953235865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.931542156962678e-05, + "grad_norm": 3.634438991546631, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8559845685958862, + "num_tokens": 221051082.0, + "step": 5793 + }, + { + "epoch": 0.7370563541534156, + "ewc_loss": 0.00706680491566658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.066805119393393e-05, + "grad_norm": 3.5349371433258057, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8473648428916931, + "num_tokens": 221095444.0, + "step": 5794 + }, + { + "epoch": 0.737183564432006, + "ewc_loss": 0.006953679956495762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.953679985599592e-05, + "grad_norm": 3.5457818508148193, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8665741682052612, + "num_tokens": 221136433.0, + "step": 5795 + }, + { + "epoch": 0.7373107747105966, + "ewc_loss": 0.007015769369900227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.015769369900227e-05, + "grad_norm": 3.6258251667022705, + "learning_rate": 1e-06, + "loss": 0.4287, + 
"mean_token_accuracy": 0.8574395179748535, + "num_tokens": 221171278.0, + "step": 5796 + }, + { + "epoch": 0.7374379849891871, + "ewc_loss": 0.007050549145787954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.050549174891785e-05, + "grad_norm": 3.5638604164123535, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8643758296966553, + "num_tokens": 221209380.0, + "step": 5797 + }, + { + "epoch": 0.7375651952677776, + "ewc_loss": 0.006987858098000288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.987858068896458e-05, + "grad_norm": 3.5015718936920166, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8618881702423096, + "num_tokens": 221253076.0, + "step": 5798 + }, + { + "epoch": 0.7376924055463682, + "ewc_loss": 0.006971351802349091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.971351831452921e-05, + "grad_norm": 3.5528950691223145, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8620662689208984, + "num_tokens": 221290348.0, + "step": 5799 + }, + { + "epoch": 0.7378196158249587, + "ewc_loss": 0.007028695661574602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.028695836197585e-05, + "grad_norm": 3.5155158042907715, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8710528612136841, + "num_tokens": 221329635.0, + "step": 5800 + }, + { + "epoch": 0.7379468261035491, + "ewc_loss": 0.006979728117585182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.979727913858369e-05, + "grad_norm": 3.4999032020568848, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8709899187088013, + "num_tokens": 221368521.0, + "step": 5801 + }, + { + "epoch": 0.7380740363821396, + "ewc_loss": 0.006999887526035309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.999887409619987e-05, + "grad_norm": 3.506917715072632, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8749459981918335, + "num_tokens": 221407090.0, + "step": 5802 + }, + { + "epoch": 
0.7382012466607302, + "ewc_loss": 0.007014244329184294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.014244329184294e-05, + "grad_norm": 3.4777469635009766, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8716472387313843, + "num_tokens": 221450340.0, + "step": 5803 + }, + { + "epoch": 0.7383284569393207, + "ewc_loss": 0.006976849399507046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.976849545026198e-05, + "grad_norm": 3.5433573722839355, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8645482063293457, + "num_tokens": 221491996.0, + "step": 5804 + }, + { + "epoch": 0.7384556672179112, + "ewc_loss": 0.007027509622275829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.027509855106473e-05, + "grad_norm": 3.520960807800293, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8734646439552307, + "num_tokens": 221531466.0, + "step": 5805 + }, + { + "epoch": 0.7385828774965018, + "ewc_loss": 0.006989661138504744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.989661051193252e-05, + "grad_norm": 3.6319994926452637, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.846621036529541, + "num_tokens": 221567003.0, + "step": 5806 + }, + { + "epoch": 0.7387100877750922, + "ewc_loss": 0.007045949809253216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.04595004208386e-05, + "grad_norm": 3.49029278755188, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8776434063911438, + "num_tokens": 221607396.0, + "step": 5807 + }, + { + "epoch": 0.7388372980536827, + "ewc_loss": 0.0069411443546414375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.941144238226116e-05, + "grad_norm": 3.5684380531311035, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8590081930160522, + "num_tokens": 221644776.0, + "step": 5808 + }, + { + "epoch": 0.7389645083322732, + "ewc_loss": 0.007026617880910635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.026617822702974e-05, + "grad_norm": 3.6284992694854736, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8755666017532349, + "num_tokens": 221685643.0, + "step": 5809 + }, + { + "epoch": 0.7390917186108638, + "ewc_loss": 0.007028334774076939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.028334948699921e-05, + "grad_norm": 3.556163787841797, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8563549518585205, + "num_tokens": 221724673.0, + "step": 5810 + }, + { + "epoch": 0.7392189288894543, + "ewc_loss": 0.006949095521122217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.949095404706895e-05, + "grad_norm": 3.550513982772827, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8403970003128052, + "num_tokens": 221766287.0, + "step": 5811 + }, + { + "epoch": 0.7393461391680448, + "ewc_loss": 0.0069905431009829044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.990542897256091e-05, + "grad_norm": 3.506289482116699, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.863532304763794, + "num_tokens": 221805357.0, + "step": 5812 + }, + { + "epoch": 0.7394733494466353, + "ewc_loss": 0.0069431341253221035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.943134212633595e-05, + "grad_norm": 3.6129043102264404, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.862770676612854, + "num_tokens": 221838513.0, + "step": 5813 + }, + { + "epoch": 0.7396005597252258, + "ewc_loss": 0.007024568971246481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.02456891303882e-05, + "grad_norm": 3.493645668029785, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8539502024650574, + "num_tokens": 221882808.0, + "step": 5814 + }, + { + "epoch": 0.7397277700038163, + "ewc_loss": 0.006928489077836275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.928489165147766e-05, + "grad_norm": 3.555265188217163, + "learning_rate": 1e-06, + "loss": 0.3558, + 
"mean_token_accuracy": 0.8786094784736633, + "num_tokens": 221918505.0, + "step": 5815 + }, + { + "epoch": 0.7398549802824068, + "ewc_loss": 0.007007850334048271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.00785021763295e-05, + "grad_norm": 3.53059983253479, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8664725422859192, + "num_tokens": 221955983.0, + "step": 5816 + }, + { + "epoch": 0.7399821905609973, + "ewc_loss": 0.006976791191846132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.976791337365285e-05, + "grad_norm": 3.5020291805267334, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8628935813903809, + "num_tokens": 221997363.0, + "step": 5817 + }, + { + "epoch": 0.7401094008395879, + "ewc_loss": 0.00696956692263484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.969567039050162e-05, + "grad_norm": 3.5632686614990234, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8505761623382568, + "num_tokens": 222039302.0, + "step": 5818 + }, + { + "epoch": 0.7402366111181784, + "ewc_loss": 0.007017065305262804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.017065217951313e-05, + "grad_norm": 3.5391993522644043, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8649479150772095, + "num_tokens": 222075779.0, + "step": 5819 + }, + { + "epoch": 0.7403638213967688, + "ewc_loss": 0.006990191992372274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.990192196099088e-05, + "grad_norm": 3.4740335941314697, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8830198049545288, + "num_tokens": 222115120.0, + "step": 5820 + }, + { + "epoch": 0.7404910316753593, + "ewc_loss": 0.006958950776606798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.958950689295307e-05, + "grad_norm": 3.549177646636963, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8561828136444092, + "num_tokens": 222156570.0, + "step": 5821 + }, + { + "epoch": 
0.7406182419539499, + "ewc_loss": 0.007011281326413155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.011281559243798e-05, + "grad_norm": 3.626126766204834, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8640249967575073, + "num_tokens": 222189487.0, + "step": 5822 + }, + { + "epoch": 0.7407454522325404, + "ewc_loss": 0.007025453727692366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.025453669484705e-05, + "grad_norm": 3.606571912765503, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8629486560821533, + "num_tokens": 222223163.0, + "step": 5823 + }, + { + "epoch": 0.7408726625111309, + "ewc_loss": 0.006990129128098488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.990128895267844e-05, + "grad_norm": 3.4955077171325684, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8775242567062378, + "num_tokens": 222259042.0, + "step": 5824 + }, + { + "epoch": 0.7409998727897215, + "ewc_loss": 0.006936388090252876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.936387944733724e-05, + "grad_norm": 3.586322069168091, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8453198671340942, + "num_tokens": 222300497.0, + "step": 5825 + }, + { + "epoch": 0.7411270830683119, + "ewc_loss": 0.007032199762761593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.032199937384576e-05, + "grad_norm": 3.605736017227173, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8616683483123779, + "num_tokens": 222333630.0, + "step": 5826 + }, + { + "epoch": 0.7412542933469024, + "ewc_loss": 0.0070213559083640575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.021355850156397e-05, + "grad_norm": 3.5788426399230957, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8596442937850952, + "num_tokens": 222371715.0, + "step": 5827 + }, + { + "epoch": 0.7413815036254929, + "ewc_loss": 0.00700095621868968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.00095624779351e-05, + "grad_norm": 3.530322790145874, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.864562451839447, + "num_tokens": 222412044.0, + "step": 5828 + }, + { + "epoch": 0.7415087139040835, + "ewc_loss": 0.0069852788001298904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.98527874192223e-05, + "grad_norm": 3.5019640922546387, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8668488264083862, + "num_tokens": 222452668.0, + "step": 5829 + }, + { + "epoch": 0.741635924182674, + "ewc_loss": 0.006998542696237564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.998542812652886e-05, + "grad_norm": 3.594480037689209, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8656883239746094, + "num_tokens": 222485631.0, + "step": 5830 + }, + { + "epoch": 0.7417631344612645, + "ewc_loss": 0.007074111606925726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.074111636029556e-05, + "grad_norm": 3.5281600952148438, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8793588876724243, + "num_tokens": 222522967.0, + "step": 5831 + }, + { + "epoch": 0.7418903447398549, + "ewc_loss": 0.006993817165493965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.993817078182474e-05, + "grad_norm": 3.542109251022339, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8538603782653809, + "num_tokens": 222563258.0, + "step": 5832 + }, + { + "epoch": 0.7420175550184455, + "ewc_loss": 0.007043317425996065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.043317600619048e-05, + "grad_norm": 3.5881855487823486, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8572322726249695, + "num_tokens": 222599248.0, + "step": 5833 + }, + { + "epoch": 0.742144765297036, + "ewc_loss": 0.00706449942663312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.064499368425459e-05, + "grad_norm": 3.501936197280884, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 
0.8715506196022034, + "num_tokens": 222642575.0, + "step": 5834 + }, + { + "epoch": 0.7422719755756265, + "ewc_loss": 0.006993371061980724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.993371061980724e-05, + "grad_norm": 3.6599459648132324, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8425414562225342, + "num_tokens": 222687330.0, + "step": 5835 + }, + { + "epoch": 0.742399185854217, + "ewc_loss": 0.00713721476495266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.137214561225846e-05, + "grad_norm": 3.64544939994812, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8383017778396606, + "num_tokens": 222720020.0, + "step": 5836 + }, + { + "epoch": 0.7425263961328076, + "ewc_loss": 0.007056680042296648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.056679896777496e-05, + "grad_norm": 3.4868502616882324, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8775035738945007, + "num_tokens": 222758634.0, + "step": 5837 + }, + { + "epoch": 0.742653606411398, + "ewc_loss": 0.006991248577833176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.991248665144667e-05, + "grad_norm": 3.5852179527282715, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8712607622146606, + "num_tokens": 222794823.0, + "step": 5838 + }, + { + "epoch": 0.7427808166899885, + "ewc_loss": 0.007106605917215347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.1066060627345e-05, + "grad_norm": 3.5512819290161133, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.863641619682312, + "num_tokens": 222833836.0, + "step": 5839 + }, + { + "epoch": 0.742908026968579, + "ewc_loss": 0.007041321601718664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.041321805445477e-05, + "grad_norm": 3.5270543098449707, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8657375574111938, + "num_tokens": 222875712.0, + "step": 5840 + }, + { + "epoch": 0.7430352372471696, + "ewc_loss": 
0.00705182459205389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.051824650261551e-05, + "grad_norm": 3.5548503398895264, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8532459735870361, + "num_tokens": 222915494.0, + "step": 5841 + }, + { + "epoch": 0.7431624475257601, + "ewc_loss": 0.007085254881531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.085254765115678e-05, + "grad_norm": 3.538346767425537, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8821512460708618, + "num_tokens": 222951675.0, + "step": 5842 + }, + { + "epoch": 0.7432896578043506, + "ewc_loss": 0.007056361064314842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.056361209833995e-05, + "grad_norm": 3.536674976348877, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8591367602348328, + "num_tokens": 222993245.0, + "step": 5843 + }, + { + "epoch": 0.743416868082941, + "ewc_loss": 0.007056263741105795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.056263712001964e-05, + "grad_norm": 3.5403456687927246, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8716269731521606, + "num_tokens": 223029876.0, + "step": 5844 + }, + { + "epoch": 0.7435440783615316, + "ewc_loss": 0.007067049387842417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.06704959156923e-05, + "grad_norm": 3.5156383514404297, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8698316812515259, + "num_tokens": 223072550.0, + "step": 5845 + }, + { + "epoch": 0.7436712886401221, + "ewc_loss": 0.007046857848763466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.04685808159411e-05, + "grad_norm": 3.533602237701416, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8642208576202393, + "num_tokens": 223112892.0, + "step": 5846 + }, + { + "epoch": 0.7437984989187126, + "ewc_loss": 0.007065323181450367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.065323006827384e-05, + "grad_norm": 3.516710042953491, 
+ "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8701781034469604, + "num_tokens": 223157093.0, + "step": 5847 + }, + { + "epoch": 0.7439257091973032, + "ewc_loss": 0.007047196384519339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.047196413623169e-05, + "grad_norm": 3.617147445678711, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8569794297218323, + "num_tokens": 223190756.0, + "step": 5848 + }, + { + "epoch": 0.7440529194758937, + "ewc_loss": 0.007096810266375542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.096810440998524e-05, + "grad_norm": 3.6053714752197266, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8566277623176575, + "num_tokens": 223226013.0, + "step": 5849 + }, + { + "epoch": 0.7441801297544841, + "ewc_loss": 0.0070660971105098724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.066097168717533e-05, + "grad_norm": 3.5411717891693115, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8778939247131348, + "num_tokens": 223262858.0, + "step": 5850 + }, + { + "epoch": 0.7443073400330746, + "ewc_loss": 0.0070226239040493965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.022624049568549e-05, + "grad_norm": 3.485333204269409, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8688756227493286, + "num_tokens": 223310528.0, + "step": 5851 + }, + { + "epoch": 0.7444345503116652, + "ewc_loss": 0.007003679871559143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0036796387285e-05, + "grad_norm": 3.5696678161621094, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8679851293563843, + "num_tokens": 223345021.0, + "step": 5852 + }, + { + "epoch": 0.7445617605902557, + "ewc_loss": 0.0070699891075491905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.06998907844536e-05, + "grad_norm": 3.5292670726776123, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8750036358833313, + "num_tokens": 223382055.0, + 
"step": 5853 + }, + { + "epoch": 0.7446889708688462, + "ewc_loss": 0.006994760595262051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.994760769885033e-05, + "grad_norm": 3.632596492767334, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8549544811248779, + "num_tokens": 223418441.0, + "step": 5854 + }, + { + "epoch": 0.7448161811474368, + "ewc_loss": 0.007052809931337833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.052809814922512e-05, + "grad_norm": 3.569871664047241, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8624147772789001, + "num_tokens": 223458150.0, + "step": 5855 + }, + { + "epoch": 0.7449433914260272, + "ewc_loss": 0.006990266498178244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.990266410866752e-05, + "grad_norm": 3.518178701400757, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8604772090911865, + "num_tokens": 223496916.0, + "step": 5856 + }, + { + "epoch": 0.7450706017046177, + "ewc_loss": 0.0069611030630767345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.961102917557582e-05, + "grad_norm": 3.5559401512145996, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8706866502761841, + "num_tokens": 223534808.0, + "step": 5857 + }, + { + "epoch": 0.7451978119832082, + "ewc_loss": 0.007009071297943592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.00907112332061e-05, + "grad_norm": 3.550830364227295, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8744185566902161, + "num_tokens": 223571255.0, + "step": 5858 + }, + { + "epoch": 0.7453250222617988, + "ewc_loss": 0.0069923726841807365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.992372800596058e-05, + "grad_norm": 3.5346105098724365, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8674961924552917, + "num_tokens": 223616608.0, + "step": 5859 + }, + { + "epoch": 0.7454522325403893, + "ewc_loss": 0.006975899916142225, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 6.975900032557547e-05, + "grad_norm": 3.5161991119384766, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8640428781509399, + "num_tokens": 223656188.0, + "step": 5860 + }, + { + "epoch": 0.7455794428189798, + "ewc_loss": 0.006973510608077049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.973510608077049e-05, + "grad_norm": 3.607863426208496, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8602427244186401, + "num_tokens": 223693787.0, + "step": 5861 + }, + { + "epoch": 0.7457066530975703, + "ewc_loss": 0.0070232972502708435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.023297075647861e-05, + "grad_norm": 3.509312868118286, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8761249780654907, + "num_tokens": 223729350.0, + "step": 5862 + }, + { + "epoch": 0.7458338633761608, + "ewc_loss": 0.006930170115083456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.930169911356643e-05, + "grad_norm": 3.5637850761413574, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8520952463150024, + "num_tokens": 223770846.0, + "step": 5863 + }, + { + "epoch": 0.7459610736547513, + "ewc_loss": 0.006987800821661949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.987800588831306e-05, + "grad_norm": 3.5165677070617676, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8644903898239136, + "num_tokens": 223814194.0, + "step": 5864 + }, + { + "epoch": 0.7460882839333418, + "ewc_loss": 0.00694948248565197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.94948248565197e-05, + "grad_norm": 3.647784471511841, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8582136631011963, + "num_tokens": 223849141.0, + "step": 5865 + }, + { + "epoch": 0.7462154942119323, + "ewc_loss": 0.007029158528894186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.029158587101847e-05, + "grad_norm": 3.4904987812042236, + "learning_rate": 1e-06, + "loss": 
0.3466, + "mean_token_accuracy": 0.8852592706680298, + "num_tokens": 223888654.0, + "step": 5866 + }, + { + "epoch": 0.7463427044905229, + "ewc_loss": 0.006903461180627346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.903461326146498e-05, + "grad_norm": 3.551459312438965, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8688876628875732, + "num_tokens": 223924509.0, + "step": 5867 + }, + { + "epoch": 0.7464699147691134, + "ewc_loss": 0.006987159606069326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.987159576965496e-05, + "grad_norm": 3.551685333251953, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8502702713012695, + "num_tokens": 223966191.0, + "step": 5868 + }, + { + "epoch": 0.7465971250477038, + "ewc_loss": 0.006965114735066891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.965114880586043e-05, + "grad_norm": 3.548612117767334, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8663665056228638, + "num_tokens": 224001415.0, + "step": 5869 + }, + { + "epoch": 0.7467243353262943, + "ewc_loss": 0.006960849743336439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.960849714232609e-05, + "grad_norm": 3.465129852294922, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8709191679954529, + "num_tokens": 224042820.0, + "step": 5870 + }, + { + "epoch": 0.7468515456048849, + "ewc_loss": 0.006933584343641996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.933584518264979e-05, + "grad_norm": 3.590703010559082, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8696987628936768, + "num_tokens": 224078411.0, + "step": 5871 + }, + { + "epoch": 0.7469787558834754, + "ewc_loss": 0.007034443784505129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.03444384271279e-05, + "grad_norm": 3.534118890762329, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8716961145401001, + "num_tokens": 224117136.0, + "step": 5872 + }, + { + "epoch": 
0.7471059661620659, + "ewc_loss": 0.006938663311302662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.938663136679679e-05, + "grad_norm": 3.5423941612243652, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8582807779312134, + "num_tokens": 224155879.0, + "step": 5873 + }, + { + "epoch": 0.7472331764406565, + "ewc_loss": 0.006977305747568607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.977305747568607e-05, + "grad_norm": 3.5117502212524414, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8750991225242615, + "num_tokens": 224195186.0, + "step": 5874 + }, + { + "epoch": 0.7473603867192469, + "ewc_loss": 0.0069525521248579025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.952552212169394e-05, + "grad_norm": 3.5114800930023193, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8618735671043396, + "num_tokens": 224235722.0, + "step": 5875 + }, + { + "epoch": 0.7474875969978374, + "ewc_loss": 0.006954158190637827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.954158016014844e-05, + "grad_norm": 3.5886847972869873, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8719114065170288, + "num_tokens": 224268461.0, + "step": 5876 + }, + { + "epoch": 0.7476148072764279, + "ewc_loss": 0.00701219541952014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.01219541952014e-05, + "grad_norm": 3.598163604736328, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.877851665019989, + "num_tokens": 224302921.0, + "step": 5877 + }, + { + "epoch": 0.7477420175550185, + "ewc_loss": 0.006980727426707745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.980727630434558e-05, + "grad_norm": 3.4874579906463623, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8659400939941406, + "num_tokens": 224344081.0, + "step": 5878 + }, + { + "epoch": 0.747869227833609, + "ewc_loss": 0.006936732213944197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.936732097528875e-05, + "grad_norm": 3.5788028240203857, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8533653020858765, + "num_tokens": 224384464.0, + "step": 5879 + }, + { + "epoch": 0.7479964381121995, + "ewc_loss": 0.00702187605202198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.021876081125811e-05, + "grad_norm": 3.516975164413452, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8731377124786377, + "num_tokens": 224425542.0, + "step": 5880 + }, + { + "epoch": 0.7481236483907899, + "ewc_loss": 0.006948486901819706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.94848713465035e-05, + "grad_norm": 3.4979453086853027, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8823027610778809, + "num_tokens": 224465342.0, + "step": 5881 + }, + { + "epoch": 0.7482508586693805, + "ewc_loss": 0.006953589152544737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.953589036129415e-05, + "grad_norm": 3.5241637229919434, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8700575828552246, + "num_tokens": 224504787.0, + "step": 5882 + }, + { + "epoch": 0.748378068947971, + "ewc_loss": 0.006987622939050198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.98762305546552e-05, + "grad_norm": 3.5705885887145996, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8652730584144592, + "num_tokens": 224548534.0, + "step": 5883 + }, + { + "epoch": 0.7485052792265615, + "ewc_loss": 0.006992497947067022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.992497947067022e-05, + "grad_norm": 3.549478769302368, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8560663461685181, + "num_tokens": 224590251.0, + "step": 5884 + }, + { + "epoch": 0.748632489505152, + "ewc_loss": 0.0069633484818041325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.963348278077319e-05, + "grad_norm": 3.4848716259002686, + "learning_rate": 1e-06, + "loss": 0.3718, + 
"mean_token_accuracy": 0.8751984238624573, + "num_tokens": 224633745.0, + "step": 5885 + }, + { + "epoch": 0.7487596997837426, + "ewc_loss": 0.006923168431967497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.923168257344514e-05, + "grad_norm": 3.5495548248291016, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8500945568084717, + "num_tokens": 224671319.0, + "step": 5886 + }, + { + "epoch": 0.748886910062333, + "ewc_loss": 0.006983183324337006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.983183266129345e-05, + "grad_norm": 3.5167720317840576, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8653384447097778, + "num_tokens": 224712670.0, + "step": 5887 + }, + { + "epoch": 0.7490141203409235, + "ewc_loss": 0.006929138209670782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.929138180566952e-05, + "grad_norm": 3.54207706451416, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8633498549461365, + "num_tokens": 224747840.0, + "step": 5888 + }, + { + "epoch": 0.749141330619514, + "ewc_loss": 0.006967558991163969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.967558874748647e-05, + "grad_norm": 3.550325393676758, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.874570369720459, + "num_tokens": 224783274.0, + "step": 5889 + }, + { + "epoch": 0.7492685408981046, + "ewc_loss": 0.006957472302019596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.957472214708105e-05, + "grad_norm": 3.550666093826294, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8606300354003906, + "num_tokens": 224820456.0, + "step": 5890 + }, + { + "epoch": 0.7493957511766951, + "ewc_loss": 0.006956044118851423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.95604394422844e-05, + "grad_norm": 3.513993263244629, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8691097497940063, + "num_tokens": 224858569.0, + "step": 5891 + }, + { + "epoch": 
0.7495229614552856, + "ewc_loss": 0.006952809635549784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.952809781068936e-05, + "grad_norm": 3.5317249298095703, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8617808818817139, + "num_tokens": 224901033.0, + "step": 5892 + }, + { + "epoch": 0.749650171733876, + "ewc_loss": 0.0069542513228952885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.954251148272306e-05, + "grad_norm": 3.562592029571533, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8802320957183838, + "num_tokens": 224934345.0, + "step": 5893 + }, + { + "epoch": 0.7497773820124666, + "ewc_loss": 0.00697721540927887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.977215525694191e-05, + "grad_norm": 3.555388927459717, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8581078052520752, + "num_tokens": 224972479.0, + "step": 5894 + }, + { + "epoch": 0.7499045922910571, + "ewc_loss": 0.0069571929052472115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.95719281793572e-05, + "grad_norm": 3.538724899291992, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8818472623825073, + "num_tokens": 225014213.0, + "step": 5895 + }, + { + "epoch": 0.7500318025696476, + "ewc_loss": 0.006944446824491024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.944446795387194e-05, + "grad_norm": 3.5522820949554443, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.871556282043457, + "num_tokens": 225049929.0, + "step": 5896 + }, + { + "epoch": 0.7501590128482382, + "ewc_loss": 0.006964832544326782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.964832573430613e-05, + "grad_norm": 3.569232225418091, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8490821123123169, + "num_tokens": 225086409.0, + "step": 5897 + }, + { + "epoch": 0.7502862231268287, + "ewc_loss": 0.006966813933104277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.966813816688955e-05, + "grad_norm": 3.5595219135284424, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.848068118095398, + "num_tokens": 225124443.0, + "step": 5898 + }, + { + "epoch": 0.7504134334054191, + "ewc_loss": 0.006964083760976791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.964083877392113e-05, + "grad_norm": 3.524836778640747, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8553547263145447, + "num_tokens": 225166665.0, + "step": 5899 + }, + { + "epoch": 0.7505406436840096, + "ewc_loss": 0.006955748423933983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.955748540349305e-05, + "grad_norm": 3.5513503551483154, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8410441875457764, + "num_tokens": 225203841.0, + "step": 5900 + }, + { + "epoch": 0.7506678539626002, + "ewc_loss": 0.006985656917095184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.985657091718167e-05, + "grad_norm": 3.654496431350708, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8498573303222656, + "num_tokens": 225234844.0, + "step": 5901 + }, + { + "epoch": 0.7507950642411907, + "ewc_loss": 0.007050437852740288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.050437852740288e-05, + "grad_norm": 3.603588342666626, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.848061740398407, + "num_tokens": 225269828.0, + "step": 5902 + }, + { + "epoch": 0.7509222745197812, + "ewc_loss": 0.007003657054156065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.003657083259895e-05, + "grad_norm": 3.475247621536255, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8694840669631958, + "num_tokens": 225312361.0, + "step": 5903 + }, + { + "epoch": 0.7510494847983717, + "ewc_loss": 0.006959572900086641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.95957278367132e-05, + "grad_norm": 3.538100004196167, + "learning_rate": 1e-06, + "loss": 0.4496, + 
"mean_token_accuracy": 0.8538827300071716, + "num_tokens": 225353708.0, + "step": 5904 + }, + { + "epoch": 0.7511766950769622, + "ewc_loss": 0.007046147249639034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.046147220535204e-05, + "grad_norm": 3.437669038772583, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8839846849441528, + "num_tokens": 225399730.0, + "step": 5905 + }, + { + "epoch": 0.7513039053555527, + "ewc_loss": 0.006963470485061407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.963470514165238e-05, + "grad_norm": 3.553785562515259, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8584546446800232, + "num_tokens": 225441836.0, + "step": 5906 + }, + { + "epoch": 0.7514311156341432, + "ewc_loss": 0.007069926708936691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.069926505209878e-05, + "grad_norm": 3.6151371002197266, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8719710111618042, + "num_tokens": 225471004.0, + "step": 5907 + }, + { + "epoch": 0.7515583259127337, + "ewc_loss": 0.007074754685163498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.07475483068265e-05, + "grad_norm": 3.6212399005889893, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8479456901550293, + "num_tokens": 225505789.0, + "step": 5908 + }, + { + "epoch": 0.7516855361913243, + "ewc_loss": 0.007052343338727951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.052343426039442e-05, + "grad_norm": 3.5528361797332764, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8677224516868591, + "num_tokens": 225541138.0, + "step": 5909 + }, + { + "epoch": 0.7518127464699148, + "ewc_loss": 0.007028436753898859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.02843681210652e-05, + "grad_norm": 3.5044667720794678, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8685793876647949, + "num_tokens": 225581695.0, + "step": 5910 + }, + { + "epoch": 
0.7519399567485053, + "ewc_loss": 0.0070125809870660305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.012581045273691e-05, + "grad_norm": 3.5518267154693604, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8701052665710449, + "num_tokens": 225619532.0, + "step": 5911 + }, + { + "epoch": 0.7520671670270958, + "ewc_loss": 0.007083908189088106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.083907985361293e-05, + "grad_norm": 3.560983896255493, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8577883839607239, + "num_tokens": 225657403.0, + "step": 5912 + }, + { + "epoch": 0.7521943773056863, + "ewc_loss": 0.007061739452183247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.061739597702399e-05, + "grad_norm": 3.5961129665374756, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8596738576889038, + "num_tokens": 225693402.0, + "step": 5913 + }, + { + "epoch": 0.7523215875842768, + "ewc_loss": 0.007069048937410116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.069049024721608e-05, + "grad_norm": 3.592247247695923, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8627098798751831, + "num_tokens": 225727208.0, + "step": 5914 + }, + { + "epoch": 0.7524487978628673, + "ewc_loss": 0.007066783495247364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.066783291520551e-05, + "grad_norm": 3.546996831893921, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8693246841430664, + "num_tokens": 225762117.0, + "step": 5915 + }, + { + "epoch": 0.7525760081414579, + "ewc_loss": 0.007045605685561895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.045605889288709e-05, + "grad_norm": 3.594846248626709, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8518989086151123, + "num_tokens": 225794737.0, + "step": 5916 + }, + { + "epoch": 0.7527032184200484, + "ewc_loss": 0.00709482841193676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.094828470144421e-05, + "grad_norm": 3.565885543823242, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8639166355133057, + "num_tokens": 225829074.0, + "step": 5917 + }, + { + "epoch": 0.7528304286986388, + "ewc_loss": 0.007063194643706083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.063194789225236e-05, + "grad_norm": 3.499849319458008, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8790937066078186, + "num_tokens": 225871344.0, + "step": 5918 + }, + { + "epoch": 0.7529576389772293, + "ewc_loss": 0.0070383888669312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0383888669312e-05, + "grad_norm": 3.5289177894592285, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8603498935699463, + "num_tokens": 225911321.0, + "step": 5919 + }, + { + "epoch": 0.7530848492558199, + "ewc_loss": 0.007066292222589254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.066292164381593e-05, + "grad_norm": 3.5542187690734863, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8720188140869141, + "num_tokens": 225944613.0, + "step": 5920 + }, + { + "epoch": 0.7532120595344104, + "ewc_loss": 0.007066415622830391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.066415855661035e-05, + "grad_norm": 3.506246328353882, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8688267469406128, + "num_tokens": 225989501.0, + "step": 5921 + }, + { + "epoch": 0.7533392698130009, + "ewc_loss": 0.007030021399259567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.030021515674889e-05, + "grad_norm": 3.5414111614227295, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8666572570800781, + "num_tokens": 226024984.0, + "step": 5922 + }, + { + "epoch": 0.7534664800915915, + "ewc_loss": 0.007073557935655117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.073557935655117e-05, + "grad_norm": 3.517759084701538, + "learning_rate": 1e-06, + "loss": 0.3623, + 
"mean_token_accuracy": 0.8767156600952148, + "num_tokens": 226063255.0, + "step": 5923 + }, + { + "epoch": 0.7535936903701819, + "ewc_loss": 0.007042865734547377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.042865763651207e-05, + "grad_norm": 3.522702217102051, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8518173694610596, + "num_tokens": 226105538.0, + "step": 5924 + }, + { + "epoch": 0.7537209006487724, + "ewc_loss": 0.007047838065773249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.04783815308474e-05, + "grad_norm": 3.6148388385772705, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8677065372467041, + "num_tokens": 226137449.0, + "step": 5925 + }, + { + "epoch": 0.7538481109273629, + "ewc_loss": 0.007105092518031597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.10509266355075e-05, + "grad_norm": 3.646141290664673, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8582252860069275, + "num_tokens": 226171078.0, + "step": 5926 + }, + { + "epoch": 0.7539753212059535, + "ewc_loss": 0.007100588176399469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.100588118191808e-05, + "grad_norm": 3.5307607650756836, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8660532236099243, + "num_tokens": 226208060.0, + "step": 5927 + }, + { + "epoch": 0.754102531484544, + "ewc_loss": 0.00702696992084384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0269699790515e-05, + "grad_norm": 3.512683153152466, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8753505945205688, + "num_tokens": 226245330.0, + "step": 5928 + }, + { + "epoch": 0.7542297417631345, + "ewc_loss": 0.007060008589178324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.060008647385985e-05, + "grad_norm": 3.520617723464966, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.886650800704956, + "num_tokens": 226281974.0, + "step": 5929 + }, + { + "epoch": 0.7543569520417249, 
+ "ewc_loss": 0.007061257027089596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.061257201712579e-05, + "grad_norm": 3.5322256088256836, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.854250431060791, + "num_tokens": 226323302.0, + "step": 5930 + }, + { + "epoch": 0.7544841623203155, + "ewc_loss": 0.007075652480125427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.07565268385224e-05, + "grad_norm": 3.5611627101898193, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8520069122314453, + "num_tokens": 226361346.0, + "step": 5931 + }, + { + "epoch": 0.754611372598906, + "ewc_loss": 0.007088099140673876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.088098936947063e-05, + "grad_norm": 3.5965752601623535, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8611575961112976, + "num_tokens": 226397744.0, + "step": 5932 + }, + { + "epoch": 0.7547385828774965, + "ewc_loss": 0.007114366628229618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.114366599125788e-05, + "grad_norm": 3.5453851222991943, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8715250492095947, + "num_tokens": 226436745.0, + "step": 5933 + }, + { + "epoch": 0.754865793156087, + "ewc_loss": 0.007063018623739481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.063018711050972e-05, + "grad_norm": 3.5483603477478027, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8723610043525696, + "num_tokens": 226470302.0, + "step": 5934 + }, + { + "epoch": 0.7549930034346776, + "ewc_loss": 0.007083585485816002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.083585660438985e-05, + "grad_norm": 3.638463020324707, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8674668669700623, + "num_tokens": 226500090.0, + "step": 5935 + }, + { + "epoch": 0.755120213713268, + "ewc_loss": 0.007131347432732582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.131347229005769e-05, + "grad_norm": 
3.6210925579071045, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8642805814743042, + "num_tokens": 226533359.0, + "step": 5936 + }, + { + "epoch": 0.7552474239918585, + "ewc_loss": 0.007097398396581411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.09739833837375e-05, + "grad_norm": 3.550025224685669, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8564441204071045, + "num_tokens": 226567060.0, + "step": 5937 + }, + { + "epoch": 0.755374634270449, + "ewc_loss": 0.0070684729143977165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.068472768878564e-05, + "grad_norm": 3.554318428039551, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8634058237075806, + "num_tokens": 226604270.0, + "step": 5938 + }, + { + "epoch": 0.7555018445490396, + "ewc_loss": 0.007101563736796379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.10156382410787e-05, + "grad_norm": 3.5381834506988525, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8642541170120239, + "num_tokens": 226641310.0, + "step": 5939 + }, + { + "epoch": 0.7556290548276301, + "ewc_loss": 0.0070924884639680386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0924885221757e-05, + "grad_norm": 3.545518636703491, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8548164367675781, + "num_tokens": 226678305.0, + "step": 5940 + }, + { + "epoch": 0.7557562651062206, + "ewc_loss": 0.007107825018465519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.107824785634875e-05, + "grad_norm": 3.5467653274536133, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8707473874092102, + "num_tokens": 226715628.0, + "step": 5941 + }, + { + "epoch": 0.755883475384811, + "ewc_loss": 0.0070962547324597836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.096254557836801e-05, + "grad_norm": 3.458108901977539, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8763889074325562, + "num_tokens": 
226755346.0, + "step": 5942 + }, + { + "epoch": 0.7560106856634016, + "ewc_loss": 0.007052775006741285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.052774890325963e-05, + "grad_norm": 3.5996901988983154, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8549282550811768, + "num_tokens": 226792243.0, + "step": 5943 + }, + { + "epoch": 0.7561378959419921, + "ewc_loss": 0.007174968253821135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.174968050094321e-05, + "grad_norm": 3.534975051879883, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8786066770553589, + "num_tokens": 226829033.0, + "step": 5944 + }, + { + "epoch": 0.7562651062205826, + "ewc_loss": 0.007071214262396097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.071214349707589e-05, + "grad_norm": 3.4815778732299805, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8784971237182617, + "num_tokens": 226870645.0, + "step": 5945 + }, + { + "epoch": 0.7563923164991732, + "ewc_loss": 0.007077548187226057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.077548070810735e-05, + "grad_norm": 3.598140239715576, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.870470404624939, + "num_tokens": 226902395.0, + "step": 5946 + }, + { + "epoch": 0.7565195267777637, + "ewc_loss": 0.007165402173995972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.165402348618954e-05, + "grad_norm": 3.5145392417907715, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8715447187423706, + "num_tokens": 226942620.0, + "step": 5947 + }, + { + "epoch": 0.7566467370563541, + "ewc_loss": 0.007054255343973637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.05425554770045e-05, + "grad_norm": 3.5795669555664062, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8728316426277161, + "num_tokens": 226976472.0, + "step": 5948 + }, + { + "epoch": 0.7567739473349446, + "ewc_loss": 0.007133122533559799, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.13312256266363e-05, + "grad_norm": 3.6097187995910645, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8563840389251709, + "num_tokens": 227013626.0, + "step": 5949 + }, + { + "epoch": 0.7569011576135352, + "ewc_loss": 0.007128728087991476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.128727884264663e-05, + "grad_norm": 3.578603744506836, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8534194827079773, + "num_tokens": 227049336.0, + "step": 5950 + }, + { + "epoch": 0.7570283678921257, + "ewc_loss": 0.007088761776685715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.088761776685715e-05, + "grad_norm": 3.4970614910125732, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8890039920806885, + "num_tokens": 227087371.0, + "step": 5951 + }, + { + "epoch": 0.7571555781707162, + "ewc_loss": 0.007072636857628822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.072636799421161e-05, + "grad_norm": 3.521317481994629, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8654320240020752, + "num_tokens": 227129188.0, + "step": 5952 + }, + { + "epoch": 0.7572827884493067, + "ewc_loss": 0.007099438458681107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.099438516888767e-05, + "grad_norm": 3.5952117443084717, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8596447706222534, + "num_tokens": 227161807.0, + "step": 5953 + }, + { + "epoch": 0.7574099987278972, + "ewc_loss": 0.007125564385205507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.125564297894016e-05, + "grad_norm": 3.537623167037964, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.856667160987854, + "num_tokens": 227198431.0, + "step": 5954 + }, + { + "epoch": 0.7575372090064877, + "ewc_loss": 0.007073541171848774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.073541200952604e-05, + "grad_norm": 3.490753650665283, + "learning_rate": 
1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8852329254150391, + "num_tokens": 227236046.0, + "step": 5955 + }, + { + "epoch": 0.7576644192850782, + "ewc_loss": 0.007086935453116894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.086935511324555e-05, + "grad_norm": 3.5167627334594727, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8801181316375732, + "num_tokens": 227272925.0, + "step": 5956 + }, + { + "epoch": 0.7577916295636687, + "ewc_loss": 0.007094380911439657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.094380998751149e-05, + "grad_norm": 3.533172607421875, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8698375821113586, + "num_tokens": 227312311.0, + "step": 5957 + }, + { + "epoch": 0.7579188398422593, + "ewc_loss": 0.0071127405390143394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.112740422599018e-05, + "grad_norm": 3.5508813858032227, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8860389590263367, + "num_tokens": 227349632.0, + "step": 5958 + }, + { + "epoch": 0.7580460501208498, + "ewc_loss": 0.007091371808201075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.091371662681922e-05, + "grad_norm": 3.504751205444336, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8646359443664551, + "num_tokens": 227392982.0, + "step": 5959 + }, + { + "epoch": 0.7581732603994403, + "ewc_loss": 0.007060440257191658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.060440111672506e-05, + "grad_norm": 3.5966575145721436, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8617730140686035, + "num_tokens": 227431878.0, + "step": 5960 + }, + { + "epoch": 0.7583004706780307, + "ewc_loss": 0.007124480325728655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.124480180209503e-05, + "grad_norm": 3.521573543548584, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8559941053390503, + "num_tokens": 227470401.0, + "step": 5961 + }, 
+ { + "epoch": 0.7584276809566213, + "ewc_loss": 0.007020200602710247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.020200428087264e-05, + "grad_norm": 3.55576229095459, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.870232105255127, + "num_tokens": 227507189.0, + "step": 5962 + }, + { + "epoch": 0.7585548912352118, + "ewc_loss": 0.007080969400703907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.080969226080924e-05, + "grad_norm": 3.513622999191284, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8694141507148743, + "num_tokens": 227548070.0, + "step": 5963 + }, + { + "epoch": 0.7586821015138023, + "ewc_loss": 0.007046038750559092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.046038808766752e-05, + "grad_norm": 3.553431749343872, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.870482325553894, + "num_tokens": 227584360.0, + "step": 5964 + }, + { + "epoch": 0.7588093117923929, + "ewc_loss": 0.007077822927385569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.077823102008551e-05, + "grad_norm": 3.5521061420440674, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8735036253929138, + "num_tokens": 227618961.0, + "step": 5965 + }, + { + "epoch": 0.7589365220709834, + "ewc_loss": 0.007074345834553242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.074345921864733e-05, + "grad_norm": 3.561772584915161, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8671190142631531, + "num_tokens": 227658304.0, + "step": 5966 + }, + { + "epoch": 0.7590637323495738, + "ewc_loss": 0.007064684294164181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.064684177748859e-05, + "grad_norm": 3.4930362701416016, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8637021780014038, + "num_tokens": 227701526.0, + "step": 5967 + }, + { + "epoch": 0.7591909426281643, + "ewc_loss": 0.007026087958365679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.026088132988662e-05, + "grad_norm": 3.5088047981262207, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8718309998512268, + "num_tokens": 227744930.0, + "step": 5968 + }, + { + "epoch": 0.7593181529067549, + "ewc_loss": 0.007043003104627132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.043003279250115e-05, + "grad_norm": 3.577524423599243, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8761869668960571, + "num_tokens": 227780607.0, + "step": 5969 + }, + { + "epoch": 0.7594453631853454, + "ewc_loss": 0.007055539172142744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.055539026623592e-05, + "grad_norm": 3.4998910427093506, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8623601198196411, + "num_tokens": 227825742.0, + "step": 5970 + }, + { + "epoch": 0.7595725734639359, + "ewc_loss": 0.006989257875829935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.989257963141426e-05, + "grad_norm": 3.5874850749969482, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8474922776222229, + "num_tokens": 227865278.0, + "step": 5971 + }, + { + "epoch": 0.7596997837425264, + "ewc_loss": 0.007059690076857805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.059689960442483e-05, + "grad_norm": 3.5855143070220947, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8631995916366577, + "num_tokens": 227906912.0, + "step": 5972 + }, + { + "epoch": 0.7598269940211169, + "ewc_loss": 0.007016750052571297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.016750168986619e-05, + "grad_norm": 3.562309980392456, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8641635775566101, + "num_tokens": 227947067.0, + "step": 5973 + }, + { + "epoch": 0.7599542042997074, + "ewc_loss": 0.006997259333729744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.997259333729744e-05, + "grad_norm": 3.5278329849243164, + "learning_rate": 1e-06, + "loss": 0.4562, + 
"mean_token_accuracy": 0.8474202156066895, + "num_tokens": 227988563.0, + "step": 5974 + }, + { + "epoch": 0.7600814145782979, + "ewc_loss": 0.006996640935540199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.996640877332538e-05, + "grad_norm": 3.542268991470337, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8585166931152344, + "num_tokens": 228028556.0, + "step": 5975 + }, + { + "epoch": 0.7602086248568884, + "ewc_loss": 0.007014717441052198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.014717266429216e-05, + "grad_norm": 3.49861741065979, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8807628750801086, + "num_tokens": 228066265.0, + "step": 5976 + }, + { + "epoch": 0.760335835135479, + "ewc_loss": 0.00698399031534791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.983990169828758e-05, + "grad_norm": 3.529989242553711, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8627264499664307, + "num_tokens": 228103732.0, + "step": 5977 + }, + { + "epoch": 0.7604630454140695, + "ewc_loss": 0.007042041514068842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.04204139765352e-05, + "grad_norm": 3.5080127716064453, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8638221621513367, + "num_tokens": 228149609.0, + "step": 5978 + }, + { + "epoch": 0.7605902556926599, + "ewc_loss": 0.007000328972935677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.000329060247168e-05, + "grad_norm": 3.5520119667053223, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8683865070343018, + "num_tokens": 228188462.0, + "step": 5979 + }, + { + "epoch": 0.7607174659712505, + "ewc_loss": 0.007045214995741844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.045215170364827e-05, + "grad_norm": 3.5727968215942383, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8801361322402954, + "num_tokens": 228229252.0, + "step": 5980 + }, + { + "epoch": 
0.760844676249841, + "ewc_loss": 0.007029274944216013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.029275002423674e-05, + "grad_norm": 3.573279857635498, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8635122179985046, + "num_tokens": 228267397.0, + "step": 5981 + }, + { + "epoch": 0.7609718865284315, + "ewc_loss": 0.007016755174845457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.016755262156948e-05, + "grad_norm": 3.503918409347534, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8744996786117554, + "num_tokens": 228307561.0, + "step": 5982 + }, + { + "epoch": 0.761099096807022, + "ewc_loss": 0.006978739984333515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.978739838814363e-05, + "grad_norm": 3.584535598754883, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8685416579246521, + "num_tokens": 228347667.0, + "step": 5983 + }, + { + "epoch": 0.7612263070856126, + "ewc_loss": 0.007041083183139563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.041083154035732e-05, + "grad_norm": 3.5696232318878174, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8640592098236084, + "num_tokens": 228386820.0, + "step": 5984 + }, + { + "epoch": 0.761353517364203, + "ewc_loss": 0.0069936481304466724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.993648275965825e-05, + "grad_norm": 3.5274131298065186, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8741177916526794, + "num_tokens": 228427304.0, + "step": 5985 + }, + { + "epoch": 0.7614807276427935, + "ewc_loss": 0.006994219496846199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.994219438638538e-05, + "grad_norm": 3.547851324081421, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8567617535591125, + "num_tokens": 228470779.0, + "step": 5986 + }, + { + "epoch": 0.761607937921384, + "ewc_loss": 0.007006226107478142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.006226223893464e-05, + "grad_norm": 3.601022720336914, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8825464248657227, + "num_tokens": 228500454.0, + "step": 5987 + }, + { + "epoch": 0.7617351481999746, + "ewc_loss": 0.007032620720565319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.032620487734675e-05, + "grad_norm": 3.5488736629486084, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8726782202720642, + "num_tokens": 228537374.0, + "step": 5988 + }, + { + "epoch": 0.7618623584785651, + "ewc_loss": 0.006978515535593033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.978515739319846e-05, + "grad_norm": 3.5211033821105957, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8659694194793701, + "num_tokens": 228577215.0, + "step": 5989 + }, + { + "epoch": 0.7619895687571556, + "ewc_loss": 0.006998502649366856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.998502794886008e-05, + "grad_norm": 3.595278024673462, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8704944849014282, + "num_tokens": 228611693.0, + "step": 5990 + }, + { + "epoch": 0.762116779035746, + "ewc_loss": 0.007042261306196451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.042261131573468e-05, + "grad_norm": 3.6134562492370605, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8652215600013733, + "num_tokens": 228644231.0, + "step": 5991 + }, + { + "epoch": 0.7622439893143366, + "ewc_loss": 0.007013383787125349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.013383583398536e-05, + "grad_norm": 3.478553295135498, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8766437768936157, + "num_tokens": 228684776.0, + "step": 5992 + }, + { + "epoch": 0.7623711995929271, + "ewc_loss": 0.006955841556191444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.955841672606766e-05, + "grad_norm": 3.545121192932129, + "learning_rate": 1e-06, + "loss": 0.3754, + 
"mean_token_accuracy": 0.8728471994400024, + "num_tokens": 228724214.0, + "step": 5993 + }, + { + "epoch": 0.7624984098715176, + "ewc_loss": 0.007037485018372536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.037485192995518e-05, + "grad_norm": 3.551241874694824, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8634766936302185, + "num_tokens": 228765098.0, + "step": 5994 + }, + { + "epoch": 0.7626256201501082, + "ewc_loss": 0.0069994572550058365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.999457400524989e-05, + "grad_norm": 3.5273895263671875, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8680394887924194, + "num_tokens": 228802423.0, + "step": 5995 + }, + { + "epoch": 0.7627528304286987, + "ewc_loss": 0.007003244012594223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.00324380886741e-05, + "grad_norm": 3.5815138816833496, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8555784821510315, + "num_tokens": 228840218.0, + "step": 5996 + }, + { + "epoch": 0.7628800407072891, + "ewc_loss": 0.007039555348455906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.039555202936754e-05, + "grad_norm": 3.4974799156188965, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8715001344680786, + "num_tokens": 228882810.0, + "step": 5997 + }, + { + "epoch": 0.7630072509858796, + "ewc_loss": 0.006976912263780832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.97691211826168e-05, + "grad_norm": 3.6508922576904297, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8733123540878296, + "num_tokens": 228917664.0, + "step": 5998 + }, + { + "epoch": 0.7631344612644702, + "ewc_loss": 0.007090114522725344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.090114377206191e-05, + "grad_norm": 3.5788021087646484, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8737341165542603, + "num_tokens": 228951017.0, + "step": 5999 + }, + { + "epoch": 
0.7632616715430607, + "ewc_loss": 0.006999417208135128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.999417382758111e-05, + "grad_norm": 3.5066394805908203, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8674007654190063, + "num_tokens": 228990949.0, + "step": 6000 + }, + { + "epoch": 0.7633888818216512, + "ewc_loss": 0.00698567321524024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.985673098824918e-05, + "grad_norm": 3.562464475631714, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8719841241836548, + "num_tokens": 229026604.0, + "step": 6001 + }, + { + "epoch": 0.7635160921002417, + "ewc_loss": 0.007034735754132271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.034735608613119e-05, + "grad_norm": 3.6164286136627197, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8594189286231995, + "num_tokens": 229061810.0, + "step": 6002 + }, + { + "epoch": 0.7636433023788322, + "ewc_loss": 0.007041123695671558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.041123899398372e-05, + "grad_norm": 3.520561933517456, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8432812094688416, + "num_tokens": 229107765.0, + "step": 6003 + }, + { + "epoch": 0.7637705126574227, + "ewc_loss": 0.00698564387857914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.985643994994462e-05, + "grad_norm": 3.575584650039673, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8537552356719971, + "num_tokens": 229147822.0, + "step": 6004 + }, + { + "epoch": 0.7638977229360132, + "ewc_loss": 0.007054830901324749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.054831075947732e-05, + "grad_norm": 3.555724620819092, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8755828142166138, + "num_tokens": 229184421.0, + "step": 6005 + }, + { + "epoch": 0.7640249332146037, + "ewc_loss": 0.00701425364241004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.014253787929192e-05, + "grad_norm": 3.571781873703003, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8672404885292053, + "num_tokens": 229221737.0, + "step": 6006 + }, + { + "epoch": 0.7641521434931943, + "ewc_loss": 0.007042719516903162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.042719516903162e-05, + "grad_norm": 3.5904054641723633, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8607991933822632, + "num_tokens": 229254727.0, + "step": 6007 + }, + { + "epoch": 0.7642793537717848, + "ewc_loss": 0.007044533267617226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.044533413136378e-05, + "grad_norm": 3.5463268756866455, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8762417435646057, + "num_tokens": 229292648.0, + "step": 6008 + }, + { + "epoch": 0.7644065640503753, + "ewc_loss": 0.0070224180817604065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.022418139968067e-05, + "grad_norm": 3.4742836952209473, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8722548484802246, + "num_tokens": 229335708.0, + "step": 6009 + }, + { + "epoch": 0.7645337743289657, + "ewc_loss": 0.007014848757535219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.014848961262032e-05, + "grad_norm": 3.572859048843384, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8749778866767883, + "num_tokens": 229373791.0, + "step": 6010 + }, + { + "epoch": 0.7646609846075563, + "ewc_loss": 0.007073922548443079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.073922461131588e-05, + "grad_norm": 3.5749471187591553, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8765407800674438, + "num_tokens": 229409891.0, + "step": 6011 + }, + { + "epoch": 0.7647881948861468, + "ewc_loss": 0.007043126970529556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.043126970529556e-05, + "grad_norm": 3.577920436859131, + "learning_rate": 1e-06, + "loss": 0.3971, + 
"mean_token_accuracy": 0.8676284551620483, + "num_tokens": 229449859.0, + "step": 6012 + }, + { + "epoch": 0.7649154051647373, + "ewc_loss": 0.007043310906738043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.043311052257195e-05, + "grad_norm": 3.5489282608032227, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8739510178565979, + "num_tokens": 229484224.0, + "step": 6013 + }, + { + "epoch": 0.7650426154433279, + "ewc_loss": 0.007044844329357147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.044844096526504e-05, + "grad_norm": 3.568504571914673, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8622896671295166, + "num_tokens": 229521104.0, + "step": 6014 + }, + { + "epoch": 0.7651698257219184, + "ewc_loss": 0.007061278913170099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.061279029585421e-05, + "grad_norm": 3.5802016258239746, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8516826629638672, + "num_tokens": 229558918.0, + "step": 6015 + }, + { + "epoch": 0.7652970360005088, + "ewc_loss": 0.007073770277202129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.073770393617451e-05, + "grad_norm": 3.613314151763916, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8533220291137695, + "num_tokens": 229596931.0, + "step": 6016 + }, + { + "epoch": 0.7654242462790993, + "ewc_loss": 0.007080347277224064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.080347131704912e-05, + "grad_norm": 3.601649045944214, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8513522148132324, + "num_tokens": 229633903.0, + "step": 6017 + }, + { + "epoch": 0.7655514565576899, + "ewc_loss": 0.0070714084431529045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.071408617775887e-05, + "grad_norm": 3.5442137718200684, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8633914589881897, + "num_tokens": 229674224.0, + "step": 6018 + }, + { + "epoch": 
0.7656786668362804, + "ewc_loss": 0.007032205816358328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.032205758150667e-05, + "grad_norm": 3.5749785900115967, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8664299845695496, + "num_tokens": 229710547.0, + "step": 6019 + }, + { + "epoch": 0.7658058771148709, + "ewc_loss": 0.0070701297372579575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.070129504427314e-05, + "grad_norm": 3.4876437187194824, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8816717267036438, + "num_tokens": 229752020.0, + "step": 6020 + }, + { + "epoch": 0.7659330873934614, + "ewc_loss": 0.007026389241218567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.026389357633889e-05, + "grad_norm": 3.634974718093872, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8628395199775696, + "num_tokens": 229785710.0, + "step": 6021 + }, + { + "epoch": 0.7660602976720519, + "ewc_loss": 0.007129339035600424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.129339064704254e-05, + "grad_norm": 3.5218567848205566, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8733257055282593, + "num_tokens": 229824118.0, + "step": 6022 + }, + { + "epoch": 0.7661875079506424, + "ewc_loss": 0.007001134566962719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.001134508755058e-05, + "grad_norm": 3.6045446395874023, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8786224126815796, + "num_tokens": 229854231.0, + "step": 6023 + }, + { + "epoch": 0.7663147182292329, + "ewc_loss": 0.007106848526746035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.106848352123052e-05, + "grad_norm": 3.5884008407592773, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8573346734046936, + "num_tokens": 229888102.0, + "step": 6024 + }, + { + "epoch": 0.7664419285078234, + "ewc_loss": 0.007068205624818802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.068205741234124e-05, + "grad_norm": 3.5499820709228516, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8697115778923035, + "num_tokens": 229927634.0, + "step": 6025 + }, + { + "epoch": 0.766569138786414, + "ewc_loss": 0.007047397084534168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.04739723005332e-05, + "grad_norm": 3.5026662349700928, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8742879629135132, + "num_tokens": 229966526.0, + "step": 6026 + }, + { + "epoch": 0.7666963490650045, + "ewc_loss": 0.007052074186503887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.052074215607718e-05, + "grad_norm": 3.5319104194641113, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8772062063217163, + "num_tokens": 230002761.0, + "step": 6027 + }, + { + "epoch": 0.7668235593435949, + "ewc_loss": 0.0070810094475746155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.081009243847802e-05, + "grad_norm": 3.5118789672851562, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8721144795417786, + "num_tokens": 230041303.0, + "step": 6028 + }, + { + "epoch": 0.7669507696221854, + "ewc_loss": 0.00706153130158782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.061531505314633e-05, + "grad_norm": 3.55021333694458, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8699248433113098, + "num_tokens": 230081117.0, + "step": 6029 + }, + { + "epoch": 0.767077979900776, + "ewc_loss": 0.007086129393428564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.086129335220903e-05, + "grad_norm": 3.5526275634765625, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8624867796897888, + "num_tokens": 230119411.0, + "step": 6030 + }, + { + "epoch": 0.7672051901793665, + "ewc_loss": 0.007055305410176516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.055305468384176e-05, + "grad_norm": 3.533644914627075, + "learning_rate": 1e-06, + "loss": 0.3721, + 
"mean_token_accuracy": 0.8737972974777222, + "num_tokens": 230161111.0, + "step": 6031 + }, + { + "epoch": 0.767332400457957, + "ewc_loss": 0.007040746510028839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.040746277198195e-05, + "grad_norm": 3.5728812217712402, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8629047870635986, + "num_tokens": 230196999.0, + "step": 6032 + }, + { + "epoch": 0.7674596107365476, + "ewc_loss": 0.007073508575558662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.07350845914334e-05, + "grad_norm": 3.5969913005828857, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8685927987098694, + "num_tokens": 230231495.0, + "step": 6033 + }, + { + "epoch": 0.767586821015138, + "ewc_loss": 0.007059273775666952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.059273775666952e-05, + "grad_norm": 3.5781478881835938, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8596138954162598, + "num_tokens": 230267732.0, + "step": 6034 + }, + { + "epoch": 0.7677140312937285, + "ewc_loss": 0.007056236267089844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.05623606336303e-05, + "grad_norm": 3.6008739471435547, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8671050071716309, + "num_tokens": 230297559.0, + "step": 6035 + }, + { + "epoch": 0.767841241572319, + "ewc_loss": 0.007078791502863169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.078791531966999e-05, + "grad_norm": 3.551401138305664, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8556013107299805, + "num_tokens": 230336396.0, + "step": 6036 + }, + { + "epoch": 0.7679684518509096, + "ewc_loss": 0.007051221560686827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.051221473375335e-05, + "grad_norm": 3.5732271671295166, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8669912815093994, + "num_tokens": 230368152.0, + "step": 6037 + }, + { + "epoch": 
0.7680956621295001, + "ewc_loss": 0.007083838805556297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.083838863763958e-05, + "grad_norm": 3.5115184783935547, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8716477155685425, + "num_tokens": 230408081.0, + "step": 6038 + }, + { + "epoch": 0.7682228724080906, + "ewc_loss": 0.007042432203888893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.042432116577402e-05, + "grad_norm": 3.593172788619995, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8588292002677917, + "num_tokens": 230441691.0, + "step": 6039 + }, + { + "epoch": 0.768350082686681, + "ewc_loss": 0.00711616687476635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.116166671039537e-05, + "grad_norm": 3.509662628173828, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8657646179199219, + "num_tokens": 230480089.0, + "step": 6040 + }, + { + "epoch": 0.7684772929652716, + "ewc_loss": 0.007069330662488937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.069330604281276e-05, + "grad_norm": 3.530407190322876, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8619599938392639, + "num_tokens": 230518655.0, + "step": 6041 + }, + { + "epoch": 0.7686045032438621, + "ewc_loss": 0.007108358200639486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.108358113327995e-05, + "grad_norm": 3.5588698387145996, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8700674772262573, + "num_tokens": 230555717.0, + "step": 6042 + }, + { + "epoch": 0.7687317135224526, + "ewc_loss": 0.007127414457499981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.127414573915303e-05, + "grad_norm": 3.491814136505127, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.874384880065918, + "num_tokens": 230596699.0, + "step": 6043 + }, + { + "epoch": 0.7688589238010431, + "ewc_loss": 0.007065975107252598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.065974932629615e-05, + "grad_norm": 3.538041830062866, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8760399222373962, + "num_tokens": 230633284.0, + "step": 6044 + }, + { + "epoch": 0.7689861340796337, + "ewc_loss": 0.007127254270017147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.127254502847791e-05, + "grad_norm": 3.504479169845581, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8705390691757202, + "num_tokens": 230673133.0, + "step": 6045 + }, + { + "epoch": 0.7691133443582241, + "ewc_loss": 0.007074693217873573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.07469298504293e-05, + "grad_norm": 3.4766921997070312, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8676683902740479, + "num_tokens": 230719003.0, + "step": 6046 + }, + { + "epoch": 0.7692405546368146, + "ewc_loss": 0.007071219384670258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.071219442877918e-05, + "grad_norm": 3.5411086082458496, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8764950037002563, + "num_tokens": 230754805.0, + "step": 6047 + }, + { + "epoch": 0.7693677649154052, + "ewc_loss": 0.007111378945410252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.111379090929404e-05, + "grad_norm": 3.569626569747925, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.853251576423645, + "num_tokens": 230793249.0, + "step": 6048 + }, + { + "epoch": 0.7694949751939957, + "ewc_loss": 0.007090408354997635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.090408325893804e-05, + "grad_norm": 3.564645528793335, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8823130130767822, + "num_tokens": 230826025.0, + "step": 6049 + }, + { + "epoch": 0.7696221854725862, + "ewc_loss": 0.00707095256075263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.070952415233478e-05, + "grad_norm": 3.4639053344726562, + "learning_rate": 1e-06, + "loss": 0.3648, + 
"mean_token_accuracy": 0.8775688409805298, + "num_tokens": 230866615.0, + "step": 6050 + }, + { + "epoch": 0.7697493957511767, + "ewc_loss": 0.007023385725915432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.023385842330754e-05, + "grad_norm": 3.6647980213165283, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8527011871337891, + "num_tokens": 230901587.0, + "step": 6051 + }, + { + "epoch": 0.7698766060297672, + "ewc_loss": 0.007185628637671471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.185628783190623e-05, + "grad_norm": 3.4971673488616943, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8726873397827148, + "num_tokens": 230944267.0, + "step": 6052 + }, + { + "epoch": 0.7700038163083577, + "ewc_loss": 0.006999018602073193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.999018660280854e-05, + "grad_norm": 3.632723331451416, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.858180820941925, + "num_tokens": 230980022.0, + "step": 6053 + }, + { + "epoch": 0.7701310265869482, + "ewc_loss": 0.007145674899220467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.145675044739619e-05, + "grad_norm": 3.5220232009887695, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8736623525619507, + "num_tokens": 231018110.0, + "step": 6054 + }, + { + "epoch": 0.7702582368655387, + "ewc_loss": 0.0070150201208889484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.015019946265966e-05, + "grad_norm": 3.542778730392456, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8729391694068909, + "num_tokens": 231056531.0, + "step": 6055 + }, + { + "epoch": 0.7703854471441293, + "ewc_loss": 0.007075917441397905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.075917528709397e-05, + "grad_norm": 3.526301622390747, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8729900121688843, + "num_tokens": 231096270.0, + "step": 6056 + }, + { + "epoch": 
0.7705126574227198, + "ewc_loss": 0.0070572285912930965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.057228503981605e-05, + "grad_norm": 3.545497417449951, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8600254654884338, + "num_tokens": 231136300.0, + "step": 6057 + }, + { + "epoch": 0.7706398677013102, + "ewc_loss": 0.007062573451548815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.062573422444984e-05, + "grad_norm": 3.592263698577881, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8531856536865234, + "num_tokens": 231169005.0, + "step": 6058 + }, + { + "epoch": 0.7707670779799007, + "ewc_loss": 0.0070863510482013226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.086351251928136e-05, + "grad_norm": 3.5922176837921143, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8737360239028931, + "num_tokens": 231205274.0, + "step": 6059 + }, + { + "epoch": 0.7708942882584913, + "ewc_loss": 0.007070132531225681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.07013241481036e-05, + "grad_norm": 3.5749993324279785, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8735054135322571, + "num_tokens": 231239022.0, + "step": 6060 + }, + { + "epoch": 0.7710214985370818, + "ewc_loss": 0.0070760492235422134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.076049223542213e-05, + "grad_norm": 3.5213236808776855, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8483391404151917, + "num_tokens": 231282952.0, + "step": 6061 + }, + { + "epoch": 0.7711487088156723, + "ewc_loss": 0.007039004936814308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.039005140541121e-05, + "grad_norm": 3.5724782943725586, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8650193214416504, + "num_tokens": 231322980.0, + "step": 6062 + }, + { + "epoch": 0.7712759190942629, + "ewc_loss": 0.007094476372003555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.094476313795894e-05, + "grad_norm": 3.615769147872925, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8586215972900391, + "num_tokens": 231358382.0, + "step": 6063 + }, + { + "epoch": 0.7714031293728534, + "ewc_loss": 0.007095129694789648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.095129694789648e-05, + "grad_norm": 3.474675178527832, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.875755250453949, + "num_tokens": 231401915.0, + "step": 6064 + }, + { + "epoch": 0.7715303396514438, + "ewc_loss": 0.007001308258622885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.001308404142037e-05, + "grad_norm": 3.667693853378296, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8450224995613098, + "num_tokens": 231433241.0, + "step": 6065 + }, + { + "epoch": 0.7716575499300343, + "ewc_loss": 0.007173915393650532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.173915219027549e-05, + "grad_norm": 3.5440497398376465, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8614262938499451, + "num_tokens": 231478318.0, + "step": 6066 + }, + { + "epoch": 0.7717847602086249, + "ewc_loss": 0.007030907552689314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.030907727312297e-05, + "grad_norm": 3.5297930240631104, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8596189022064209, + "num_tokens": 231518584.0, + "step": 6067 + }, + { + "epoch": 0.7719119704872154, + "ewc_loss": 0.007073868066072464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.073867891449481e-05, + "grad_norm": 3.5488898754119873, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8670142889022827, + "num_tokens": 231556689.0, + "step": 6068 + }, + { + "epoch": 0.7720391807658059, + "ewc_loss": 0.007086752913892269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.086752884788439e-05, + "grad_norm": 3.576504707336426, + "learning_rate": 1e-06, + "loss": 0.3937, + 
"mean_token_accuracy": 0.8664358258247375, + "num_tokens": 231593936.0, + "step": 6069 + }, + { + "epoch": 0.7721663910443964, + "ewc_loss": 0.0070957522839307785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.095752516761422e-05, + "grad_norm": 3.492781639099121, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8848147392272949, + "num_tokens": 231634785.0, + "step": 6070 + }, + { + "epoch": 0.7722936013229869, + "ewc_loss": 0.007054008077830076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.054008165141568e-05, + "grad_norm": 3.553903102874756, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8614012002944946, + "num_tokens": 231675798.0, + "step": 6071 + }, + { + "epoch": 0.7724208116015774, + "ewc_loss": 0.0071012903936207294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.101290248101577e-05, + "grad_norm": 3.5715041160583496, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8641694188117981, + "num_tokens": 231711611.0, + "step": 6072 + }, + { + "epoch": 0.7725480218801679, + "ewc_loss": 0.007111147046089172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.111146987881511e-05, + "grad_norm": 3.629984140396118, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8582669496536255, + "num_tokens": 231746997.0, + "step": 6073 + }, + { + "epoch": 0.7726752321587584, + "ewc_loss": 0.007129780016839504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.129779987735674e-05, + "grad_norm": 3.561767101287842, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8475556373596191, + "num_tokens": 231787696.0, + "step": 6074 + }, + { + "epoch": 0.772802442437349, + "ewc_loss": 0.007074142340570688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.074142195051536e-05, + "grad_norm": 3.5569920539855957, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.858816385269165, + "num_tokens": 231826127.0, + "step": 6075 + }, + { + "epoch": 
0.7729296527159395, + "ewc_loss": 0.007093969266861677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.093969179550186e-05, + "grad_norm": 3.599153995513916, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8616494536399841, + "num_tokens": 231857538.0, + "step": 6076 + }, + { + "epoch": 0.7730568629945299, + "ewc_loss": 0.007126181852072477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.12618202669546e-05, + "grad_norm": 3.572357416152954, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8644120693206787, + "num_tokens": 231893913.0, + "step": 6077 + }, + { + "epoch": 0.7731840732731204, + "ewc_loss": 0.007099210284650326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.099210051819682e-05, + "grad_norm": 3.5564417839050293, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8683973550796509, + "num_tokens": 231934274.0, + "step": 6078 + }, + { + "epoch": 0.773311283551711, + "ewc_loss": 0.007110395934432745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.110396109055728e-05, + "grad_norm": 3.5140647888183594, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8606591820716858, + "num_tokens": 231983197.0, + "step": 6079 + }, + { + "epoch": 0.7734384938303015, + "ewc_loss": 0.007081438321620226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.081438525347039e-05, + "grad_norm": 3.527528762817383, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8611892461776733, + "num_tokens": 232021265.0, + "step": 6080 + }, + { + "epoch": 0.773565704108892, + "ewc_loss": 0.00709888432174921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.098884088918567e-05, + "grad_norm": 3.5182034969329834, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8658486604690552, + "num_tokens": 232060992.0, + "step": 6081 + }, + { + "epoch": 0.7736929143874826, + "ewc_loss": 0.007068694569170475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.068694685585797e-05, + "grad_norm": 3.5589520931243896, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8648857474327087, + "num_tokens": 232096758.0, + "step": 6082 + }, + { + "epoch": 0.773820124666073, + "ewc_loss": 0.007110903039574623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.110903243301436e-05, + "grad_norm": 3.59587025642395, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8629185557365417, + "num_tokens": 232133773.0, + "step": 6083 + }, + { + "epoch": 0.7739473349446635, + "ewc_loss": 0.007112687919288874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.112688035704195e-05, + "grad_norm": 3.507520914077759, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8646072149276733, + "num_tokens": 232173182.0, + "step": 6084 + }, + { + "epoch": 0.774074545223254, + "ewc_loss": 0.0070472583174705505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.04725825926289e-05, + "grad_norm": 3.5415289402008057, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8744663000106812, + "num_tokens": 232206478.0, + "step": 6085 + }, + { + "epoch": 0.7742017555018446, + "ewc_loss": 0.007101760245859623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.101760274963453e-05, + "grad_norm": 3.5490798950195312, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8606253266334534, + "num_tokens": 232242231.0, + "step": 6086 + }, + { + "epoch": 0.7743289657804351, + "ewc_loss": 0.007086905650794506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.086905679898337e-05, + "grad_norm": 3.6358582973480225, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8694863319396973, + "num_tokens": 232270636.0, + "step": 6087 + }, + { + "epoch": 0.7744561760590256, + "ewc_loss": 0.0071284035220742226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.12840337655507e-05, + "grad_norm": 3.4899747371673584, + "learning_rate": 1e-06, + "loss": 0.3724, + 
"mean_token_accuracy": 0.8706693649291992, + "num_tokens": 232309974.0, + "step": 6088 + }, + { + "epoch": 0.774583386337616, + "ewc_loss": 0.0070424312725663185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.04243138898164e-05, + "grad_norm": 3.614238739013672, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8739714026451111, + "num_tokens": 232344118.0, + "step": 6089 + }, + { + "epoch": 0.7747105966162066, + "ewc_loss": 0.007164385169744492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.164385169744492e-05, + "grad_norm": 3.5167253017425537, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8559470176696777, + "num_tokens": 232385183.0, + "step": 6090 + }, + { + "epoch": 0.7748378068947971, + "ewc_loss": 0.007042362354695797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.042362267384306e-05, + "grad_norm": 3.5406131744384766, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8679441809654236, + "num_tokens": 232421670.0, + "step": 6091 + }, + { + "epoch": 0.7749650171733876, + "ewc_loss": 0.007101981434971094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.101981464074925e-05, + "grad_norm": 3.563235282897949, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8549366593360901, + "num_tokens": 232459563.0, + "step": 6092 + }, + { + "epoch": 0.7750922274519781, + "ewc_loss": 0.0071037220768630505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.103721873136237e-05, + "grad_norm": 3.582704782485962, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8673887252807617, + "num_tokens": 232496357.0, + "step": 6093 + }, + { + "epoch": 0.7752194377305687, + "ewc_loss": 0.007109685800969601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.109685975592583e-05, + "grad_norm": 3.5730714797973633, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8650999665260315, + "num_tokens": 232530436.0, + "step": 6094 + }, + { + "epoch": 
0.7753466480091591, + "ewc_loss": 0.007087738253176212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.087738049449399e-05, + "grad_norm": 3.5419721603393555, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8613889217376709, + "num_tokens": 232568585.0, + "step": 6095 + }, + { + "epoch": 0.7754738582877496, + "ewc_loss": 0.00708435894921422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.084359094733372e-05, + "grad_norm": 3.5712287425994873, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8719611167907715, + "num_tokens": 232602506.0, + "step": 6096 + }, + { + "epoch": 0.7756010685663401, + "ewc_loss": 0.007107482757419348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.107482815627009e-05, + "grad_norm": 3.5023763179779053, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8743849992752075, + "num_tokens": 232642819.0, + "step": 6097 + }, + { + "epoch": 0.7757282788449307, + "ewc_loss": 0.0070528374053537846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.052837463561445e-05, + "grad_norm": 3.5701653957366943, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8533110022544861, + "num_tokens": 232679505.0, + "step": 6098 + }, + { + "epoch": 0.7758554891235212, + "ewc_loss": 0.0071329777128994465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.132977771107107e-05, + "grad_norm": 3.5575873851776123, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8701805472373962, + "num_tokens": 232715701.0, + "step": 6099 + }, + { + "epoch": 0.7759826994021117, + "ewc_loss": 0.007089816965162754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.089816790539771e-05, + "grad_norm": 3.5794286727905273, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8671339154243469, + "num_tokens": 232750786.0, + "step": 6100 + }, + { + "epoch": 0.7761099096807021, + "ewc_loss": 0.007094020023941994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.094020111253485e-05, + "grad_norm": 3.514033317565918, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8540769815444946, + "num_tokens": 232791370.0, + "step": 6101 + }, + { + "epoch": 0.7762371199592927, + "ewc_loss": 0.007062200922518969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.062200893415138e-05, + "grad_norm": 3.5377085208892822, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8673157691955566, + "num_tokens": 232830895.0, + "step": 6102 + }, + { + "epoch": 0.7763643302378832, + "ewc_loss": 0.007117013446986675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.117013592505828e-05, + "grad_norm": 3.5322935581207275, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8655957579612732, + "num_tokens": 232869940.0, + "step": 6103 + }, + { + "epoch": 0.7764915405164737, + "ewc_loss": 0.007069679908454418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.069679850246757e-05, + "grad_norm": 3.53224515914917, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8601852059364319, + "num_tokens": 232910743.0, + "step": 6104 + }, + { + "epoch": 0.7766187507950643, + "ewc_loss": 0.007092401385307312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.092401210684329e-05, + "grad_norm": 3.5685999393463135, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8641369342803955, + "num_tokens": 232948428.0, + "step": 6105 + }, + { + "epoch": 0.7767459610736548, + "ewc_loss": 0.007094215601682663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.094215834513307e-05, + "grad_norm": 3.561121702194214, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8719881176948547, + "num_tokens": 232985142.0, + "step": 6106 + }, + { + "epoch": 0.7768731713522452, + "ewc_loss": 0.007079313043504953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.079313218127936e-05, + "grad_norm": 3.5398688316345215, + "learning_rate": 1e-06, + "loss": 0.3661, + 
"mean_token_accuracy": 0.8772350549697876, + "num_tokens": 233020810.0, + "step": 6107 + }, + { + "epoch": 0.7770003816308357, + "ewc_loss": 0.00707646831870079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.07646831870079e-05, + "grad_norm": 3.599008083343506, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8583571910858154, + "num_tokens": 233055674.0, + "step": 6108 + }, + { + "epoch": 0.7771275919094263, + "ewc_loss": 0.007131613325327635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.131613529054448e-05, + "grad_norm": 3.678009271621704, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8607433438301086, + "num_tokens": 233084506.0, + "step": 6109 + }, + { + "epoch": 0.7772548021880168, + "ewc_loss": 0.007149295415729284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.149295561248437e-05, + "grad_norm": 3.5264995098114014, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8819856643676758, + "num_tokens": 233120172.0, + "step": 6110 + }, + { + "epoch": 0.7773820124666073, + "ewc_loss": 0.007024949416518211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.024949445622042e-05, + "grad_norm": 3.5186707973480225, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8569426536560059, + "num_tokens": 233160075.0, + "step": 6111 + }, + { + "epoch": 0.7775092227451978, + "ewc_loss": 0.0071065244264900684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.106524572009221e-05, + "grad_norm": 3.5452234745025635, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8589740991592407, + "num_tokens": 233201714.0, + "step": 6112 + }, + { + "epoch": 0.7776364330237884, + "ewc_loss": 0.007109960075467825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.109960279194638e-05, + "grad_norm": 3.6168367862701416, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8734332323074341, + "num_tokens": 233237642.0, + "step": 6113 + }, + { + "epoch": 
0.7777636433023788, + "ewc_loss": 0.007134280167520046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.134280167520046e-05, + "grad_norm": 3.493656873703003, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8714373707771301, + "num_tokens": 233279712.0, + "step": 6114 + }, + { + "epoch": 0.7778908535809693, + "ewc_loss": 0.007038666866719723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.038666808512062e-05, + "grad_norm": 3.518238067626953, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8713440299034119, + "num_tokens": 233322441.0, + "step": 6115 + }, + { + "epoch": 0.7780180638595598, + "ewc_loss": 0.007098606321960688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.098606147337705e-05, + "grad_norm": 3.5390470027923584, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8605437278747559, + "num_tokens": 233371053.0, + "step": 6116 + }, + { + "epoch": 0.7781452741381504, + "ewc_loss": 0.00708811404183507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.088114216458052e-05, + "grad_norm": 3.63848876953125, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8681105375289917, + "num_tokens": 233405372.0, + "step": 6117 + }, + { + "epoch": 0.7782724844167409, + "ewc_loss": 0.007147419732064009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.1474198193755e-05, + "grad_norm": 3.5335493087768555, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8725703358650208, + "num_tokens": 233447209.0, + "step": 6118 + }, + { + "epoch": 0.7783996946953314, + "ewc_loss": 0.007039160002022982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.039160118438303e-05, + "grad_norm": 3.5363881587982178, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8678839802742004, + "num_tokens": 233484621.0, + "step": 6119 + }, + { + "epoch": 0.7785269049739219, + "ewc_loss": 0.007071800995618105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.071800791891292e-05, + "grad_norm": 3.624537944793701, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8489425182342529, + "num_tokens": 233520756.0, + "step": 6120 + }, + { + "epoch": 0.7786541152525124, + "ewc_loss": 0.007113920524716377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.113920582924038e-05, + "grad_norm": 3.5952985286712646, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8591147661209106, + "num_tokens": 233559120.0, + "step": 6121 + }, + { + "epoch": 0.7787813255311029, + "ewc_loss": 0.007063987199217081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.06398714100942e-05, + "grad_norm": 3.568044900894165, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8774821758270264, + "num_tokens": 233590513.0, + "step": 6122 + }, + { + "epoch": 0.7789085358096934, + "ewc_loss": 0.007064369041472673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.064369128784165e-05, + "grad_norm": 3.559633731842041, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8808512091636658, + "num_tokens": 233625986.0, + "step": 6123 + }, + { + "epoch": 0.779035746088284, + "ewc_loss": 0.007055986672639847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.055986498016864e-05, + "grad_norm": 3.5953712463378906, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8757483959197998, + "num_tokens": 233655097.0, + "step": 6124 + }, + { + "epoch": 0.7791629563668745, + "ewc_loss": 0.007080881390720606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.080881186993793e-05, + "grad_norm": 3.6476798057556152, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8572879433631897, + "num_tokens": 233687815.0, + "step": 6125 + }, + { + "epoch": 0.7792901666454649, + "ewc_loss": 0.007101286668330431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.10128661012277e-05, + "grad_norm": 3.621784210205078, + "learning_rate": 1e-06, + "loss": 0.3539, + 
"mean_token_accuracy": 0.8795355558395386, + "num_tokens": 233722611.0, + "step": 6126 + }, + { + "epoch": 0.7794173769240554, + "ewc_loss": 0.007064321078360081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.064321107463911e-05, + "grad_norm": 3.539454221725464, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8530685901641846, + "num_tokens": 233765345.0, + "step": 6127 + }, + { + "epoch": 0.779544587202646, + "ewc_loss": 0.007038985379040241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.038985495455563e-05, + "grad_norm": 3.540766716003418, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8691492676734924, + "num_tokens": 233804358.0, + "step": 6128 + }, + { + "epoch": 0.7796717974812365, + "ewc_loss": 0.007074305787682533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.074305904097855e-05, + "grad_norm": 3.544733762741089, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8565139770507812, + "num_tokens": 233845117.0, + "step": 6129 + }, + { + "epoch": 0.779799007759827, + "ewc_loss": 0.007062725722789764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.062725489959121e-05, + "grad_norm": 3.554152011871338, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8602942228317261, + "num_tokens": 233884091.0, + "step": 6130 + }, + { + "epoch": 0.7799262180384176, + "ewc_loss": 0.007073038257658482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.073038432281464e-05, + "grad_norm": 3.564439058303833, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8637714385986328, + "num_tokens": 233922590.0, + "step": 6131 + }, + { + "epoch": 0.780053428317008, + "ewc_loss": 0.00707423547282815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.074235327308998e-05, + "grad_norm": 3.612100124359131, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8687257170677185, + "num_tokens": 233954138.0, + "step": 6132 + }, + { + "epoch": 
0.7801806385955985, + "ewc_loss": 0.007099892012774944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.099891809048131e-05, + "grad_norm": 3.5871849060058594, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8760780096054077, + "num_tokens": 233988035.0, + "step": 6133 + }, + { + "epoch": 0.780307848874189, + "ewc_loss": 0.007074093446135521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.074093446135521e-05, + "grad_norm": 3.555928945541382, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8582828044891357, + "num_tokens": 234023747.0, + "step": 6134 + }, + { + "epoch": 0.7804350591527796, + "ewc_loss": 0.0070679946802556515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.067994738463312e-05, + "grad_norm": 3.533129930496216, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8754364252090454, + "num_tokens": 234064182.0, + "step": 6135 + }, + { + "epoch": 0.7805622694313701, + "ewc_loss": 0.0070806024596095085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.08060251781717e-05, + "grad_norm": 3.5624449253082275, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8497636914253235, + "num_tokens": 234106533.0, + "step": 6136 + }, + { + "epoch": 0.7806894797099606, + "ewc_loss": 0.007088563870638609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.088563870638609e-05, + "grad_norm": 3.522024631500244, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8780058026313782, + "num_tokens": 234145492.0, + "step": 6137 + }, + { + "epoch": 0.780816689988551, + "ewc_loss": 0.007068818900734186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.068819104461e-05, + "grad_norm": 3.5973145961761475, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8690710663795471, + "num_tokens": 234181709.0, + "step": 6138 + }, + { + "epoch": 0.7809439002671416, + "ewc_loss": 0.007118006702512503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.118006760720164e-05, + "grad_norm": 3.5538747310638428, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8676795959472656, + "num_tokens": 234217854.0, + "step": 6139 + }, + { + "epoch": 0.7810711105457321, + "ewc_loss": 0.007080784533172846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.080784416757524e-05, + "grad_norm": 3.5465285778045654, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8662313222885132, + "num_tokens": 234255769.0, + "step": 6140 + }, + { + "epoch": 0.7811983208243226, + "ewc_loss": 0.007079737260937691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.079737406456843e-05, + "grad_norm": 3.6089799404144287, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8536617755889893, + "num_tokens": 234290828.0, + "step": 6141 + }, + { + "epoch": 0.7813255311029131, + "ewc_loss": 0.0071294279769063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.129427831387147e-05, + "grad_norm": 3.5986764430999756, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8661985397338867, + "num_tokens": 234326336.0, + "step": 6142 + }, + { + "epoch": 0.7814527413815037, + "ewc_loss": 0.0071105919778347015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.110591832315549e-05, + "grad_norm": 3.559920310974121, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.860355019569397, + "num_tokens": 234366210.0, + "step": 6143 + }, + { + "epoch": 0.7815799516600941, + "ewc_loss": 0.007090700790286064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.090700819389895e-05, + "grad_norm": 3.5903537273406982, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8578965663909912, + "num_tokens": 234398975.0, + "step": 6144 + }, + { + "epoch": 0.7817071619386846, + "ewc_loss": 0.0071231769397854805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.123177056200802e-05, + "grad_norm": 3.5712242126464844, + "learning_rate": 1e-06, + "loss": 0.4182, + 
"mean_token_accuracy": 0.860032320022583, + "num_tokens": 234437746.0, + "step": 6145 + }, + { + "epoch": 0.7818343722172751, + "ewc_loss": 0.007106130011379719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.106130215106532e-05, + "grad_norm": 3.5228118896484375, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8713463544845581, + "num_tokens": 234478402.0, + "step": 6146 + }, + { + "epoch": 0.7819615824958657, + "ewc_loss": 0.007092295680195093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.092295709298924e-05, + "grad_norm": 3.6659138202667236, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8516470193862915, + "num_tokens": 234509699.0, + "step": 6147 + }, + { + "epoch": 0.7820887927744562, + "ewc_loss": 0.007199012208729982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.199012179626152e-05, + "grad_norm": 3.5044281482696533, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8609371185302734, + "num_tokens": 234551515.0, + "step": 6148 + }, + { + "epoch": 0.7822160030530467, + "ewc_loss": 0.0070627713575959206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.06277132849209e-05, + "grad_norm": 3.6133763790130615, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8734972476959229, + "num_tokens": 234582811.0, + "step": 6149 + }, + { + "epoch": 0.7823432133316371, + "ewc_loss": 0.007181680761277676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.181680848589167e-05, + "grad_norm": 3.5404982566833496, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8578033447265625, + "num_tokens": 234625299.0, + "step": 6150 + }, + { + "epoch": 0.7824704236102277, + "ewc_loss": 0.007090263534337282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.090263534337282e-05, + "grad_norm": 3.513169288635254, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8722686767578125, + "num_tokens": 234665641.0, + "step": 6151 + }, + { + "epoch": 
0.7825976338888182, + "ewc_loss": 0.007105057127773762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.10505701135844e-05, + "grad_norm": 3.575960159301758, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8655105233192444, + "num_tokens": 234704668.0, + "step": 6152 + }, + { + "epoch": 0.7827248441674087, + "ewc_loss": 0.007143423892557621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.14342386345379e-05, + "grad_norm": 3.6290910243988037, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8717147707939148, + "num_tokens": 234738080.0, + "step": 6153 + }, + { + "epoch": 0.7828520544459993, + "ewc_loss": 0.007131447549909353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.131447637220845e-05, + "grad_norm": 3.5302579402923584, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8646734952926636, + "num_tokens": 234781077.0, + "step": 6154 + }, + { + "epoch": 0.7829792647245898, + "ewc_loss": 0.007051768712699413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.051768625387922e-05, + "grad_norm": 3.6806721687316895, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8500363230705261, + "num_tokens": 234809374.0, + "step": 6155 + }, + { + "epoch": 0.7831064750031802, + "ewc_loss": 0.0071946908719837666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.194690988399088e-05, + "grad_norm": 3.572110176086426, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8557384014129639, + "num_tokens": 234845585.0, + "step": 6156 + }, + { + "epoch": 0.7832336852817707, + "ewc_loss": 0.0070723798125982285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.072379958117381e-05, + "grad_norm": 3.591539144515991, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8643769025802612, + "num_tokens": 234883891.0, + "step": 6157 + }, + { + "epoch": 0.7833608955603613, + "ewc_loss": 0.007121464237570763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.121464295778424e-05, + "grad_norm": 3.5811502933502197, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8661215305328369, + "num_tokens": 234919445.0, + "step": 6158 + }, + { + "epoch": 0.7834881058389518, + "ewc_loss": 0.007117350585758686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.117350469343364e-05, + "grad_norm": 3.5988571643829346, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8677077889442444, + "num_tokens": 234951465.0, + "step": 6159 + }, + { + "epoch": 0.7836153161175423, + "ewc_loss": 0.007124230731278658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.124230614863336e-05, + "grad_norm": 3.597703456878662, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8575711250305176, + "num_tokens": 234989970.0, + "step": 6160 + }, + { + "epoch": 0.7837425263961328, + "ewc_loss": 0.007108950987458229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.108951103873551e-05, + "grad_norm": 3.5150327682495117, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.867155909538269, + "num_tokens": 235029673.0, + "step": 6161 + }, + { + "epoch": 0.7838697366747234, + "ewc_loss": 0.007081193849444389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.081194053171203e-05, + "grad_norm": 3.7223060131073, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8643282651901245, + "num_tokens": 235065389.0, + "step": 6162 + }, + { + "epoch": 0.7839969469533138, + "ewc_loss": 0.007222734857350588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.222734711831436e-05, + "grad_norm": 3.568448066711426, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8763623237609863, + "num_tokens": 235099795.0, + "step": 6163 + }, + { + "epoch": 0.7841241572319043, + "ewc_loss": 0.007071644999086857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.071645086398348e-05, + "grad_norm": 3.529756546020508, + "learning_rate": 1e-06, + "loss": 0.3912, + 
"mean_token_accuracy": 0.8670951128005981, + "num_tokens": 235137475.0, + "step": 6164 + }, + { + "epoch": 0.7842513675104948, + "ewc_loss": 0.0071131382137537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.113138417480513e-05, + "grad_norm": 3.5602264404296875, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8613423109054565, + "num_tokens": 235178454.0, + "step": 6165 + }, + { + "epoch": 0.7843785777890854, + "ewc_loss": 0.007136577740311623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.136577914934605e-05, + "grad_norm": 3.4860475063323975, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8839196562767029, + "num_tokens": 235216139.0, + "step": 6166 + }, + { + "epoch": 0.7845057880676759, + "ewc_loss": 0.007085287477821112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.085287506924942e-05, + "grad_norm": 3.5391032695770264, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8814449310302734, + "num_tokens": 235253571.0, + "step": 6167 + }, + { + "epoch": 0.7846329983462664, + "ewc_loss": 0.007142036221921444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.142036338336766e-05, + "grad_norm": 3.5746536254882812, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8507673740386963, + "num_tokens": 235293950.0, + "step": 6168 + }, + { + "epoch": 0.7847602086248568, + "ewc_loss": 0.007142641115933657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.142640970414504e-05, + "grad_norm": 3.514551877975464, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8704230785369873, + "num_tokens": 235336419.0, + "step": 6169 + }, + { + "epoch": 0.7848874189034474, + "ewc_loss": 0.007097275462001562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.097275374690071e-05, + "grad_norm": 3.5508170127868652, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8703557252883911, + "num_tokens": 235376039.0, + "step": 6170 + }, + { + "epoch": 
0.7850146291820379, + "ewc_loss": 0.007141540292650461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.141540118027478e-05, + "grad_norm": 3.603135108947754, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8550447821617126, + "num_tokens": 235412736.0, + "step": 6171 + }, + { + "epoch": 0.7851418394606284, + "ewc_loss": 0.0071355318650603294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.135531632229686e-05, + "grad_norm": 3.4838500022888184, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8654520511627197, + "num_tokens": 235457318.0, + "step": 6172 + }, + { + "epoch": 0.785269049739219, + "ewc_loss": 0.007057254668325186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.057254697429016e-05, + "grad_norm": 3.627408742904663, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8557717800140381, + "num_tokens": 235491471.0, + "step": 6173 + }, + { + "epoch": 0.7853962600178095, + "ewc_loss": 0.007180018816143274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.180019019870088e-05, + "grad_norm": 3.5520944595336914, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8768265247344971, + "num_tokens": 235531324.0, + "step": 6174 + }, + { + "epoch": 0.7855234702963999, + "ewc_loss": 0.007073018234223127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.073018059600145e-05, + "grad_norm": 3.5824596881866455, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8537596464157104, + "num_tokens": 235573233.0, + "step": 6175 + }, + { + "epoch": 0.7856506805749904, + "ewc_loss": 0.007116919849067926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.116919732652605e-05, + "grad_norm": 3.4988903999328613, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8665133714675903, + "num_tokens": 235614400.0, + "step": 6176 + }, + { + "epoch": 0.785777890853581, + "ewc_loss": 0.007053568493574858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.053568697301671e-05, + "grad_norm": 3.566633701324463, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8715262413024902, + "num_tokens": 235652247.0, + "step": 6177 + }, + { + "epoch": 0.7859051011321715, + "ewc_loss": 0.007124484516680241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.124484545784071e-05, + "grad_norm": 3.670844316482544, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8629540205001831, + "num_tokens": 235681810.0, + "step": 6178 + }, + { + "epoch": 0.786032311410762, + "ewc_loss": 0.007152360863983631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.152360922191292e-05, + "grad_norm": 3.539289951324463, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8763916492462158, + "num_tokens": 235718422.0, + "step": 6179 + }, + { + "epoch": 0.7861595216893525, + "ewc_loss": 0.007045640144497156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.045640086289495e-05, + "grad_norm": 3.571375608444214, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8614005446434021, + "num_tokens": 235758886.0, + "step": 6180 + }, + { + "epoch": 0.786286731967943, + "ewc_loss": 0.007118685636669397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.118685607565567e-05, + "grad_norm": 3.6242222785949707, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8696738481521606, + "num_tokens": 235792270.0, + "step": 6181 + }, + { + "epoch": 0.7864139422465335, + "ewc_loss": 0.007128855213522911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.128855213522911e-05, + "grad_norm": 3.617905378341675, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8570668697357178, + "num_tokens": 235826678.0, + "step": 6182 + }, + { + "epoch": 0.786541152525124, + "ewc_loss": 0.007098032161593437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.098032074281946e-05, + "grad_norm": 3.5721044540405273, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 
0.878609299659729, + "num_tokens": 235860437.0, + "step": 6183 + }, + { + "epoch": 0.7866683628037145, + "ewc_loss": 0.0070998105220496655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.099810318322852e-05, + "grad_norm": 3.5154666900634766, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8710623979568481, + "num_tokens": 235900865.0, + "step": 6184 + }, + { + "epoch": 0.7867955730823051, + "ewc_loss": 0.007093811873346567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.09381201886572e-05, + "grad_norm": 3.594078302383423, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8517610430717468, + "num_tokens": 235937419.0, + "step": 6185 + }, + { + "epoch": 0.7869227833608956, + "ewc_loss": 0.007156709209084511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.156709034461528e-05, + "grad_norm": 3.5778653621673584, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8570865988731384, + "num_tokens": 235976425.0, + "step": 6186 + }, + { + "epoch": 0.787049993639486, + "ewc_loss": 0.007125113159418106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.125113188521937e-05, + "grad_norm": 3.6273343563079834, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8595037460327148, + "num_tokens": 236012968.0, + "step": 6187 + }, + { + "epoch": 0.7871772039180766, + "ewc_loss": 0.007175165228545666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.175165228545666e-05, + "grad_norm": 3.5727615356445312, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8677551746368408, + "num_tokens": 236052462.0, + "step": 6188 + }, + { + "epoch": 0.7873044141966671, + "ewc_loss": 0.007111037150025368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.111037120921537e-05, + "grad_norm": 3.5739216804504395, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8816808462142944, + "num_tokens": 236087604.0, + "step": 6189 + }, + { + "epoch": 0.7874316244752576, + "ewc_loss": 
0.007143332157284021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.143332186387852e-05, + "grad_norm": 3.5395286083221436, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8664633631706238, + "num_tokens": 236126344.0, + "step": 6190 + }, + { + "epoch": 0.7875588347538481, + "ewc_loss": 0.007107018493115902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.107018609531224e-05, + "grad_norm": 3.5464863777160645, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8738369941711426, + "num_tokens": 236161913.0, + "step": 6191 + }, + { + "epoch": 0.7876860450324387, + "ewc_loss": 0.007128581404685974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.128581637516618e-05, + "grad_norm": 3.6393330097198486, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8699727654457092, + "num_tokens": 236200612.0, + "step": 6192 + }, + { + "epoch": 0.7878132553110291, + "ewc_loss": 0.0071741496212780476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.174149504862726e-05, + "grad_norm": 3.5742194652557373, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8708052635192871, + "num_tokens": 236238136.0, + "step": 6193 + }, + { + "epoch": 0.7879404655896196, + "ewc_loss": 0.0070896572433412075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.089657447068021e-05, + "grad_norm": 3.5375630855560303, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8636043667793274, + "num_tokens": 236281303.0, + "step": 6194 + }, + { + "epoch": 0.7880676758682101, + "ewc_loss": 0.007093323860317469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.093323802109808e-05, + "grad_norm": 3.5751864910125732, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8622262477874756, + "num_tokens": 236322355.0, + "step": 6195 + }, + { + "epoch": 0.7881948861468007, + "ewc_loss": 0.007127969525754452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.127969729481265e-05, + "grad_norm": 
3.6097166538238525, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8655441999435425, + "num_tokens": 236354263.0, + "step": 6196 + }, + { + "epoch": 0.7883220964253912, + "ewc_loss": 0.007136847823858261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.136847852962092e-05, + "grad_norm": 3.5412938594818115, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8676955103874207, + "num_tokens": 236391581.0, + "step": 6197 + }, + { + "epoch": 0.7884493067039817, + "ewc_loss": 0.007085984572768211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.085984543664381e-05, + "grad_norm": 3.594964027404785, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8736487030982971, + "num_tokens": 236426003.0, + "step": 6198 + }, + { + "epoch": 0.7885765169825721, + "ewc_loss": 0.007141624577343464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.141624519135803e-05, + "grad_norm": 3.5335185527801514, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.877448558807373, + "num_tokens": 236464874.0, + "step": 6199 + }, + { + "epoch": 0.7887037272611627, + "ewc_loss": 0.007078331895172596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.078331691445783e-05, + "grad_norm": 3.5912277698516846, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8771378993988037, + "num_tokens": 236499826.0, + "step": 6200 + }, + { + "epoch": 0.7888309375397532, + "ewc_loss": 0.007126472890377045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.126473065000027e-05, + "grad_norm": 3.5563488006591797, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8785684108734131, + "num_tokens": 236538615.0, + "step": 6201 + }, + { + "epoch": 0.7889581478183437, + "ewc_loss": 0.007096103858202696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.096103945514187e-05, + "grad_norm": 3.5556931495666504, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8645256757736206, + 
"num_tokens": 236577860.0, + "step": 6202 + }, + { + "epoch": 0.7890853580969343, + "ewc_loss": 0.007102471776306629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.10247186361812e-05, + "grad_norm": 3.553467273712158, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8799126148223877, + "num_tokens": 236619041.0, + "step": 6203 + }, + { + "epoch": 0.7892125683755248, + "ewc_loss": 0.007094605825841427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.094605825841427e-05, + "grad_norm": 3.565153121948242, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8692193627357483, + "num_tokens": 236655788.0, + "step": 6204 + }, + { + "epoch": 0.7893397786541152, + "ewc_loss": 0.007098793052136898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.09879313944839e-05, + "grad_norm": 3.590883731842041, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8775201439857483, + "num_tokens": 236691532.0, + "step": 6205 + }, + { + "epoch": 0.7894669889327057, + "ewc_loss": 0.007104532327502966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.104532414814457e-05, + "grad_norm": 3.579420804977417, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8741374015808105, + "num_tokens": 236733792.0, + "step": 6206 + }, + { + "epoch": 0.7895941992112963, + "ewc_loss": 0.007092821877449751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.092821761034429e-05, + "grad_norm": 3.498530864715576, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.878807783126831, + "num_tokens": 236775454.0, + "step": 6207 + }, + { + "epoch": 0.7897214094898868, + "ewc_loss": 0.007027085870504379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.027085666777566e-05, + "grad_norm": 3.57999587059021, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8770533204078674, + "num_tokens": 236811572.0, + "step": 6208 + }, + { + "epoch": 0.7898486197684773, + "ewc_loss": 0.0071118236519396305, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.11182365193963e-05, + "grad_norm": 3.588716745376587, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8509213924407959, + "num_tokens": 236851018.0, + "step": 6209 + }, + { + "epoch": 0.7899758300470678, + "ewc_loss": 0.007076173555105925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.076173642417416e-05, + "grad_norm": 3.5203936100006104, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8706817626953125, + "num_tokens": 236891871.0, + "step": 6210 + }, + { + "epoch": 0.7901030403256584, + "ewc_loss": 0.0070435767993330956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.043576624710113e-05, + "grad_norm": 3.620410203933716, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8700953722000122, + "num_tokens": 236926771.0, + "step": 6211 + }, + { + "epoch": 0.7902302506042488, + "ewc_loss": 0.007128979079425335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.128978904802352e-05, + "grad_norm": 3.5715270042419434, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8550946712493896, + "num_tokens": 236964768.0, + "step": 6212 + }, + { + "epoch": 0.7903574608828393, + "ewc_loss": 0.007065651938319206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.065651880111545e-05, + "grad_norm": 3.581878900527954, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8550861477851868, + "num_tokens": 237005075.0, + "step": 6213 + }, + { + "epoch": 0.7904846711614298, + "ewc_loss": 0.007085037417709827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.085037213983014e-05, + "grad_norm": 3.6214044094085693, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8696655035018921, + "num_tokens": 237035517.0, + "step": 6214 + }, + { + "epoch": 0.7906118814400204, + "ewc_loss": 0.007098963484168053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.098963396856561e-05, + "grad_norm": 3.484354257583618, + "learning_rate": 
1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.885013997554779, + "num_tokens": 237076319.0, + "step": 6215 + }, + { + "epoch": 0.7907390917186109, + "ewc_loss": 0.007036836817860603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.036836905172095e-05, + "grad_norm": 3.561351776123047, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8583307266235352, + "num_tokens": 237117935.0, + "step": 6216 + }, + { + "epoch": 0.7908663019972014, + "ewc_loss": 0.007114577107131481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.114576874300838e-05, + "grad_norm": 3.5379273891448975, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.876412034034729, + "num_tokens": 237155789.0, + "step": 6217 + }, + { + "epoch": 0.7909935122757918, + "ewc_loss": 0.00707236910238862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.07236904418096e-05, + "grad_norm": 3.514430284500122, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8724994659423828, + "num_tokens": 237198870.0, + "step": 6218 + }, + { + "epoch": 0.7911207225543824, + "ewc_loss": 0.007059935946017504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.059935887809843e-05, + "grad_norm": 3.539642572402954, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8724486827850342, + "num_tokens": 237243084.0, + "step": 6219 + }, + { + "epoch": 0.7912479328329729, + "ewc_loss": 0.007083599455654621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.083599484758452e-05, + "grad_norm": 3.629335880279541, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8595390319824219, + "num_tokens": 237277245.0, + "step": 6220 + }, + { + "epoch": 0.7913751431115634, + "ewc_loss": 0.007120486348867416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.120486407075077e-05, + "grad_norm": 3.5449821949005127, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8718229532241821, + "num_tokens": 237317841.0, + "step": 6221 + }, + { + 
"epoch": 0.791502353390154, + "ewc_loss": 0.007048776838928461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.04877675161697e-05, + "grad_norm": 3.6809489727020264, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8575345277786255, + "num_tokens": 237349327.0, + "step": 6222 + }, + { + "epoch": 0.7916295636687445, + "ewc_loss": 0.007149685639888048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.149685552576557e-05, + "grad_norm": 3.5682716369628906, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8522793650627136, + "num_tokens": 237396179.0, + "step": 6223 + }, + { + "epoch": 0.7917567739473349, + "ewc_loss": 0.007031860761344433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.031860877759755e-05, + "grad_norm": 3.5659849643707275, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8632142543792725, + "num_tokens": 237437766.0, + "step": 6224 + }, + { + "epoch": 0.7918839842259254, + "ewc_loss": 0.007057431619614363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.057431503199041e-05, + "grad_norm": 3.5417680740356445, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8717490434646606, + "num_tokens": 237479239.0, + "step": 6225 + }, + { + "epoch": 0.792011194504516, + "ewc_loss": 0.00704346364364028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.043463847367093e-05, + "grad_norm": 3.601363182067871, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8800507187843323, + "num_tokens": 237514641.0, + "step": 6226 + }, + { + "epoch": 0.7921384047831065, + "ewc_loss": 0.007051152177155018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.051152351778e-05, + "grad_norm": 3.52199649810791, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8770490288734436, + "num_tokens": 237555319.0, + "step": 6227 + }, + { + "epoch": 0.792265615061697, + "ewc_loss": 0.007000190671533346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.000190817052498e-05, + "grad_norm": 3.552682399749756, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8698329925537109, + "num_tokens": 237597786.0, + "step": 6228 + }, + { + "epoch": 0.7923928253402875, + "ewc_loss": 0.00702664814889431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.026648381724954e-05, + "grad_norm": 3.5303571224212646, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8762352466583252, + "num_tokens": 237636584.0, + "step": 6229 + }, + { + "epoch": 0.792520035618878, + "ewc_loss": 0.007022856269031763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.022856152616441e-05, + "grad_norm": 3.5729992389678955, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8589185476303101, + "num_tokens": 237676000.0, + "step": 6230 + }, + { + "epoch": 0.7926472458974685, + "ewc_loss": 0.007035271264612675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.035271119093522e-05, + "grad_norm": 3.5561747550964355, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8927456140518188, + "num_tokens": 237713003.0, + "step": 6231 + }, + { + "epoch": 0.792774456176059, + "ewc_loss": 0.007019845303148031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.019845361355692e-05, + "grad_norm": 3.702540636062622, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8691584467887878, + "num_tokens": 237742054.0, + "step": 6232 + }, + { + "epoch": 0.7929016664546495, + "ewc_loss": 0.007114325184375048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.114325126167387e-05, + "grad_norm": 3.5775082111358643, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8607056140899658, + "num_tokens": 237783147.0, + "step": 6233 + }, + { + "epoch": 0.7930288767332401, + "ewc_loss": 0.006990518886595964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.990518886595964e-05, + "grad_norm": 3.6382744312286377, + "learning_rate": 1e-06, + "loss": 0.3976, + 
"mean_token_accuracy": 0.863563060760498, + "num_tokens": 237814322.0, + "step": 6234 + }, + { + "epoch": 0.7931560870118306, + "ewc_loss": 0.007085115183144808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.085115066729486e-05, + "grad_norm": 3.569457530975342, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8735175728797913, + "num_tokens": 237849278.0, + "step": 6235 + }, + { + "epoch": 0.793283297290421, + "ewc_loss": 0.007019917946308851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.019918120931834e-05, + "grad_norm": 3.534550189971924, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8704705238342285, + "num_tokens": 237893434.0, + "step": 6236 + }, + { + "epoch": 0.7934105075690115, + "ewc_loss": 0.0070461188443005085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.046118844300508e-05, + "grad_norm": 3.5637929439544678, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8752192854881287, + "num_tokens": 237932135.0, + "step": 6237 + }, + { + "epoch": 0.7935377178476021, + "ewc_loss": 0.007063799537718296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.063799421302974e-05, + "grad_norm": 3.6198630332946777, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8608787059783936, + "num_tokens": 237966320.0, + "step": 6238 + }, + { + "epoch": 0.7936649281261926, + "ewc_loss": 0.007091518025845289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.091517909429967e-05, + "grad_norm": 3.526165723800659, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8709962964057922, + "num_tokens": 238006365.0, + "step": 6239 + }, + { + "epoch": 0.7937921384047831, + "ewc_loss": 0.007025254424661398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.025254308246076e-05, + "grad_norm": 3.6018102169036865, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8667379021644592, + "num_tokens": 238039307.0, + "step": 6240 + }, + { + "epoch": 
0.7939193486833737, + "ewc_loss": 0.007110359612852335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.110359729267657e-05, + "grad_norm": 3.5770418643951416, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8527612686157227, + "num_tokens": 238078015.0, + "step": 6241 + }, + { + "epoch": 0.7940465589619641, + "ewc_loss": 0.00708782160654664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.087821722961962e-05, + "grad_norm": 3.6417391300201416, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8647456765174866, + "num_tokens": 238116790.0, + "step": 6242 + }, + { + "epoch": 0.7941737692405546, + "ewc_loss": 0.007125678472220898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.125678530428559e-05, + "grad_norm": 3.5208256244659424, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8737001419067383, + "num_tokens": 238155330.0, + "step": 6243 + }, + { + "epoch": 0.7943009795191451, + "ewc_loss": 0.007061453070491552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0614529249724e-05, + "grad_norm": 3.5713443756103516, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.852901816368103, + "num_tokens": 238194492.0, + "step": 6244 + }, + { + "epoch": 0.7944281897977357, + "ewc_loss": 0.0071222903206944466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.122290116967633e-05, + "grad_norm": 3.5472826957702637, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8678739070892334, + "num_tokens": 238230451.0, + "step": 6245 + }, + { + "epoch": 0.7945554000763262, + "ewc_loss": 0.007089861202985048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.089861173881218e-05, + "grad_norm": 3.548496723175049, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8755654096603394, + "num_tokens": 238266331.0, + "step": 6246 + }, + { + "epoch": 0.7946826103549167, + "ewc_loss": 0.007105465047061443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.105465192580596e-05, + "grad_norm": 3.542219638824463, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8556420207023621, + "num_tokens": 238307753.0, + "step": 6247 + }, + { + "epoch": 0.7948098206335071, + "ewc_loss": 0.007091929204761982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.091929001035169e-05, + "grad_norm": 3.56882381439209, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8659921288490295, + "num_tokens": 238349644.0, + "step": 6248 + }, + { + "epoch": 0.7949370309120977, + "ewc_loss": 0.007114337291568518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.114337495295331e-05, + "grad_norm": 3.60493803024292, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8540582060813904, + "num_tokens": 238386308.0, + "step": 6249 + }, + { + "epoch": 0.7950642411906882, + "ewc_loss": 0.00712508475407958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.125084812287241e-05, + "grad_norm": 3.520073652267456, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8523210287094116, + "num_tokens": 238427935.0, + "step": 6250 + }, + { + "epoch": 0.7951914514692787, + "ewc_loss": 0.007056870497763157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.056870526866987e-05, + "grad_norm": 3.5912396907806396, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8726627230644226, + "num_tokens": 238463639.0, + "step": 6251 + }, + { + "epoch": 0.7953186617478692, + "ewc_loss": 0.007130894809961319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.130894664442167e-05, + "grad_norm": 3.541609048843384, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8801429271697998, + "num_tokens": 238498295.0, + "step": 6252 + }, + { + "epoch": 0.7954458720264598, + "ewc_loss": 0.007070101797580719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.07010185578838e-05, + "grad_norm": 3.5787758827209473, + "learning_rate": 1e-06, + "loss": 0.3944, + 
"mean_token_accuracy": 0.8671289682388306, + "num_tokens": 238538109.0, + "step": 6253 + }, + { + "epoch": 0.7955730823050502, + "ewc_loss": 0.007116355933248997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.116355845937505e-05, + "grad_norm": 3.5885252952575684, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8533495664596558, + "num_tokens": 238574415.0, + "step": 6254 + }, + { + "epoch": 0.7957002925836407, + "ewc_loss": 0.007123148534446955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.123148679966107e-05, + "grad_norm": 3.5247879028320312, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8696231245994568, + "num_tokens": 238616368.0, + "step": 6255 + }, + { + "epoch": 0.7958275028622313, + "ewc_loss": 0.007065875455737114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.065875252010301e-05, + "grad_norm": 3.558652639389038, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8709309101104736, + "num_tokens": 238650634.0, + "step": 6256 + }, + { + "epoch": 0.7959547131408218, + "ewc_loss": 0.007123488467186689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.12348846718669e-05, + "grad_norm": 3.62941575050354, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8623038530349731, + "num_tokens": 238684383.0, + "step": 6257 + }, + { + "epoch": 0.7960819234194123, + "ewc_loss": 0.007155476603657007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.155476487241685e-05, + "grad_norm": 3.54311203956604, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8651732206344604, + "num_tokens": 238722863.0, + "step": 6258 + }, + { + "epoch": 0.7962091336980028, + "ewc_loss": 0.007093704305589199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.09370433469303e-05, + "grad_norm": 3.5469470024108887, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8685787916183472, + "num_tokens": 238765027.0, + "step": 6259 + }, + { + "epoch": 
0.7963363439765934, + "ewc_loss": 0.0071266163140535355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.126616401365027e-05, + "grad_norm": 3.5859320163726807, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8707305788993835, + "num_tokens": 238801950.0, + "step": 6260 + }, + { + "epoch": 0.7964635542551838, + "ewc_loss": 0.007140644360333681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.140644447645172e-05, + "grad_norm": 3.5474514961242676, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8580590486526489, + "num_tokens": 238840963.0, + "step": 6261 + }, + { + "epoch": 0.7965907645337743, + "ewc_loss": 0.00709378719329834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.093787280609831e-05, + "grad_norm": 3.624022960662842, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8552750945091248, + "num_tokens": 238876167.0, + "step": 6262 + }, + { + "epoch": 0.7967179748123648, + "ewc_loss": 0.00716344453394413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.163444388424978e-05, + "grad_norm": 3.572087049484253, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8725266456604004, + "num_tokens": 238910293.0, + "step": 6263 + }, + { + "epoch": 0.7968451850909554, + "ewc_loss": 0.0071113454177975655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.111345621524379e-05, + "grad_norm": 3.55318284034729, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8637092709541321, + "num_tokens": 238947279.0, + "step": 6264 + }, + { + "epoch": 0.7969723953695459, + "ewc_loss": 0.007111655548214912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.111655577318743e-05, + "grad_norm": 3.6048786640167236, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8548083305358887, + "num_tokens": 238984148.0, + "step": 6265 + }, + { + "epoch": 0.7970996056481364, + "ewc_loss": 0.007155491039156914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.155491039156914e-05, + "grad_norm": 3.5282845497131348, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8524378538131714, + "num_tokens": 239022370.0, + "step": 6266 + }, + { + "epoch": 0.7972268159267268, + "ewc_loss": 0.007092554587870836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.092554733389989e-05, + "grad_norm": 3.522465229034424, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8630189895629883, + "num_tokens": 239064542.0, + "step": 6267 + }, + { + "epoch": 0.7973540262053174, + "ewc_loss": 0.00712701166048646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.127011485863477e-05, + "grad_norm": 3.537834882736206, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8655137419700623, + "num_tokens": 239103193.0, + "step": 6268 + }, + { + "epoch": 0.7974812364839079, + "ewc_loss": 0.007136030588299036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.136030762922019e-05, + "grad_norm": 3.5822577476501465, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8752844929695129, + "num_tokens": 239137952.0, + "step": 6269 + }, + { + "epoch": 0.7976084467624984, + "ewc_loss": 0.007163305301219225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.163305417634547e-05, + "grad_norm": 3.5728213787078857, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.85698401927948, + "num_tokens": 239174600.0, + "step": 6270 + }, + { + "epoch": 0.797735657041089, + "ewc_loss": 0.007137390784919262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.13739063940011e-05, + "grad_norm": 3.6147639751434326, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8718059062957764, + "num_tokens": 239206157.0, + "step": 6271 + }, + { + "epoch": 0.7978628673196795, + "ewc_loss": 0.0071747140027582645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.174714119173586e-05, + "grad_norm": 3.5406768321990967, + "learning_rate": 1e-06, + "loss": 0.3731, + 
"mean_token_accuracy": 0.8728273510932922, + "num_tokens": 239243094.0, + "step": 6272 + }, + { + "epoch": 0.7979900775982699, + "ewc_loss": 0.007138458546251059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.138458749977872e-05, + "grad_norm": 3.572019338607788, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8688094615936279, + "num_tokens": 239281208.0, + "step": 6273 + }, + { + "epoch": 0.7981172878768604, + "ewc_loss": 0.0071777417324483395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.177741645136848e-05, + "grad_norm": 3.559976816177368, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8635898232460022, + "num_tokens": 239320159.0, + "step": 6274 + }, + { + "epoch": 0.798244498155451, + "ewc_loss": 0.007154129911214113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.1541297074873e-05, + "grad_norm": 3.5581581592559814, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8559077978134155, + "num_tokens": 239360160.0, + "step": 6275 + }, + { + "epoch": 0.7983717084340415, + "ewc_loss": 0.00713703315705061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.137033389881253e-05, + "grad_norm": 3.5501160621643066, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8677607774734497, + "num_tokens": 239398101.0, + "step": 6276 + }, + { + "epoch": 0.798498918712632, + "ewc_loss": 0.0071284412406384945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.128441211534664e-05, + "grad_norm": 3.5081396102905273, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8820302486419678, + "num_tokens": 239438560.0, + "step": 6277 + }, + { + "epoch": 0.7986261289912225, + "ewc_loss": 0.007093186955899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.093187014106661e-05, + "grad_norm": 3.597033739089966, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8616607189178467, + "num_tokens": 239475851.0, + "step": 6278 + }, + { + "epoch": 0.798753339269813, 
+ "ewc_loss": 0.007175190839916468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.175190694397315e-05, + "grad_norm": 3.5025556087493896, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8731437921524048, + "num_tokens": 239519165.0, + "step": 6279 + }, + { + "epoch": 0.7988805495484035, + "ewc_loss": 0.007084495387971401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.084495155140758e-05, + "grad_norm": 3.5701076984405518, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8542119264602661, + "num_tokens": 239560602.0, + "step": 6280 + }, + { + "epoch": 0.799007759826994, + "ewc_loss": 0.00714344996958971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.143450056901202e-05, + "grad_norm": 3.6257972717285156, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8691707253456116, + "num_tokens": 239591933.0, + "step": 6281 + }, + { + "epoch": 0.7991349701055845, + "ewc_loss": 0.007161347661167383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.16134745744057e-05, + "grad_norm": 3.6293976306915283, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8607562780380249, + "num_tokens": 239625495.0, + "step": 6282 + }, + { + "epoch": 0.7992621803841751, + "ewc_loss": 0.007136494852602482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.136494969017804e-05, + "grad_norm": 3.5415682792663574, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8816946148872375, + "num_tokens": 239658249.0, + "step": 6283 + }, + { + "epoch": 0.7993893906627656, + "ewc_loss": 0.007094669155776501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.09466912667267e-05, + "grad_norm": 3.620351791381836, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8577347993850708, + "num_tokens": 239690194.0, + "step": 6284 + }, + { + "epoch": 0.799516600941356, + "ewc_loss": 0.007175012957304716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.175013161031529e-05, + "grad_norm": 
3.532285451889038, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8642266392707825, + "num_tokens": 239728315.0, + "step": 6285 + }, + { + "epoch": 0.7996438112199465, + "ewc_loss": 0.0070917378179728985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.091737643349916e-05, + "grad_norm": 3.554729461669922, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8712285757064819, + "num_tokens": 239764505.0, + "step": 6286 + }, + { + "epoch": 0.7997710214985371, + "ewc_loss": 0.0071503715589642525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.150371675379574e-05, + "grad_norm": 3.5070252418518066, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8522008061408997, + "num_tokens": 239809461.0, + "step": 6287 + }, + { + "epoch": 0.7998982317771276, + "ewc_loss": 0.007116745226085186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.116745109669864e-05, + "grad_norm": 3.6155292987823486, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.861813485622406, + "num_tokens": 239845232.0, + "step": 6288 + }, + { + "epoch": 0.8000254420557181, + "ewc_loss": 0.007185911759734154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.185911817941815e-05, + "grad_norm": 3.517160654067993, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8733326196670532, + "num_tokens": 239885489.0, + "step": 6289 + }, + { + "epoch": 0.8001526523343087, + "ewc_loss": 0.007097296416759491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.097296474967152e-05, + "grad_norm": 3.6355340480804443, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8541762232780457, + "num_tokens": 239921159.0, + "step": 6290 + }, + { + "epoch": 0.8002798626128991, + "ewc_loss": 0.007210745941847563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.210746116470546e-05, + "grad_norm": 3.5591509342193604, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8719560503959656, + 
"num_tokens": 239960879.0, + "step": 6291 + }, + { + "epoch": 0.8004070728914896, + "ewc_loss": 0.00711109908297658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.111098966561258e-05, + "grad_norm": 3.64621901512146, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8573329448699951, + "num_tokens": 239994302.0, + "step": 6292 + }, + { + "epoch": 0.8005342831700801, + "ewc_loss": 0.007187338545918465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.187338633229956e-05, + "grad_norm": 3.5670392513275146, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8736417889595032, + "num_tokens": 240027759.0, + "step": 6293 + }, + { + "epoch": 0.8006614934486707, + "ewc_loss": 0.007128225639462471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.128225843189284e-05, + "grad_norm": 3.5810306072235107, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8622831106185913, + "num_tokens": 240065049.0, + "step": 6294 + }, + { + "epoch": 0.8007887037272612, + "ewc_loss": 0.007169107440859079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.169107266236097e-05, + "grad_norm": 3.677201986312866, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8523610234260559, + "num_tokens": 240101453.0, + "step": 6295 + }, + { + "epoch": 0.8009159140058517, + "ewc_loss": 0.00721447728574276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.214477227535099e-05, + "grad_norm": 3.609771490097046, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8465762138366699, + "num_tokens": 240136831.0, + "step": 6296 + }, + { + "epoch": 0.8010431242844421, + "ewc_loss": 0.007150030229240656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.150030432967469e-05, + "grad_norm": 3.600085973739624, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8651986122131348, + "num_tokens": 240173364.0, + "step": 6297 + }, + { + "epoch": 0.8011703345630327, + "ewc_loss": 0.007156624458730221, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.156624633353204e-05, + "grad_norm": 3.6291091442108154, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8468601703643799, + "num_tokens": 240206718.0, + "step": 6298 + }, + { + "epoch": 0.8012975448416232, + "ewc_loss": 0.007183588109910488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.183587877079844e-05, + "grad_norm": 3.554704427719116, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8550039529800415, + "num_tokens": 240245633.0, + "step": 6299 + }, + { + "epoch": 0.8014247551202137, + "ewc_loss": 0.0071481927298009396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.148192526074126e-05, + "grad_norm": 3.537165403366089, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8815963864326477, + "num_tokens": 240283887.0, + "step": 6300 + }, + { + "epoch": 0.8015519653988042, + "ewc_loss": 0.007160751614719629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.160751556511968e-05, + "grad_norm": 3.5425257682800293, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8575183153152466, + "num_tokens": 240322208.0, + "step": 6301 + }, + { + "epoch": 0.8016791756773948, + "ewc_loss": 0.007168745622038841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.168745651142672e-05, + "grad_norm": 3.5893521308898926, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8709001541137695, + "num_tokens": 240356154.0, + "step": 6302 + }, + { + "epoch": 0.8018063859559852, + "ewc_loss": 0.007203071378171444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.203071436379105e-05, + "grad_norm": 3.633857250213623, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8648777008056641, + "num_tokens": 240389103.0, + "step": 6303 + }, + { + "epoch": 0.8019335962345757, + "ewc_loss": 0.007207568734884262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.207568705780432e-05, + "grad_norm": 3.5124032497406006, + 
"learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8829406499862671, + "num_tokens": 240429545.0, + "step": 6304 + }, + { + "epoch": 0.8020608065131662, + "ewc_loss": 0.007129526697099209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.1295267844107e-05, + "grad_norm": 3.535911798477173, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8821073770523071, + "num_tokens": 240468556.0, + "step": 6305 + }, + { + "epoch": 0.8021880167917568, + "ewc_loss": 0.007189965341240168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.189965253928676e-05, + "grad_norm": 3.5403170585632324, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8680647015571594, + "num_tokens": 240508534.0, + "step": 6306 + }, + { + "epoch": 0.8023152270703473, + "ewc_loss": 0.007182214874774218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.182214903878048e-05, + "grad_norm": 3.625180959701538, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8655872344970703, + "num_tokens": 240540833.0, + "step": 6307 + }, + { + "epoch": 0.8024424373489378, + "ewc_loss": 0.007208801805973053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.208801980596036e-05, + "grad_norm": 3.5831995010375977, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8578106164932251, + "num_tokens": 240577768.0, + "step": 6308 + }, + { + "epoch": 0.8025696476275284, + "ewc_loss": 0.007178605534136295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.178605301305652e-05, + "grad_norm": 3.61745023727417, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8617758750915527, + "num_tokens": 240610607.0, + "step": 6309 + }, + { + "epoch": 0.8026968579061188, + "ewc_loss": 0.007207977585494518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.207977614598349e-05, + "grad_norm": 3.616274833679199, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8626464009284973, + "num_tokens": 240646056.0, + 
"step": 6310 + }, + { + "epoch": 0.8028240681847093, + "ewc_loss": 0.007200913969427347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.2009141149465e-05, + "grad_norm": 3.569390058517456, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8740706443786621, + "num_tokens": 240682513.0, + "step": 6311 + }, + { + "epoch": 0.8029512784632998, + "ewc_loss": 0.007174342405050993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.174342317739502e-05, + "grad_norm": 3.6376028060913086, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8669587969779968, + "num_tokens": 240716311.0, + "step": 6312 + }, + { + "epoch": 0.8030784887418904, + "ewc_loss": 0.007234671618789434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.234671647893265e-05, + "grad_norm": 3.5528299808502197, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8569916486740112, + "num_tokens": 240758873.0, + "step": 6313 + }, + { + "epoch": 0.8032056990204809, + "ewc_loss": 0.007151080295443535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.151080353651196e-05, + "grad_norm": 3.5520286560058594, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8715185523033142, + "num_tokens": 240797163.0, + "step": 6314 + }, + { + "epoch": 0.8033329092990714, + "ewc_loss": 0.007190729025751352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.190729229478166e-05, + "grad_norm": 3.5517587661743164, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8780539035797119, + "num_tokens": 240835719.0, + "step": 6315 + }, + { + "epoch": 0.8034601195776618, + "ewc_loss": 0.007190344855189323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.190345058916137e-05, + "grad_norm": 3.570401430130005, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8592292070388794, + "num_tokens": 240874359.0, + "step": 6316 + }, + { + "epoch": 0.8035873298562524, + "ewc_loss": 0.007177213206887245, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.177213410614058e-05, + "grad_norm": 3.55562424659729, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8794912099838257, + "num_tokens": 240909125.0, + "step": 6317 + }, + { + "epoch": 0.8037145401348429, + "ewc_loss": 0.007178482133895159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.178482337621972e-05, + "grad_norm": 3.612867832183838, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8714240789413452, + "num_tokens": 240949391.0, + "step": 6318 + }, + { + "epoch": 0.8038417504134334, + "ewc_loss": 0.0072227967903018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.222796557471156e-05, + "grad_norm": 3.6005685329437256, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8551366925239563, + "num_tokens": 240987390.0, + "step": 6319 + }, + { + "epoch": 0.803968960692024, + "ewc_loss": 0.007172944489866495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.172944606281817e-05, + "grad_norm": 3.57008695602417, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8603041172027588, + "num_tokens": 241025359.0, + "step": 6320 + }, + { + "epoch": 0.8040961709706145, + "ewc_loss": 0.007160093169659376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.160093082347885e-05, + "grad_norm": 3.6375765800476074, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8612615466117859, + "num_tokens": 241060532.0, + "step": 6321 + }, + { + "epoch": 0.8042233812492049, + "ewc_loss": 0.00720964977517724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.209649629658088e-05, + "grad_norm": 3.528679132461548, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8731658458709717, + "num_tokens": 241098547.0, + "step": 6322 + }, + { + "epoch": 0.8043505915277954, + "ewc_loss": 0.007124570198357105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.124570402083918e-05, + "grad_norm": 3.6327104568481445, + "learning_rate": 1e-06, + "loss": 0.4121, + 
"mean_token_accuracy": 0.8665379881858826, + "num_tokens": 241131091.0, + "step": 6323 + }, + { + "epoch": 0.804477801806386, + "ewc_loss": 0.007235662080347538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.235661905724555e-05, + "grad_norm": 3.6731674671173096, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8560013175010681, + "num_tokens": 241164495.0, + "step": 6324 + }, + { + "epoch": 0.8046050120849765, + "ewc_loss": 0.007217065431177616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.217065285658464e-05, + "grad_norm": 3.5784695148468018, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8606541156768799, + "num_tokens": 241203777.0, + "step": 6325 + }, + { + "epoch": 0.804732222363567, + "ewc_loss": 0.007128985598683357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.128985453164205e-05, + "grad_norm": 3.5522239208221436, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.860281765460968, + "num_tokens": 241243219.0, + "step": 6326 + }, + { + "epoch": 0.8048594326421575, + "ewc_loss": 0.007180658634752035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.180658576544374e-05, + "grad_norm": 3.528873920440674, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8505948781967163, + "num_tokens": 241284547.0, + "step": 6327 + }, + { + "epoch": 0.804986642920748, + "ewc_loss": 0.00716162845492363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.161628309404477e-05, + "grad_norm": 3.5456953048706055, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8497198820114136, + "num_tokens": 241331110.0, + "step": 6328 + }, + { + "epoch": 0.8051138531993385, + "ewc_loss": 0.007160339504480362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.160339737311006e-05, + "grad_norm": 3.539478063583374, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8663942813873291, + "num_tokens": 241372720.0, + "step": 6329 + }, + { + "epoch": 
0.805241063477929, + "ewc_loss": 0.007161058485507965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.161058601923287e-05, + "grad_norm": 3.5764832496643066, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8673410415649414, + "num_tokens": 241408173.0, + "step": 6330 + }, + { + "epoch": 0.8053682737565195, + "ewc_loss": 0.007179084233939648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.179084059316665e-05, + "grad_norm": 3.545445680618286, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8721587657928467, + "num_tokens": 241445021.0, + "step": 6331 + }, + { + "epoch": 0.8054954840351101, + "ewc_loss": 0.007152283098548651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.15228306944482e-05, + "grad_norm": 3.5858235359191895, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8709096908569336, + "num_tokens": 241481874.0, + "step": 6332 + }, + { + "epoch": 0.8056226943137006, + "ewc_loss": 0.007178985048085451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.178985106293112e-05, + "grad_norm": 3.5560545921325684, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.850828230381012, + "num_tokens": 241523140.0, + "step": 6333 + }, + { + "epoch": 0.805749904592291, + "ewc_loss": 0.007145620416849852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.145620475057513e-05, + "grad_norm": 3.5795488357543945, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.849085807800293, + "num_tokens": 241563424.0, + "step": 6334 + }, + { + "epoch": 0.8058771148708815, + "ewc_loss": 0.007157885003834963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.15788482921198e-05, + "grad_norm": 3.556971788406372, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8784875273704529, + "num_tokens": 241600004.0, + "step": 6335 + }, + { + "epoch": 0.8060043251494721, + "ewc_loss": 0.007125952746719122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.125952834030613e-05, + "grad_norm": 3.534738540649414, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8644609451293945, + "num_tokens": 241642704.0, + "step": 6336 + }, + { + "epoch": 0.8061315354280626, + "ewc_loss": 0.007119625806808472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.11962566128932e-05, + "grad_norm": 3.562300205230713, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8708226084709167, + "num_tokens": 241679176.0, + "step": 6337 + }, + { + "epoch": 0.8062587457066531, + "ewc_loss": 0.007133047562092543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.133047620300204e-05, + "grad_norm": 3.5443241596221924, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8705066442489624, + "num_tokens": 241721168.0, + "step": 6338 + }, + { + "epoch": 0.8063859559852437, + "ewc_loss": 0.007105669006705284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.105668919393793e-05, + "grad_norm": 3.54622483253479, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8795080780982971, + "num_tokens": 241759735.0, + "step": 6339 + }, + { + "epoch": 0.8065131662638341, + "ewc_loss": 0.0070986379869282246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.098638161551207e-05, + "grad_norm": 3.52923846244812, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8665221929550171, + "num_tokens": 241799445.0, + "step": 6340 + }, + { + "epoch": 0.8066403765424246, + "ewc_loss": 0.007098003756254911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.09800369804725e-05, + "grad_norm": 3.576528549194336, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8695797920227051, + "num_tokens": 241839333.0, + "step": 6341 + }, + { + "epoch": 0.8067675868210151, + "ewc_loss": 0.0071101319044828415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.110131991794333e-05, + "grad_norm": 3.553307056427002, + "learning_rate": 1e-06, + "loss": 0.4375, + 
"mean_token_accuracy": 0.8536233305931091, + "num_tokens": 241880248.0, + "step": 6342 + }, + { + "epoch": 0.8068947970996057, + "ewc_loss": 0.007079778239130974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.079778151819482e-05, + "grad_norm": 3.5052757263183594, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8775700926780701, + "num_tokens": 241926717.0, + "step": 6343 + }, + { + "epoch": 0.8070220073781962, + "ewc_loss": 0.007052665576338768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.052665750961751e-05, + "grad_norm": 3.587721586227417, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8565206527709961, + "num_tokens": 241966080.0, + "step": 6344 + }, + { + "epoch": 0.8071492176567867, + "ewc_loss": 0.0071125212125480175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.112521416274831e-05, + "grad_norm": 3.5383644104003906, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8629388809204102, + "num_tokens": 242011975.0, + "step": 6345 + }, + { + "epoch": 0.8072764279353771, + "ewc_loss": 0.007048733998090029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.048733823467046e-05, + "grad_norm": 3.5712037086486816, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.879090428352356, + "num_tokens": 242046320.0, + "step": 6346 + }, + { + "epoch": 0.8074036382139677, + "ewc_loss": 0.00707126734778285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.071267464198172e-05, + "grad_norm": 3.56575083732605, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.866666316986084, + "num_tokens": 242085157.0, + "step": 6347 + }, + { + "epoch": 0.8075308484925582, + "ewc_loss": 0.0070565505884587765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.056550384731963e-05, + "grad_norm": 3.662940740585327, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8622157573699951, + "num_tokens": 242116548.0, + "step": 6348 + }, + { + "epoch": 
0.8076580587711487, + "ewc_loss": 0.007127499207854271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.127498975023627e-05, + "grad_norm": 3.5804498195648193, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8729350566864014, + "num_tokens": 242151883.0, + "step": 6349 + }, + { + "epoch": 0.8077852690497392, + "ewc_loss": 0.007051188498735428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.051188731566072e-05, + "grad_norm": 3.5613327026367188, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8632074594497681, + "num_tokens": 242190486.0, + "step": 6350 + }, + { + "epoch": 0.8079124793283298, + "ewc_loss": 0.007067379076033831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.067379192449152e-05, + "grad_norm": 3.5192489624023438, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8703160881996155, + "num_tokens": 242232720.0, + "step": 6351 + }, + { + "epoch": 0.8080396896069202, + "ewc_loss": 0.007059212774038315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.059212657622993e-05, + "grad_norm": 3.5677883625030518, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8682676553726196, + "num_tokens": 242273926.0, + "step": 6352 + }, + { + "epoch": 0.8081668998855107, + "ewc_loss": 0.007068191189318895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.068191189318895e-05, + "grad_norm": 3.5361194610595703, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8562940359115601, + "num_tokens": 242313484.0, + "step": 6353 + }, + { + "epoch": 0.8082941101641012, + "ewc_loss": 0.007067216094583273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.067216210998595e-05, + "grad_norm": 3.6265599727630615, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8612580895423889, + "num_tokens": 242352101.0, + "step": 6354 + }, + { + "epoch": 0.8084213204426918, + "ewc_loss": 0.007111994083970785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.111993909347802e-05, + "grad_norm": 3.4907658100128174, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8810445070266724, + "num_tokens": 242392958.0, + "step": 6355 + }, + { + "epoch": 0.8085485307212823, + "ewc_loss": 0.007010727655142546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.010727858869359e-05, + "grad_norm": 3.542489767074585, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8678432106971741, + "num_tokens": 242432327.0, + "step": 6356 + }, + { + "epoch": 0.8086757409998728, + "ewc_loss": 0.007101600989699364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.101600931491703e-05, + "grad_norm": 3.580310583114624, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8455363512039185, + "num_tokens": 242473714.0, + "step": 6357 + }, + { + "epoch": 0.8088029512784632, + "ewc_loss": 0.007095505017787218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.09550513420254e-05, + "grad_norm": 3.6000983715057373, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8740861415863037, + "num_tokens": 242513179.0, + "step": 6358 + }, + { + "epoch": 0.8089301615570538, + "ewc_loss": 0.007081582676619291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0815825893078e-05, + "grad_norm": 3.5640761852264404, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8669875860214233, + "num_tokens": 242549398.0, + "step": 6359 + }, + { + "epoch": 0.8090573718356443, + "ewc_loss": 0.007068270351737738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.06827049725689e-05, + "grad_norm": 3.571470022201538, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8617024421691895, + "num_tokens": 242583705.0, + "step": 6360 + }, + { + "epoch": 0.8091845821142348, + "ewc_loss": 0.007077925838530064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.077925693010911e-05, + "grad_norm": 3.5232818126678467, + "learning_rate": 1e-06, + "loss": 0.3847, + 
"mean_token_accuracy": 0.8707568049430847, + "num_tokens": 242625900.0, + "step": 6361 + }, + { + "epoch": 0.8093117923928254, + "ewc_loss": 0.0070485337637364864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.048533734632656e-05, + "grad_norm": 3.562953472137451, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8558336496353149, + "num_tokens": 242665766.0, + "step": 6362 + }, + { + "epoch": 0.8094390026714159, + "ewc_loss": 0.007094543427228928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.094543252605945e-05, + "grad_norm": 3.5365567207336426, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8692837357521057, + "num_tokens": 242707634.0, + "step": 6363 + }, + { + "epoch": 0.8095662129500064, + "ewc_loss": 0.007064523175358772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.064523379085585e-05, + "grad_norm": 3.5817179679870605, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8618578314781189, + "num_tokens": 242746791.0, + "step": 6364 + }, + { + "epoch": 0.8096934232285968, + "ewc_loss": 0.007092950399965048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0929505454842e-05, + "grad_norm": 3.53285551071167, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8781181573867798, + "num_tokens": 242784861.0, + "step": 6365 + }, + { + "epoch": 0.8098206335071874, + "ewc_loss": 0.007041908334940672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.04190824762918e-05, + "grad_norm": 3.5737216472625732, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8813650608062744, + "num_tokens": 242817612.0, + "step": 6366 + }, + { + "epoch": 0.8099478437857779, + "ewc_loss": 0.007082824595272541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.082824595272541e-05, + "grad_norm": 3.540837287902832, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8737702369689941, + "num_tokens": 242854951.0, + "step": 6367 + }, + { + "epoch": 
0.8100750540643684, + "ewc_loss": 0.007039989344775677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.03998957760632e-05, + "grad_norm": 3.545459508895874, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8812705874443054, + "num_tokens": 242894166.0, + "step": 6368 + }, + { + "epoch": 0.8102022643429589, + "ewc_loss": 0.007058020681142807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.05802085576579e-05, + "grad_norm": 3.5424530506134033, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8716965913772583, + "num_tokens": 242936230.0, + "step": 6369 + }, + { + "epoch": 0.8103294746215495, + "ewc_loss": 0.007059724070131779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.05972415744327e-05, + "grad_norm": 3.614131450653076, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8768302202224731, + "num_tokens": 242973148.0, + "step": 6370 + }, + { + "epoch": 0.8104566849001399, + "ewc_loss": 0.007081787101924419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.081787043716758e-05, + "grad_norm": 3.604281187057495, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8687558174133301, + "num_tokens": 243013029.0, + "step": 6371 + }, + { + "epoch": 0.8105838951787304, + "ewc_loss": 0.007034107111394405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.034106965875253e-05, + "grad_norm": 3.5414652824401855, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8642078638076782, + "num_tokens": 243054847.0, + "step": 6372 + }, + { + "epoch": 0.810711105457321, + "ewc_loss": 0.007014651782810688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.014651782810688e-05, + "grad_norm": 3.5796103477478027, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8588669300079346, + "num_tokens": 243096524.0, + "step": 6373 + }, + { + "epoch": 0.8108383157359115, + "ewc_loss": 0.007067854981869459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.06785504007712e-05, 
+ "grad_norm": 3.6291439533233643, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8590048551559448, + "num_tokens": 243132397.0, + "step": 6374 + }, + { + "epoch": 0.810965526014502, + "ewc_loss": 0.007055539172142744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.055539026623592e-05, + "grad_norm": 3.556999921798706, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8527259826660156, + "num_tokens": 243171790.0, + "step": 6375 + }, + { + "epoch": 0.8110927362930925, + "ewc_loss": 0.00700405053794384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.004050712566823e-05, + "grad_norm": 3.6000399589538574, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8636298179626465, + "num_tokens": 243208217.0, + "step": 6376 + }, + { + "epoch": 0.811219946571683, + "ewc_loss": 0.00706066656857729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.060666393954307e-05, + "grad_norm": 3.6082630157470703, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8712567090988159, + "num_tokens": 243245260.0, + "step": 6377 + }, + { + "epoch": 0.8113471568502735, + "ewc_loss": 0.007042720448225737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.042720244498923e-05, + "grad_norm": 3.5547397136688232, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8754026889801025, + "num_tokens": 243283924.0, + "step": 6378 + }, + { + "epoch": 0.811474367128864, + "ewc_loss": 0.007028263062238693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.028262916719541e-05, + "grad_norm": 3.5688326358795166, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8738645911216736, + "num_tokens": 243321977.0, + "step": 6379 + }, + { + "epoch": 0.8116015774074545, + "ewc_loss": 0.0070490967482328415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.049096893751994e-05, + "grad_norm": 3.5715811252593994, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 
0.8625900745391846, + "num_tokens": 243359349.0, + "step": 6380 + }, + { + "epoch": 0.8117287876860451, + "ewc_loss": 0.007053410168737173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.053410081425682e-05, + "grad_norm": 3.5686450004577637, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8692048788070679, + "num_tokens": 243395722.0, + "step": 6381 + }, + { + "epoch": 0.8118559979646356, + "ewc_loss": 0.007056764326989651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.05676429788582e-05, + "grad_norm": 3.6501047611236572, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8567659854888916, + "num_tokens": 243429022.0, + "step": 6382 + }, + { + "epoch": 0.811983208243226, + "ewc_loss": 0.007101818453520536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.101818482624367e-05, + "grad_norm": 3.589191198348999, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8455648422241211, + "num_tokens": 243468474.0, + "step": 6383 + }, + { + "epoch": 0.8121104185218165, + "ewc_loss": 0.007050937041640282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.050936983432621e-05, + "grad_norm": 3.5553112030029297, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8828142881393433, + "num_tokens": 243503045.0, + "step": 6384 + }, + { + "epoch": 0.8122376288004071, + "ewc_loss": 0.007063773460686207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.063773227855563e-05, + "grad_norm": 3.5860369205474854, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8567602038383484, + "num_tokens": 243543865.0, + "step": 6385 + }, + { + "epoch": 0.8123648390789976, + "ewc_loss": 0.007104652468115091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.104652468115091e-05, + "grad_norm": 3.604182243347168, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8598084449768066, + "num_tokens": 243581258.0, + "step": 6386 + }, + { + "epoch": 0.8124920493575881, + "ewc_loss": 
0.007106511387974024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.106511475285515e-05, + "grad_norm": 3.5535898208618164, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8608408570289612, + "num_tokens": 243622777.0, + "step": 6387 + }, + { + "epoch": 0.8126192596361786, + "ewc_loss": 0.007084301672875881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.08430161466822e-05, + "grad_norm": 3.5899362564086914, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8544187545776367, + "num_tokens": 243664549.0, + "step": 6388 + }, + { + "epoch": 0.8127464699147691, + "ewc_loss": 0.007131919730454683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.131919846870005e-05, + "grad_norm": 3.5318405628204346, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8600346446037292, + "num_tokens": 243710408.0, + "step": 6389 + }, + { + "epoch": 0.8128736801933596, + "ewc_loss": 0.007075677625834942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.075677422108129e-05, + "grad_norm": 3.548828601837158, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8666582107543945, + "num_tokens": 243752598.0, + "step": 6390 + }, + { + "epoch": 0.8130008904719501, + "ewc_loss": 0.007096492685377598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.096492481650785e-05, + "grad_norm": 3.56767201423645, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8630936741828918, + "num_tokens": 243789805.0, + "step": 6391 + }, + { + "epoch": 0.8131281007505406, + "ewc_loss": 0.00711319874972105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.113198807928711e-05, + "grad_norm": 3.5980224609375, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8761823773384094, + "num_tokens": 243824124.0, + "step": 6392 + }, + { + "epoch": 0.8132553110291312, + "ewc_loss": 0.007127909921109676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.127910066628829e-05, + "grad_norm": 
3.6004886627197266, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8652633428573608, + "num_tokens": 243863448.0, + "step": 6393 + }, + { + "epoch": 0.8133825213077217, + "ewc_loss": 0.007106560282409191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.10656022420153e-05, + "grad_norm": 3.5255393981933594, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.869193434715271, + "num_tokens": 243902085.0, + "step": 6394 + }, + { + "epoch": 0.8135097315863121, + "ewc_loss": 0.0070671625435352325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.06716236891225e-05, + "grad_norm": 3.521596670150757, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8612909913063049, + "num_tokens": 243945349.0, + "step": 6395 + }, + { + "epoch": 0.8136369418649027, + "ewc_loss": 0.007098040077835321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.098040077835321e-05, + "grad_norm": 3.603719711303711, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8638679385185242, + "num_tokens": 243982221.0, + "step": 6396 + }, + { + "epoch": 0.8137641521434932, + "ewc_loss": 0.007137261796742678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.137261854950339e-05, + "grad_norm": 3.7148032188415527, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8538140058517456, + "num_tokens": 244012576.0, + "step": 6397 + }, + { + "epoch": 0.8138913624220837, + "ewc_loss": 0.007153540849685669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.153541082516313e-05, + "grad_norm": 3.510657787322998, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8728065490722656, + "num_tokens": 244052089.0, + "step": 6398 + }, + { + "epoch": 0.8140185727006742, + "ewc_loss": 0.007015854585915804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.015854498604313e-05, + "grad_norm": 3.5835537910461426, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8653136491775513, + 
"num_tokens": 244088712.0, + "step": 6399 + }, + { + "epoch": 0.8141457829792648, + "ewc_loss": 0.007135714869946241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.135714986361563e-05, + "grad_norm": 3.5629029273986816, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8681304454803467, + "num_tokens": 244127655.0, + "step": 6400 + }, + { + "epoch": 0.8142729932578552, + "ewc_loss": 0.007067142520099878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.067142723826692e-05, + "grad_norm": 3.6150527000427246, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8717445135116577, + "num_tokens": 244158588.0, + "step": 6401 + }, + { + "epoch": 0.8144002035364457, + "ewc_loss": 0.007136004511266947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.136004569474608e-05, + "grad_norm": 3.5893590450286865, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8849079608917236, + "num_tokens": 244193872.0, + "step": 6402 + }, + { + "epoch": 0.8145274138150362, + "ewc_loss": 0.0070806569419801235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.080657087499276e-05, + "grad_norm": 3.5403120517730713, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8515351414680481, + "num_tokens": 244234055.0, + "step": 6403 + }, + { + "epoch": 0.8146546240936268, + "ewc_loss": 0.0070805624127388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.080562500050291e-05, + "grad_norm": 3.5665230751037598, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8677324652671814, + "num_tokens": 244275163.0, + "step": 6404 + }, + { + "epoch": 0.8147818343722173, + "ewc_loss": 0.007121731992810965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.121732051018625e-05, + "grad_norm": 3.5396692752838135, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8632419109344482, + "num_tokens": 244315241.0, + "step": 6405 + }, + { + "epoch": 0.8149090446508078, + "ewc_loss": 
0.007083646021783352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.083646050887182e-05, + "grad_norm": 3.598890781402588, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8810335397720337, + "num_tokens": 244351947.0, + "step": 6406 + }, + { + "epoch": 0.8150362549293982, + "ewc_loss": 0.007134561892598867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.134561747079715e-05, + "grad_norm": 3.5986101627349854, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8641743659973145, + "num_tokens": 244387706.0, + "step": 6407 + }, + { + "epoch": 0.8151634652079888, + "ewc_loss": 0.007107230369001627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.107230339897797e-05, + "grad_norm": 3.566602945327759, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8726896047592163, + "num_tokens": 244428447.0, + "step": 6408 + }, + { + "epoch": 0.8152906754865793, + "ewc_loss": 0.007095545995980501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.09554587956518e-05, + "grad_norm": 3.5740437507629395, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8520110845565796, + "num_tokens": 244471191.0, + "step": 6409 + }, + { + "epoch": 0.8154178857651698, + "ewc_loss": 0.007104092743247747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.10409294697456e-05, + "grad_norm": 3.6022744178771973, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8484406471252441, + "num_tokens": 244509198.0, + "step": 6410 + }, + { + "epoch": 0.8155450960437604, + "ewc_loss": 0.007118030916899443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.11803077138029e-05, + "grad_norm": 3.5705041885375977, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.858475923538208, + "num_tokens": 244549677.0, + "step": 6411 + }, + { + "epoch": 0.8156723063223509, + "ewc_loss": 0.00708737364038825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.087373523972929e-05, + "grad_norm": 
3.6498942375183105, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8540406227111816, + "num_tokens": 244583586.0, + "step": 6412 + }, + { + "epoch": 0.8157995166009414, + "ewc_loss": 0.00715657277032733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.156572974054143e-05, + "grad_norm": 3.542241334915161, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8732326030731201, + "num_tokens": 244621220.0, + "step": 6413 + }, + { + "epoch": 0.8159267268795318, + "ewc_loss": 0.007065213285386562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.06521313986741e-05, + "grad_norm": 3.5916221141815186, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8729511499404907, + "num_tokens": 244655941.0, + "step": 6414 + }, + { + "epoch": 0.8160539371581224, + "ewc_loss": 0.007143076974898577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.143076800275594e-05, + "grad_norm": 3.611262559890747, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8835605978965759, + "num_tokens": 244685807.0, + "step": 6415 + }, + { + "epoch": 0.8161811474367129, + "ewc_loss": 0.007129346486181021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.129346340661868e-05, + "grad_norm": 3.5611608028411865, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8650187253952026, + "num_tokens": 244726866.0, + "step": 6416 + }, + { + "epoch": 0.8163083577153034, + "ewc_loss": 0.00709761306643486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.097612979123369e-05, + "grad_norm": 3.7999398708343506, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8563023805618286, + "num_tokens": 244767273.0, + "step": 6417 + }, + { + "epoch": 0.8164355679938939, + "ewc_loss": 0.0072606410831213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.260640995809808e-05, + "grad_norm": 3.565844774246216, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8626205325126648, + 
"num_tokens": 244806111.0, + "step": 6418 + }, + { + "epoch": 0.8165627782724845, + "ewc_loss": 0.007036334834992886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.036334864096716e-05, + "grad_norm": 3.579230546951294, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8753023743629456, + "num_tokens": 244841706.0, + "step": 6419 + }, + { + "epoch": 0.8166899885510749, + "ewc_loss": 0.007129224948585033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.129224832169712e-05, + "grad_norm": 3.530245304107666, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8677033185958862, + "num_tokens": 244883908.0, + "step": 6420 + }, + { + "epoch": 0.8168171988296654, + "ewc_loss": 0.0070902686566114426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.090268627507612e-05, + "grad_norm": 3.617528200149536, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8598856925964355, + "num_tokens": 244919090.0, + "step": 6421 + }, + { + "epoch": 0.8169444091082559, + "ewc_loss": 0.007170618511736393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.170618482632563e-05, + "grad_norm": 3.5362086296081543, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8800544142723083, + "num_tokens": 244961141.0, + "step": 6422 + }, + { + "epoch": 0.8170716193868465, + "ewc_loss": 0.007083713077008724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.083712989697233e-05, + "grad_norm": 3.5911335945129395, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8515212535858154, + "num_tokens": 245004503.0, + "step": 6423 + }, + { + "epoch": 0.817198829665437, + "ewc_loss": 0.007144863251596689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.144863047869876e-05, + "grad_norm": 3.573394775390625, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8650825023651123, + "num_tokens": 245045042.0, + "step": 6424 + }, + { + "epoch": 0.8173260399440275, + "ewc_loss": 0.007113775238394737, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.113775063771755e-05, + "grad_norm": 3.563086986541748, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.864915132522583, + "num_tokens": 245085416.0, + "step": 6425 + }, + { + "epoch": 0.8174532502226179, + "ewc_loss": 0.007115693762898445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.115693733794615e-05, + "grad_norm": 3.662703037261963, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8569170236587524, + "num_tokens": 245116137.0, + "step": 6426 + }, + { + "epoch": 0.8175804605012085, + "ewc_loss": 0.007172019220888615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.172019104473293e-05, + "grad_norm": 3.4970641136169434, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8788629174232483, + "num_tokens": 245159181.0, + "step": 6427 + }, + { + "epoch": 0.817707670779799, + "ewc_loss": 0.007044039200991392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.044039375614375e-05, + "grad_norm": 3.5351877212524414, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8586667776107788, + "num_tokens": 245205199.0, + "step": 6428 + }, + { + "epoch": 0.8178348810583895, + "ewc_loss": 0.007122731767594814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.122731767594814e-05, + "grad_norm": 3.6092443466186523, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8707054257392883, + "num_tokens": 245242455.0, + "step": 6429 + }, + { + "epoch": 0.8179620913369801, + "ewc_loss": 0.007144523784518242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.144523988245055e-05, + "grad_norm": 3.5309629440307617, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8854536414146423, + "num_tokens": 245280675.0, + "step": 6430 + }, + { + "epoch": 0.8180893016155706, + "ewc_loss": 0.00705358199775219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.053581794025376e-05, + "grad_norm": 3.579753875732422, + "learning_rate": 
1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8816560506820679, + "num_tokens": 245316947.0, + "step": 6431 + }, + { + "epoch": 0.818216511894161, + "ewc_loss": 0.007122435607016087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.122435636119917e-05, + "grad_norm": 3.5686800479888916, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8704546093940735, + "num_tokens": 245355145.0, + "step": 6432 + }, + { + "epoch": 0.8183437221727515, + "ewc_loss": 0.007086247205734253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.086247205734253e-05, + "grad_norm": 3.575556516647339, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8770086765289307, + "num_tokens": 245390282.0, + "step": 6433 + }, + { + "epoch": 0.8184709324513421, + "ewc_loss": 0.007104539778083563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.104539690772071e-05, + "grad_norm": 3.5431175231933594, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8744208812713623, + "num_tokens": 245430262.0, + "step": 6434 + }, + { + "epoch": 0.8185981427299326, + "ewc_loss": 0.007084406912326813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.084407116053626e-05, + "grad_norm": 3.59517240524292, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8646879196166992, + "num_tokens": 245470874.0, + "step": 6435 + }, + { + "epoch": 0.8187253530085231, + "ewc_loss": 0.007111524697393179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.111524610081688e-05, + "grad_norm": 3.5905609130859375, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8742302656173706, + "num_tokens": 245506626.0, + "step": 6436 + }, + { + "epoch": 0.8188525632871136, + "ewc_loss": 0.007086669094860554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.086669211275876e-05, + "grad_norm": 3.60990047454834, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8503445386886597, + "num_tokens": 245546794.0, + "step": 6437 + }, + { 
+ "epoch": 0.8189797735657041, + "ewc_loss": 0.007094677072018385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.094677130226046e-05, + "grad_norm": 3.531191825866699, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8689064979553223, + "num_tokens": 245589544.0, + "step": 6438 + }, + { + "epoch": 0.8191069838442946, + "ewc_loss": 0.007042679004371166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.042678771540523e-05, + "grad_norm": 3.5795867443084717, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8826230764389038, + "num_tokens": 245628889.0, + "step": 6439 + }, + { + "epoch": 0.8192341941228851, + "ewc_loss": 0.007102185860276222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.102185918483883e-05, + "grad_norm": 3.543449878692627, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8633243441581726, + "num_tokens": 245674615.0, + "step": 6440 + }, + { + "epoch": 0.8193614044014756, + "ewc_loss": 0.007050512824207544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.050512795103714e-05, + "grad_norm": 3.582878828048706, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8669660687446594, + "num_tokens": 245713966.0, + "step": 6441 + }, + { + "epoch": 0.8194886146800662, + "ewc_loss": 0.007075486239045858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.075486064422876e-05, + "grad_norm": 3.5558815002441406, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8703388571739197, + "num_tokens": 245750035.0, + "step": 6442 + }, + { + "epoch": 0.8196158249586567, + "ewc_loss": 0.007055133115500212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.05513302818872e-05, + "grad_norm": 3.59486722946167, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8828063011169434, + "num_tokens": 245788506.0, + "step": 6443 + }, + { + "epoch": 0.8197430352372471, + "ewc_loss": 0.007078688126057386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.078688213368878e-05, + "grad_norm": 3.5442519187927246, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8698795437812805, + "num_tokens": 245826547.0, + "step": 6444 + }, + { + "epoch": 0.8198702455158376, + "ewc_loss": 0.007035247050225735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.035247108433396e-05, + "grad_norm": 3.539811849594116, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8572062253952026, + "num_tokens": 245874363.0, + "step": 6445 + }, + { + "epoch": 0.8199974557944282, + "ewc_loss": 0.007056715432554483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.056715548969805e-05, + "grad_norm": 3.563537836074829, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8481444120407104, + "num_tokens": 245917813.0, + "step": 6446 + }, + { + "epoch": 0.8201246660730187, + "ewc_loss": 0.007052574306726456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.052574073895812e-05, + "grad_norm": 3.66707706451416, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8629198670387268, + "num_tokens": 245952177.0, + "step": 6447 + }, + { + "epoch": 0.8202518763516092, + "ewc_loss": 0.00709535600617528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.095355977071449e-05, + "grad_norm": 3.522221565246582, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8693612217903137, + "num_tokens": 245992897.0, + "step": 6448 + }, + { + "epoch": 0.8203790866301998, + "ewc_loss": 0.006994408089667559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.994407885940745e-05, + "grad_norm": 3.5997939109802246, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8751899600028992, + "num_tokens": 246029657.0, + "step": 6449 + }, + { + "epoch": 0.8205062969087902, + "ewc_loss": 0.0070938775315880775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.093877502484247e-05, + "grad_norm": 3.5397181510925293, + "learning_rate": 1e-06, + "loss": 0.4215, + 
"mean_token_accuracy": 0.8575247526168823, + "num_tokens": 246072425.0, + "step": 6450 + }, + { + "epoch": 0.8206335071873807, + "ewc_loss": 0.007037199102342129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.037199247861281e-05, + "grad_norm": 3.519543170928955, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8839043378829956, + "num_tokens": 246113131.0, + "step": 6451 + }, + { + "epoch": 0.8207607174659712, + "ewc_loss": 0.007047751452773809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.047751569189131e-05, + "grad_norm": 3.5645463466644287, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8698931932449341, + "num_tokens": 246154548.0, + "step": 6452 + }, + { + "epoch": 0.8208879277445618, + "ewc_loss": 0.007077245507389307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.077245390973985e-05, + "grad_norm": 3.6431851387023926, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8642152547836304, + "num_tokens": 246189518.0, + "step": 6453 + }, + { + "epoch": 0.8210151380231523, + "ewc_loss": 0.007094591856002808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.09459200152196e-05, + "grad_norm": 3.542072057723999, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8623273372650146, + "num_tokens": 246235248.0, + "step": 6454 + }, + { + "epoch": 0.8211423483017428, + "ewc_loss": 0.007018490694463253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.018490578047931e-05, + "grad_norm": 3.574939250946045, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8597260117530823, + "num_tokens": 246277016.0, + "step": 6455 + }, + { + "epoch": 0.8212695585803332, + "ewc_loss": 0.007097181398421526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.097181514836848e-05, + "grad_norm": 3.578322172164917, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8642986416816711, + "num_tokens": 246312352.0, + "step": 6456 + }, + { + "epoch": 
0.8213967688589238, + "ewc_loss": 0.007068378385156393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.06837818142958e-05, + "grad_norm": 3.5466647148132324, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8530725240707397, + "num_tokens": 246355079.0, + "step": 6457 + }, + { + "epoch": 0.8215239791375143, + "ewc_loss": 0.007057052571326494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.057052425807342e-05, + "grad_norm": 3.6231236457824707, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8535924553871155, + "num_tokens": 246391073.0, + "step": 6458 + }, + { + "epoch": 0.8216511894161048, + "ewc_loss": 0.007119960151612759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.119960355339572e-05, + "grad_norm": 3.5266921520233154, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8737689852714539, + "num_tokens": 246435407.0, + "step": 6459 + }, + { + "epoch": 0.8217783996946953, + "ewc_loss": 0.007023435086011887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.02343531884253e-05, + "grad_norm": 3.610781192779541, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8516527414321899, + "num_tokens": 246475303.0, + "step": 6460 + }, + { + "epoch": 0.8219056099732859, + "ewc_loss": 0.007121313828974962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.12131368345581e-05, + "grad_norm": 3.553027391433716, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8642061948776245, + "num_tokens": 246515237.0, + "step": 6461 + }, + { + "epoch": 0.8220328202518764, + "ewc_loss": 0.007039390038698912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.039390038698912e-05, + "grad_norm": 3.5815913677215576, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8599832057952881, + "num_tokens": 246554958.0, + "step": 6462 + }, + { + "epoch": 0.8221600305304668, + "ewc_loss": 0.007088754326105118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.0887545007281e-05, + "grad_norm": 3.6296889781951904, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8675218224525452, + "num_tokens": 246592327.0, + "step": 6463 + }, + { + "epoch": 0.8222872408090574, + "ewc_loss": 0.007088817190378904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.088817073963583e-05, + "grad_norm": 3.5670084953308105, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.872377336025238, + "num_tokens": 246626965.0, + "step": 6464 + }, + { + "epoch": 0.8224144510876479, + "ewc_loss": 0.007039212621748447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.039212505333126e-05, + "grad_norm": 3.5833020210266113, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8631495237350464, + "num_tokens": 246662049.0, + "step": 6465 + }, + { + "epoch": 0.8225416613662384, + "ewc_loss": 0.007099452428519726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.099452341208234e-05, + "grad_norm": 3.6062495708465576, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8670455813407898, + "num_tokens": 246696989.0, + "step": 6466 + }, + { + "epoch": 0.8226688716448289, + "ewc_loss": 0.007104410789906979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.1044109063223e-05, + "grad_norm": 3.5097482204437256, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8703224062919617, + "num_tokens": 246738418.0, + "step": 6467 + }, + { + "epoch": 0.8227960819234195, + "ewc_loss": 0.007047401741147041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.047401595627889e-05, + "grad_norm": 3.58210825920105, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8667272925376892, + "num_tokens": 246776779.0, + "step": 6468 + }, + { + "epoch": 0.8229232922020099, + "ewc_loss": 0.007127555552870035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.127555727493018e-05, + "grad_norm": 3.563375234603882, + "learning_rate": 1e-06, + "loss": 0.3642, + 
"mean_token_accuracy": 0.8717234134674072, + "num_tokens": 246814029.0, + "step": 6469 + }, + { + "epoch": 0.8230505024806004, + "ewc_loss": 0.00707991560921073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.07991566741839e-05, + "grad_norm": 3.5536043643951416, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8582749366760254, + "num_tokens": 246856390.0, + "step": 6470 + }, + { + "epoch": 0.8231777127591909, + "ewc_loss": 0.007077746093273163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.077745976857841e-05, + "grad_norm": 3.5773849487304688, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8688759207725525, + "num_tokens": 246895563.0, + "step": 6471 + }, + { + "epoch": 0.8233049230377815, + "ewc_loss": 0.007106530945748091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.106531120371073e-05, + "grad_norm": 3.614215612411499, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8630022406578064, + "num_tokens": 246935956.0, + "step": 6472 + }, + { + "epoch": 0.823432133316372, + "ewc_loss": 0.007112856954336166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.112856837920845e-05, + "grad_norm": 3.5183639526367188, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8625731468200684, + "num_tokens": 246984765.0, + "step": 6473 + }, + { + "epoch": 0.8235593435949625, + "ewc_loss": 0.007048068102449179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.048068073345348e-05, + "grad_norm": 3.6387791633605957, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8710339069366455, + "num_tokens": 247017086.0, + "step": 6474 + }, + { + "epoch": 0.8236865538735529, + "ewc_loss": 0.0071470350958406925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.14703492121771e-05, + "grad_norm": 3.55570387840271, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8714686632156372, + "num_tokens": 247056195.0, + "step": 6475 + }, + { + "epoch": 
0.8238137641521435, + "ewc_loss": 0.00704981479793787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.049815030768514e-05, + "grad_norm": 3.613081216812134, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8792227506637573, + "num_tokens": 247094063.0, + "step": 6476 + }, + { + "epoch": 0.823940974430734, + "ewc_loss": 0.007118684705346823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.118684879969805e-05, + "grad_norm": 3.636497974395752, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8645829558372498, + "num_tokens": 247131743.0, + "step": 6477 + }, + { + "epoch": 0.8240681847093245, + "ewc_loss": 0.007104213815182447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.104213727870956e-05, + "grad_norm": 3.6186013221740723, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8639082312583923, + "num_tokens": 247169477.0, + "step": 6478 + }, + { + "epoch": 0.824195394987915, + "ewc_loss": 0.007100878283381462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.100878428900614e-05, + "grad_norm": 3.5876078605651855, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.87521892786026, + "num_tokens": 247209159.0, + "step": 6479 + }, + { + "epoch": 0.8243226052665056, + "ewc_loss": 0.007086541969329119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.086541882017627e-05, + "grad_norm": 3.574760913848877, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8671793937683105, + "num_tokens": 247247603.0, + "step": 6480 + }, + { + "epoch": 0.824449815545096, + "ewc_loss": 0.007089535240083933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.089535210980102e-05, + "grad_norm": 3.5395071506500244, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8758556842803955, + "num_tokens": 247290184.0, + "step": 6481 + }, + { + "epoch": 0.8245770258236865, + "ewc_loss": 0.007056076545268297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.05607671989128e-05, 
+ "grad_norm": 3.5629220008850098, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8490117788314819, + "num_tokens": 247331862.0, + "step": 6482 + }, + { + "epoch": 0.824704236102277, + "ewc_loss": 0.007103259209543467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.103259122231975e-05, + "grad_norm": 3.6010990142822266, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8671303987503052, + "num_tokens": 247368271.0, + "step": 6483 + }, + { + "epoch": 0.8248314463808676, + "ewc_loss": 0.007113451138138771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.113451283657923e-05, + "grad_norm": 3.584861993789673, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8656840920448303, + "num_tokens": 247408077.0, + "step": 6484 + }, + { + "epoch": 0.8249586566594581, + "ewc_loss": 0.007105220574885607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.105220720404759e-05, + "grad_norm": 3.603430986404419, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.873210072517395, + "num_tokens": 247447408.0, + "step": 6485 + }, + { + "epoch": 0.8250858669380486, + "ewc_loss": 0.007109226658940315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.109226862667128e-05, + "grad_norm": 3.5519826412200928, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.876151978969574, + "num_tokens": 247489863.0, + "step": 6486 + }, + { + "epoch": 0.8252130772166391, + "ewc_loss": 0.007072221487760544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.072221342241392e-05, + "grad_norm": 3.5336625576019287, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8629318475723267, + "num_tokens": 247532233.0, + "step": 6487 + }, + { + "epoch": 0.8253402874952296, + "ewc_loss": 0.00707134697586298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.071346772136167e-05, + "grad_norm": 3.681434392929077, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 
0.8424685001373291, + "num_tokens": 247570719.0, + "step": 6488 + }, + { + "epoch": 0.8254674977738201, + "ewc_loss": 0.0071590738371014595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.159073720686138e-05, + "grad_norm": 3.5996010303497314, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8719279170036316, + "num_tokens": 247603302.0, + "step": 6489 + }, + { + "epoch": 0.8255947080524106, + "ewc_loss": 0.007061042357236147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.06104256096296e-05, + "grad_norm": 3.552884578704834, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8681109547615051, + "num_tokens": 247642837.0, + "step": 6490 + }, + { + "epoch": 0.8257219183310012, + "ewc_loss": 0.007073217537254095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.073217420838773e-05, + "grad_norm": 3.540999174118042, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8634539246559143, + "num_tokens": 247685246.0, + "step": 6491 + }, + { + "epoch": 0.8258491286095917, + "ewc_loss": 0.007074567023664713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.074567110976204e-05, + "grad_norm": 3.615478754043579, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.856932520866394, + "num_tokens": 247720871.0, + "step": 6492 + }, + { + "epoch": 0.8259763388881821, + "ewc_loss": 0.007125159725546837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.125159754650667e-05, + "grad_norm": 3.5771327018737793, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8681930303573608, + "num_tokens": 247759796.0, + "step": 6493 + }, + { + "epoch": 0.8261035491667726, + "ewc_loss": 0.007088715210556984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.088715210556984e-05, + "grad_norm": 3.5646188259124756, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8462397456169128, + "num_tokens": 247806486.0, + "step": 6494 + }, + { + "epoch": 0.8262307594453632, + "ewc_loss": 
0.0070801121182739735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.080112118273973e-05, + "grad_norm": 3.584171772003174, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8755710124969482, + "num_tokens": 247842031.0, + "step": 6495 + }, + { + "epoch": 0.8263579697239537, + "ewc_loss": 0.007120795547962189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.12079563527368e-05, + "grad_norm": 3.578239679336548, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8698341250419617, + "num_tokens": 247883438.0, + "step": 6496 + }, + { + "epoch": 0.8264851800025442, + "ewc_loss": 0.007097255904227495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.097255729604512e-05, + "grad_norm": 3.5592265129089355, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8743764758110046, + "num_tokens": 247921491.0, + "step": 6497 + }, + { + "epoch": 0.8266123902811348, + "ewc_loss": 0.007094493601471186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.094493776094168e-05, + "grad_norm": 3.6018996238708496, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8763736486434937, + "num_tokens": 247957454.0, + "step": 6498 + }, + { + "epoch": 0.8267396005597252, + "ewc_loss": 0.007129736244678497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.129736331989989e-05, + "grad_norm": 3.5863096714019775, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8515913486480713, + "num_tokens": 247997705.0, + "step": 6499 + }, + { + "epoch": 0.8268668108383157, + "ewc_loss": 0.0070983171463012695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.098317291820422e-05, + "grad_norm": 3.618338108062744, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.861656665802002, + "num_tokens": 248034536.0, + "step": 6500 + }, + { + "epoch": 0.8269940211169062, + "ewc_loss": 0.007118972484022379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.118972280295566e-05, + "grad_norm": 
3.5482611656188965, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8680777549743652, + "num_tokens": 248080916.0, + "step": 6501 + }, + { + "epoch": 0.8271212313954968, + "ewc_loss": 0.007069763727486134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.06976352375932e-05, + "grad_norm": 3.5639119148254395, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8658044338226318, + "num_tokens": 248119719.0, + "step": 6502 + }, + { + "epoch": 0.8272484416740873, + "ewc_loss": 0.007111582905054092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.111582817742601e-05, + "grad_norm": 3.605940818786621, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8631855249404907, + "num_tokens": 248156221.0, + "step": 6503 + }, + { + "epoch": 0.8273756519526778, + "ewc_loss": 0.007105888798832893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.105888653313741e-05, + "grad_norm": 3.5682265758514404, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8717461824417114, + "num_tokens": 248191780.0, + "step": 6504 + }, + { + "epoch": 0.8275028622312682, + "ewc_loss": 0.00707834679633379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.078346970956773e-05, + "grad_norm": 3.628718376159668, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8628708124160767, + "num_tokens": 248228594.0, + "step": 6505 + }, + { + "epoch": 0.8276300725098588, + "ewc_loss": 0.007148661185055971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.14866109774448e-05, + "grad_norm": 3.6180577278137207, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8709599375724792, + "num_tokens": 248260372.0, + "step": 6506 + }, + { + "epoch": 0.8277572827884493, + "ewc_loss": 0.007116048131138086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.116048072930425e-05, + "grad_norm": 3.5869367122650146, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8588167428970337, + 
"num_tokens": 248299361.0, + "step": 6507 + }, + { + "epoch": 0.8278844930670398, + "ewc_loss": 0.007104856893420219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.10485692252405e-05, + "grad_norm": 3.5822813510894775, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8585668802261353, + "num_tokens": 248337370.0, + "step": 6508 + }, + { + "epoch": 0.8280117033456303, + "ewc_loss": 0.007133478298783302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.133478356990963e-05, + "grad_norm": 3.5629262924194336, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8687726259231567, + "num_tokens": 248377541.0, + "step": 6509 + }, + { + "epoch": 0.8281389136242209, + "ewc_loss": 0.00711251562461257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.11251559550874e-05, + "grad_norm": 3.621602773666382, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8498402833938599, + "num_tokens": 248413928.0, + "step": 6510 + }, + { + "epoch": 0.8282661239028114, + "ewc_loss": 0.007165201473981142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.165201532188803e-05, + "grad_norm": 3.5533089637756348, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8657795786857605, + "num_tokens": 248453283.0, + "step": 6511 + }, + { + "epoch": 0.8283933341814018, + "ewc_loss": 0.00710923271253705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.10923268343322e-05, + "grad_norm": 3.540931224822998, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8717122077941895, + "num_tokens": 248490286.0, + "step": 6512 + }, + { + "epoch": 0.8285205444599923, + "ewc_loss": 0.007141076494008303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.141076639527455e-05, + "grad_norm": 3.6140196323394775, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8523104190826416, + "num_tokens": 248526349.0, + "step": 6513 + }, + { + "epoch": 0.8286477547385829, + "ewc_loss": 0.0071708522737026215, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.170852040871978e-05, + "grad_norm": 3.578153371810913, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8608827590942383, + "num_tokens": 248561751.0, + "step": 6514 + }, + { + "epoch": 0.8287749650171734, + "ewc_loss": 0.007140578236430883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.140578236430883e-05, + "grad_norm": 3.5065436363220215, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8812160491943359, + "num_tokens": 248601921.0, + "step": 6515 + }, + { + "epoch": 0.8289021752957639, + "ewc_loss": 0.007106208708137274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.106208795448765e-05, + "grad_norm": 3.4991893768310547, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8777363896369934, + "num_tokens": 248641218.0, + "step": 6516 + }, + { + "epoch": 0.8290293855743545, + "ewc_loss": 0.007149939425289631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.149939483497292e-05, + "grad_norm": 3.718332529067993, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.868588387966156, + "num_tokens": 248672616.0, + "step": 6517 + }, + { + "epoch": 0.8291565958529449, + "ewc_loss": 0.007264621555805206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.264621672220528e-05, + "grad_norm": 3.553493022918701, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8702963590621948, + "num_tokens": 248708212.0, + "step": 6518 + }, + { + "epoch": 0.8292838061315354, + "ewc_loss": 0.007116981782019138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.116981578292325e-05, + "grad_norm": 3.6597163677215576, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8712335824966431, + "num_tokens": 248746575.0, + "step": 6519 + }, + { + "epoch": 0.8294110164101259, + "ewc_loss": 0.007230718620121479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.230718620121479e-05, + "grad_norm": 3.534365653991699, + "learning_rate": 
1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8627287149429321, + "num_tokens": 248789423.0, + "step": 6520 + }, + { + "epoch": 0.8295382266887165, + "ewc_loss": 0.007114715874195099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.114715845091268e-05, + "grad_norm": 3.6332757472991943, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.867714524269104, + "num_tokens": 248824044.0, + "step": 6521 + }, + { + "epoch": 0.829665436967307, + "ewc_loss": 0.00721673434600234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.216734229587018e-05, + "grad_norm": 3.592386484146118, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8643914461135864, + "num_tokens": 248860805.0, + "step": 6522 + }, + { + "epoch": 0.8297926472458975, + "ewc_loss": 0.007153244223445654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.153244223445654e-05, + "grad_norm": 3.5993471145629883, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8725316524505615, + "num_tokens": 248898958.0, + "step": 6523 + }, + { + "epoch": 0.8299198575244879, + "ewc_loss": 0.007167540956288576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.167540752561763e-05, + "grad_norm": 3.91373610496521, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8654651641845703, + "num_tokens": 248938847.0, + "step": 6524 + }, + { + "epoch": 0.8300470678030785, + "ewc_loss": 0.007382402196526527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.382401963695884e-05, + "grad_norm": 3.570443868637085, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8638270497322083, + "num_tokens": 248977944.0, + "step": 6525 + }, + { + "epoch": 0.830174278081669, + "ewc_loss": 0.0070963711477816105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.096370973158628e-05, + "grad_norm": 3.5624637603759766, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8584619760513306, + "num_tokens": 249014610.0, + "step": 6526 + }, + { 
+ "epoch": 0.8303014883602595, + "ewc_loss": 0.007229279261082411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.229279435705394e-05, + "grad_norm": 3.6549344062805176, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.83660888671875, + "num_tokens": 249053869.0, + "step": 6527 + }, + { + "epoch": 0.83042869863885, + "ewc_loss": 0.007287044543772936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.287044718395919e-05, + "grad_norm": 3.517836093902588, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8587647080421448, + "num_tokens": 249096874.0, + "step": 6528 + }, + { + "epoch": 0.8305559089174406, + "ewc_loss": 0.007179297041147947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.17929724487476e-05, + "grad_norm": 3.5684449672698975, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8651638031005859, + "num_tokens": 249133141.0, + "step": 6529 + }, + { + "epoch": 0.830683119196031, + "ewc_loss": 0.007270285859704018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.27028600522317e-05, + "grad_norm": 3.616745948791504, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8718702793121338, + "num_tokens": 249163813.0, + "step": 6530 + }, + { + "epoch": 0.8308103294746215, + "ewc_loss": 0.007280566729605198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.280566933332011e-05, + "grad_norm": 3.5468389987945557, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8589348793029785, + "num_tokens": 249202935.0, + "step": 6531 + }, + { + "epoch": 0.830937539753212, + "ewc_loss": 0.0072440169751644135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.244016887852922e-05, + "grad_norm": 3.5686252117156982, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8738775253295898, + "num_tokens": 249241061.0, + "step": 6532 + }, + { + "epoch": 0.8310647500318026, + "ewc_loss": 0.007293850649148226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.293850649148226e-05, + "grad_norm": 3.619065523147583, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8560081124305725, + "num_tokens": 249277863.0, + "step": 6533 + }, + { + "epoch": 0.8311919603103931, + "ewc_loss": 0.007297111675143242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.297111733350903e-05, + "grad_norm": 3.5898540019989014, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8655604124069214, + "num_tokens": 249318047.0, + "step": 6534 + }, + { + "epoch": 0.8313191705889836, + "ewc_loss": 0.007252908311784267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.252908108057454e-05, + "grad_norm": 3.5565478801727295, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8608869910240173, + "num_tokens": 249362095.0, + "step": 6535 + }, + { + "epoch": 0.831446380867574, + "ewc_loss": 0.007260904181748629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.260904385475442e-05, + "grad_norm": 3.564872980117798, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.880199134349823, + "num_tokens": 249403362.0, + "step": 6536 + }, + { + "epoch": 0.8315735911461646, + "ewc_loss": 0.007266467437148094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.266467582667246e-05, + "grad_norm": 3.580270290374756, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8725946545600891, + "num_tokens": 249438051.0, + "step": 6537 + }, + { + "epoch": 0.8317008014247551, + "ewc_loss": 0.007264468353241682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.264468149514869e-05, + "grad_norm": 3.5462474822998047, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8713020086288452, + "num_tokens": 249477185.0, + "step": 6538 + }, + { + "epoch": 0.8318280117033456, + "ewc_loss": 0.0072281560860574245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.228156027849764e-05, + "grad_norm": 3.615340232849121, + "learning_rate": 1e-06, + "loss": 0.4331, + 
"mean_token_accuracy": 0.8603740930557251, + "num_tokens": 249516880.0, + "step": 6539 + }, + { + "epoch": 0.8319552219819362, + "ewc_loss": 0.007271063979715109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.271063805092126e-05, + "grad_norm": 3.536980152130127, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8752390146255493, + "num_tokens": 249555864.0, + "step": 6540 + }, + { + "epoch": 0.8320824322605267, + "ewc_loss": 0.007199034560471773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.199034735094756e-05, + "grad_norm": 3.599011182785034, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8658580183982849, + "num_tokens": 249590566.0, + "step": 6541 + }, + { + "epoch": 0.8322096425391171, + "ewc_loss": 0.007269184570759535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.269184425240383e-05, + "grad_norm": 3.5479702949523926, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8780602216720581, + "num_tokens": 249631523.0, + "step": 6542 + }, + { + "epoch": 0.8323368528177076, + "ewc_loss": 0.007199970539659262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.19997042324394e-05, + "grad_norm": 3.562001943588257, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8748288154602051, + "num_tokens": 249674113.0, + "step": 6543 + }, + { + "epoch": 0.8324640630962982, + "ewc_loss": 0.007201449479907751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.201449625426903e-05, + "grad_norm": 3.5934011936187744, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8767354488372803, + "num_tokens": 249712934.0, + "step": 6544 + }, + { + "epoch": 0.8325912733748887, + "ewc_loss": 0.007199738174676895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.199738320196047e-05, + "grad_norm": 3.6044082641601562, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8704663515090942, + "num_tokens": 249750295.0, + "step": 6545 + }, + { + "epoch": 
0.8327184836534792, + "ewc_loss": 0.00718877324834466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.188773452071473e-05, + "grad_norm": 3.5738086700439453, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8703216314315796, + "num_tokens": 249790289.0, + "step": 6546 + }, + { + "epoch": 0.8328456939320698, + "ewc_loss": 0.007157599087804556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.157598884077743e-05, + "grad_norm": 3.625877618789673, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8806103467941284, + "num_tokens": 249822873.0, + "step": 6547 + }, + { + "epoch": 0.8329729042106602, + "ewc_loss": 0.007185387425124645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.185387221397832e-05, + "grad_norm": 3.5989747047424316, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8670284748077393, + "num_tokens": 249860564.0, + "step": 6548 + }, + { + "epoch": 0.8331001144892507, + "ewc_loss": 0.007151054684072733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.151054887799546e-05, + "grad_norm": 3.5538721084594727, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8669993281364441, + "num_tokens": 249901448.0, + "step": 6549 + }, + { + "epoch": 0.8332273247678412, + "ewc_loss": 0.007129702717065811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.129702862584963e-05, + "grad_norm": 3.633042097091675, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8577780723571777, + "num_tokens": 249933524.0, + "step": 6550 + }, + { + "epoch": 0.8333545350464318, + "ewc_loss": 0.0071852016262710094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.18520168447867e-05, + "grad_norm": 3.6026251316070557, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8624356389045715, + "num_tokens": 249971856.0, + "step": 6551 + }, + { + "epoch": 0.8334817453250223, + "ewc_loss": 0.007141653448343277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.14165362296626e-05, + "grad_norm": 3.5589983463287354, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8595616817474365, + "num_tokens": 250013940.0, + "step": 6552 + }, + { + "epoch": 0.8336089556036128, + "ewc_loss": 0.007123318035155535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.123318209778517e-05, + "grad_norm": 3.717329740524292, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8514052629470825, + "num_tokens": 250044826.0, + "step": 6553 + }, + { + "epoch": 0.8337361658822032, + "ewc_loss": 0.007240672130137682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.240672130137682e-05, + "grad_norm": 3.559917688369751, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8525837659835815, + "num_tokens": 250087409.0, + "step": 6554 + }, + { + "epoch": 0.8338633761607938, + "ewc_loss": 0.007081453688442707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.081453804858029e-05, + "grad_norm": 3.5201079845428467, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8748267292976379, + "num_tokens": 250126236.0, + "step": 6555 + }, + { + "epoch": 0.8339905864393843, + "ewc_loss": 0.007151234429329634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.151234603952616e-05, + "grad_norm": 3.5634026527404785, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.874619722366333, + "num_tokens": 250163559.0, + "step": 6556 + }, + { + "epoch": 0.8341177967179748, + "ewc_loss": 0.00716003542765975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.160035602282733e-05, + "grad_norm": 3.543579339981079, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8683786392211914, + "num_tokens": 250203536.0, + "step": 6557 + }, + { + "epoch": 0.8342450069965653, + "ewc_loss": 0.00714836223050952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.148362055886537e-05, + "grad_norm": 3.6600465774536133, + "learning_rate": 1e-06, + "loss": 0.3792, + 
"mean_token_accuracy": 0.8709184527397156, + "num_tokens": 250237455.0, + "step": 6558 + }, + { + "epoch": 0.8343722172751559, + "ewc_loss": 0.007222339045256376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.222338899737224e-05, + "grad_norm": 3.54781174659729, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.875325620174408, + "num_tokens": 250274559.0, + "step": 6559 + }, + { + "epoch": 0.8344994275537464, + "ewc_loss": 0.00711685698479414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.116857159417123e-05, + "grad_norm": 3.596660852432251, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8686909675598145, + "num_tokens": 250309392.0, + "step": 6560 + }, + { + "epoch": 0.8346266378323368, + "ewc_loss": 0.007182128261774778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.182128319982439e-05, + "grad_norm": 3.5306713581085205, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8709617853164673, + "num_tokens": 250352010.0, + "step": 6561 + }, + { + "epoch": 0.8347538481109273, + "ewc_loss": 0.007133059203624725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.133059261832386e-05, + "grad_norm": 3.6938366889953613, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8549817800521851, + "num_tokens": 250385745.0, + "step": 6562 + }, + { + "epoch": 0.8348810583895179, + "ewc_loss": 0.0072467271238565445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.246727182064205e-05, + "grad_norm": 3.5523664951324463, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.864482581615448, + "num_tokens": 250426055.0, + "step": 6563 + }, + { + "epoch": 0.8350082686681084, + "ewc_loss": 0.007098210509866476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.098210335243493e-05, + "grad_norm": 3.5719058513641357, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8632418513298035, + "num_tokens": 250468795.0, + "step": 6564 + }, + { + "epoch": 
0.8351354789466989, + "ewc_loss": 0.0071729752235114574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.172975165303797e-05, + "grad_norm": 3.5717504024505615, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8651987314224243, + "num_tokens": 250509115.0, + "step": 6565 + }, + { + "epoch": 0.8352626892252895, + "ewc_loss": 0.007150605320930481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.15060523361899e-05, + "grad_norm": 3.6033005714416504, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8671349883079529, + "num_tokens": 250547770.0, + "step": 6566 + }, + { + "epoch": 0.8353898995038799, + "ewc_loss": 0.007161570712924004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.161570829339325e-05, + "grad_norm": 3.5550966262817383, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8645271062850952, + "num_tokens": 250589737.0, + "step": 6567 + }, + { + "epoch": 0.8355171097824704, + "ewc_loss": 0.007117079105228186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.117079076124355e-05, + "grad_norm": 3.623703956604004, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8749653100967407, + "num_tokens": 250623058.0, + "step": 6568 + }, + { + "epoch": 0.8356443200610609, + "ewc_loss": 0.007182233035564423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.182233093772084e-05, + "grad_norm": 3.5691092014312744, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8665838837623596, + "num_tokens": 250660526.0, + "step": 6569 + }, + { + "epoch": 0.8357715303396515, + "ewc_loss": 0.007124942261725664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.124942203518003e-05, + "grad_norm": 3.6212823390960693, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8499326705932617, + "num_tokens": 250698218.0, + "step": 6570 + }, + { + "epoch": 0.835898740618242, + "ewc_loss": 0.0071693481877446175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.169348100433126e-05, + "grad_norm": 3.540912628173828, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8592003583908081, + "num_tokens": 250743410.0, + "step": 6571 + }, + { + "epoch": 0.8360259508968325, + "ewc_loss": 0.007110107224434614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.110107253538445e-05, + "grad_norm": 3.5500600337982178, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8762470483779907, + "num_tokens": 250781919.0, + "step": 6572 + }, + { + "epoch": 0.8361531611754229, + "ewc_loss": 0.00714041106402874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.140410889405757e-05, + "grad_norm": 3.620131731033325, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8646962642669678, + "num_tokens": 250817051.0, + "step": 6573 + }, + { + "epoch": 0.8362803714540135, + "ewc_loss": 0.007177996449172497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.177996303653345e-05, + "grad_norm": 3.5842349529266357, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8639212846755981, + "num_tokens": 250857241.0, + "step": 6574 + }, + { + "epoch": 0.836407581732604, + "ewc_loss": 0.007124787196516991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.124787225620821e-05, + "grad_norm": 3.5153019428253174, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8677589893341064, + "num_tokens": 250903591.0, + "step": 6575 + }, + { + "epoch": 0.8365347920111945, + "ewc_loss": 0.007102038245648146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.102038216544315e-05, + "grad_norm": 3.5390994548797607, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8611879348754883, + "num_tokens": 250945788.0, + "step": 6576 + }, + { + "epoch": 0.836662002289785, + "ewc_loss": 0.007142953108996153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.142953108996153e-05, + "grad_norm": 3.5876028537750244, + "learning_rate": 1e-06, + "loss": 0.4166, + 
"mean_token_accuracy": 0.8592957258224487, + "num_tokens": 250984884.0, + "step": 6577 + }, + { + "epoch": 0.8367892125683756, + "ewc_loss": 0.0071517229080200195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.151722820708528e-05, + "grad_norm": 3.546827793121338, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8775055408477783, + "num_tokens": 251024347.0, + "step": 6578 + }, + { + "epoch": 0.836916422846966, + "ewc_loss": 0.007090867962688208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.090868166415021e-05, + "grad_norm": 3.494767189025879, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8874284029006958, + "num_tokens": 251068527.0, + "step": 6579 + }, + { + "epoch": 0.8370436331255565, + "ewc_loss": 0.007072935812175274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.072935841279104e-05, + "grad_norm": 3.530637264251709, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8713216781616211, + "num_tokens": 251109914.0, + "step": 6580 + }, + { + "epoch": 0.837170843404147, + "ewc_loss": 0.007101447321474552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.101447408786044e-05, + "grad_norm": 3.6068334579467773, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.858626663684845, + "num_tokens": 251144760.0, + "step": 6581 + }, + { + "epoch": 0.8372980536827376, + "ewc_loss": 0.007134201470762491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.134201587177813e-05, + "grad_norm": 3.554717779159546, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8652687072753906, + "num_tokens": 251184456.0, + "step": 6582 + }, + { + "epoch": 0.8374252639613281, + "ewc_loss": 0.007072684820741415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.072684820741415e-05, + "grad_norm": 3.5370185375213623, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.858669102191925, + "num_tokens": 251229493.0, + "step": 6583 + }, + { + "epoch": 
0.8375524742399186, + "ewc_loss": 0.0070843324065208435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0843321736902e-05, + "grad_norm": 3.556633710861206, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8717659115791321, + "num_tokens": 251270886.0, + "step": 6584 + }, + { + "epoch": 0.837679684518509, + "ewc_loss": 0.007092552725225687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.092552550602704e-05, + "grad_norm": 3.6050779819488525, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8766283988952637, + "num_tokens": 251304833.0, + "step": 6585 + }, + { + "epoch": 0.8378068947970996, + "ewc_loss": 0.007124395575374365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.124395779101178e-05, + "grad_norm": 3.66858172416687, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8693236708641052, + "num_tokens": 251334196.0, + "step": 6586 + }, + { + "epoch": 0.8379341050756901, + "ewc_loss": 0.007122159935534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.12215987732634e-05, + "grad_norm": 3.5206775665283203, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8552050590515137, + "num_tokens": 251380278.0, + "step": 6587 + }, + { + "epoch": 0.8380613153542806, + "ewc_loss": 0.007024007849395275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.024007936706766e-05, + "grad_norm": 3.593535900115967, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8621881604194641, + "num_tokens": 251418432.0, + "step": 6588 + }, + { + "epoch": 0.8381885256328712, + "ewc_loss": 0.0071273185312747955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.127318531274796e-05, + "grad_norm": 3.5870800018310547, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8647804260253906, + "num_tokens": 251453747.0, + "step": 6589 + }, + { + "epoch": 0.8383157359114617, + "ewc_loss": 0.0070906332693994045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.090633152984083e-05, + "grad_norm": 3.5662167072296143, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.867743730545044, + "num_tokens": 251492439.0, + "step": 6590 + }, + { + "epoch": 0.8384429461900521, + "ewc_loss": 0.007091418374329805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.091418228810653e-05, + "grad_norm": 3.582014799118042, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8671435713768005, + "num_tokens": 251527945.0, + "step": 6591 + }, + { + "epoch": 0.8385701564686426, + "ewc_loss": 0.0071013192646205425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.101319351932034e-05, + "grad_norm": 3.563391923904419, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8753142356872559, + "num_tokens": 251567161.0, + "step": 6592 + }, + { + "epoch": 0.8386973667472332, + "ewc_loss": 0.00709273386746645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.092733721947297e-05, + "grad_norm": 3.5989830493927, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8669653534889221, + "num_tokens": 251601189.0, + "step": 6593 + }, + { + "epoch": 0.8388245770258237, + "ewc_loss": 0.007112438324838877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.112438470358029e-05, + "grad_norm": 3.63940691947937, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8653526306152344, + "num_tokens": 251630846.0, + "step": 6594 + }, + { + "epoch": 0.8389517873044142, + "ewc_loss": 0.007147815078496933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.14781490387395e-05, + "grad_norm": 3.5525598526000977, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8743327856063843, + "num_tokens": 251665701.0, + "step": 6595 + }, + { + "epoch": 0.8390789975830047, + "ewc_loss": 0.007106427103281021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.10642707417719e-05, + "grad_norm": 3.6054399013519287, + "learning_rate": 1e-06, + "loss": 0.4183, + 
"mean_token_accuracy": 0.8591215014457703, + "num_tokens": 251699909.0, + "step": 6596 + }, + { + "epoch": 0.8392062078615952, + "ewc_loss": 0.0071789296343922615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.178929809015244e-05, + "grad_norm": 3.6399641036987305, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8631168603897095, + "num_tokens": 251734052.0, + "step": 6597 + }, + { + "epoch": 0.8393334181401857, + "ewc_loss": 0.0071767475455999374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.17674774932675e-05, + "grad_norm": 3.5661919116973877, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8562822341918945, + "num_tokens": 251772518.0, + "step": 6598 + }, + { + "epoch": 0.8394606284187762, + "ewc_loss": 0.007150411140173674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.150410965550691e-05, + "grad_norm": 3.5561583042144775, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8643349409103394, + "num_tokens": 251816378.0, + "step": 6599 + }, + { + "epoch": 0.8395878386973668, + "ewc_loss": 0.0071788569912314415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.178857049439102e-05, + "grad_norm": 3.5882084369659424, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8655613660812378, + "num_tokens": 251851264.0, + "step": 6600 + }, + { + "epoch": 0.8397150489759573, + "ewc_loss": 0.007194457575678825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.194457430159673e-05, + "grad_norm": 3.570943593978882, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8678414225578308, + "num_tokens": 251888906.0, + "step": 6601 + }, + { + "epoch": 0.8398422592545478, + "ewc_loss": 0.007172033656388521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.172033656388521e-05, + "grad_norm": 3.517702102661133, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8730366230010986, + "num_tokens": 251930641.0, + "step": 6602 + }, + { + "epoch": 
0.8399694695331382, + "ewc_loss": 0.007175086066126823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.175085920607671e-05, + "grad_norm": 3.63162899017334, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.848572850227356, + "num_tokens": 251968929.0, + "step": 6603 + }, + { + "epoch": 0.8400966798117288, + "ewc_loss": 0.007248320616781712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.248320616781712e-05, + "grad_norm": 3.6214075088500977, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8584255576133728, + "num_tokens": 252006751.0, + "step": 6604 + }, + { + "epoch": 0.8402238900903193, + "ewc_loss": 0.007192319259047508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.192319026216865e-05, + "grad_norm": 3.555359125137329, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8621120452880859, + "num_tokens": 252045861.0, + "step": 6605 + }, + { + "epoch": 0.8403511003689098, + "ewc_loss": 0.007184572517871857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.184572314145043e-05, + "grad_norm": 3.586475133895874, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8601859211921692, + "num_tokens": 252084412.0, + "step": 6606 + }, + { + "epoch": 0.8404783106475003, + "ewc_loss": 0.007214327342808247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.214327342808247e-05, + "grad_norm": 3.5959346294403076, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8715146780014038, + "num_tokens": 252120576.0, + "step": 6607 + }, + { + "epoch": 0.8406055209260909, + "ewc_loss": 0.0072099268436431885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.209926843643188e-05, + "grad_norm": 3.51938533782959, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8635612726211548, + "num_tokens": 252161636.0, + "step": 6608 + }, + { + "epoch": 0.8407327312046813, + "ewc_loss": 0.007161680143326521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.161679968703538e-05, + "grad_norm": 3.573601245880127, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.86647629737854, + "num_tokens": 252199669.0, + "step": 6609 + }, + { + "epoch": 0.8408599414832718, + "ewc_loss": 0.007209477014839649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.209477189462632e-05, + "grad_norm": 3.556800127029419, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8737701177597046, + "num_tokens": 252236832.0, + "step": 6610 + }, + { + "epoch": 0.8409871517618623, + "ewc_loss": 0.007180351763963699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.180351531133056e-05, + "grad_norm": 3.556096076965332, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8612240552902222, + "num_tokens": 252282518.0, + "step": 6611 + }, + { + "epoch": 0.8411143620404529, + "ewc_loss": 0.007188291288912296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.188291056081653e-05, + "grad_norm": 3.619431495666504, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8600403070449829, + "num_tokens": 252316479.0, + "step": 6612 + }, + { + "epoch": 0.8412415723190434, + "ewc_loss": 0.007215886376798153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.215886580524966e-05, + "grad_norm": 3.5843918323516846, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8605059385299683, + "num_tokens": 252353789.0, + "step": 6613 + }, + { + "epoch": 0.8413687825976339, + "ewc_loss": 0.007170834578573704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.170834578573704e-05, + "grad_norm": 3.5606067180633545, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8683164119720459, + "num_tokens": 252391609.0, + "step": 6614 + }, + { + "epoch": 0.8414959928762245, + "ewc_loss": 0.007184169255197048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.184169226093218e-05, + "grad_norm": 3.5472652912139893, + "learning_rate": 1e-06, + "loss": 0.3544, + 
"mean_token_accuracy": 0.8800567388534546, + "num_tokens": 252433514.0, + "step": 6615 + }, + { + "epoch": 0.8416232031548149, + "ewc_loss": 0.007171669043600559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.17166913091205e-05, + "grad_norm": 3.6093742847442627, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8696621656417847, + "num_tokens": 252469700.0, + "step": 6616 + }, + { + "epoch": 0.8417504134334054, + "ewc_loss": 0.007208077237010002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.208077295217663e-05, + "grad_norm": 3.6510589122772217, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8630728721618652, + "num_tokens": 252501658.0, + "step": 6617 + }, + { + "epoch": 0.8418776237119959, + "ewc_loss": 0.0072102900594472885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.210289913928136e-05, + "grad_norm": 3.644744396209717, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8679178953170776, + "num_tokens": 252534546.0, + "step": 6618 + }, + { + "epoch": 0.8420048339905865, + "ewc_loss": 0.007193215191364288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.193215424194932e-05, + "grad_norm": 3.5519754886627197, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8741380572319031, + "num_tokens": 252573172.0, + "step": 6619 + }, + { + "epoch": 0.842132044269177, + "ewc_loss": 0.007141399197280407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.141398964449763e-05, + "grad_norm": 3.614738702774048, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8762520551681519, + "num_tokens": 252607258.0, + "step": 6620 + }, + { + "epoch": 0.8422592545477675, + "ewc_loss": 0.007217363454401493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.217363599920645e-05, + "grad_norm": 3.5516879558563232, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8757511377334595, + "num_tokens": 252651451.0, + "step": 6621 + }, + { + "epoch": 
0.8423864648263579, + "ewc_loss": 0.007152234204113483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.152234320528805e-05, + "grad_norm": 3.580017328262329, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.879024863243103, + "num_tokens": 252686348.0, + "step": 6622 + }, + { + "epoch": 0.8425136751049485, + "ewc_loss": 0.007195726968348026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.195727084763348e-05, + "grad_norm": 3.5985467433929443, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8615951538085938, + "num_tokens": 252724316.0, + "step": 6623 + }, + { + "epoch": 0.842640885383539, + "ewc_loss": 0.007188577204942703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.18857700121589e-05, + "grad_norm": 3.5638930797576904, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8557873368263245, + "num_tokens": 252764612.0, + "step": 6624 + }, + { + "epoch": 0.8427680956621295, + "ewc_loss": 0.007156320381909609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.15632049832493e-05, + "grad_norm": 3.5538666248321533, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8468567132949829, + "num_tokens": 252809510.0, + "step": 6625 + }, + { + "epoch": 0.84289530594072, + "ewc_loss": 0.007166838739067316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.166838622651994e-05, + "grad_norm": 3.5963001251220703, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8634598851203918, + "num_tokens": 252847598.0, + "step": 6626 + }, + { + "epoch": 0.8430225162193106, + "ewc_loss": 0.007192922756075859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.192922930698842e-05, + "grad_norm": 3.617361068725586, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8519085049629211, + "num_tokens": 252884401.0, + "step": 6627 + }, + { + "epoch": 0.843149726497901, + "ewc_loss": 0.007171564269810915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.171564357122406e-05, 
+ "grad_norm": 3.5522172451019287, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8518757820129395, + "num_tokens": 252928348.0, + "step": 6628 + }, + { + "epoch": 0.8432769367764915, + "ewc_loss": 0.007136126980185509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.136126805562526e-05, + "grad_norm": 3.630366563796997, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8667564392089844, + "num_tokens": 252965084.0, + "step": 6629 + }, + { + "epoch": 0.843404147055082, + "ewc_loss": 0.007208953145891428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.208953320514411e-05, + "grad_norm": 3.5691330432891846, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8703702688217163, + "num_tokens": 253003697.0, + "step": 6630 + }, + { + "epoch": 0.8435313573336726, + "ewc_loss": 0.00713292695581913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.132926839403808e-05, + "grad_norm": 3.6437759399414062, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8525701761245728, + "num_tokens": 253037239.0, + "step": 6631 + }, + { + "epoch": 0.8436585676122631, + "ewc_loss": 0.007205367088317871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.20536700100638e-05, + "grad_norm": 3.568790912628174, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8578946590423584, + "num_tokens": 253078163.0, + "step": 6632 + }, + { + "epoch": 0.8437857778908536, + "ewc_loss": 0.007145076058804989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.145076233427972e-05, + "grad_norm": 3.556979179382324, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8648546934127808, + "num_tokens": 253118907.0, + "step": 6633 + }, + { + "epoch": 0.843912988169444, + "ewc_loss": 0.007171510718762875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.171510515036061e-05, + "grad_norm": 3.569355010986328, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8856390714645386, + 
"num_tokens": 253153132.0, + "step": 6634 + }, + { + "epoch": 0.8440401984480346, + "ewc_loss": 0.007189510390162468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.18951050657779e-05, + "grad_norm": 3.599262237548828, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8609012365341187, + "num_tokens": 253194446.0, + "step": 6635 + }, + { + "epoch": 0.8441674087266251, + "ewc_loss": 0.007178381085395813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.178381201811135e-05, + "grad_norm": 3.5748167037963867, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8702714443206787, + "num_tokens": 253230798.0, + "step": 6636 + }, + { + "epoch": 0.8442946190052156, + "ewc_loss": 0.007164092734456062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.164092676248401e-05, + "grad_norm": 3.574768543243408, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8585637211799622, + "num_tokens": 253269029.0, + "step": 6637 + }, + { + "epoch": 0.8444218292838062, + "ewc_loss": 0.007157758343964815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.157758227549493e-05, + "grad_norm": 3.524235248565674, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8613088130950928, + "num_tokens": 253310656.0, + "step": 6638 + }, + { + "epoch": 0.8445490395623967, + "ewc_loss": 0.0071334876120090485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.133487815735862e-05, + "grad_norm": 3.559215545654297, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8844375014305115, + "num_tokens": 253346882.0, + "step": 6639 + }, + { + "epoch": 0.8446762498409871, + "ewc_loss": 0.00717600854113698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.17600851203315e-05, + "grad_norm": 3.5736277103424072, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8567800521850586, + "num_tokens": 253389601.0, + "step": 6640 + }, + { + "epoch": 0.8448034601195776, + "ewc_loss": 0.007159743923693895, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.159743836382404e-05, + "grad_norm": 3.5800936222076416, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8659408092498779, + "num_tokens": 253428725.0, + "step": 6641 + }, + { + "epoch": 0.8449306703981682, + "ewc_loss": 0.007149207405745983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.149207522161305e-05, + "grad_norm": 3.597784996032715, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.881044864654541, + "num_tokens": 253461404.0, + "step": 6642 + }, + { + "epoch": 0.8450578806767587, + "ewc_loss": 0.0071518574841320515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.15185742592439e-05, + "grad_norm": 3.5600430965423584, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8700598478317261, + "num_tokens": 253500324.0, + "step": 6643 + }, + { + "epoch": 0.8451850909553492, + "ewc_loss": 0.007124775554984808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.124775584088638e-05, + "grad_norm": 3.587583541870117, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8539989590644836, + "num_tokens": 253542500.0, + "step": 6644 + }, + { + "epoch": 0.8453123012339397, + "ewc_loss": 0.007144932169467211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.144932169467211e-05, + "grad_norm": 3.5923609733581543, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8491121530532837, + "num_tokens": 253581887.0, + "step": 6645 + }, + { + "epoch": 0.8454395115125302, + "ewc_loss": 0.00712974090129137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.129740697564557e-05, + "grad_norm": 3.5728516578674316, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8749446868896484, + "num_tokens": 253616942.0, + "step": 6646 + }, + { + "epoch": 0.8455667217911207, + "ewc_loss": 0.007137489039450884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.137488864827901e-05, + "grad_norm": 3.6284751892089844, + 
"learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8606135249137878, + "num_tokens": 253652610.0, + "step": 6647 + }, + { + "epoch": 0.8456939320697112, + "ewc_loss": 0.007161976769566536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.161976827774197e-05, + "grad_norm": 3.608229398727417, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8631191253662109, + "num_tokens": 253686269.0, + "step": 6648 + }, + { + "epoch": 0.8458211423483017, + "ewc_loss": 0.00713467737659812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.13467743480578e-05, + "grad_norm": 3.60693097114563, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.861702561378479, + "num_tokens": 253723632.0, + "step": 6649 + }, + { + "epoch": 0.8459483526268923, + "ewc_loss": 0.00716062355786562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.160623499657959e-05, + "grad_norm": 3.6684510707855225, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8597002625465393, + "num_tokens": 253755990.0, + "step": 6650 + }, + { + "epoch": 0.8460755629054828, + "ewc_loss": 0.007198500446975231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.198500679805875e-05, + "grad_norm": 3.600139856338501, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8609504699707031, + "num_tokens": 253794200.0, + "step": 6651 + }, + { + "epoch": 0.8462027731840732, + "ewc_loss": 0.0071553755551576614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.155375351430848e-05, + "grad_norm": 3.537130117416382, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8765845894813538, + "num_tokens": 253840554.0, + "step": 6652 + }, + { + "epoch": 0.8463299834626637, + "ewc_loss": 0.007132204249501228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.13220433681272e-05, + "grad_norm": 3.6284754276275635, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8761472105979919, + "num_tokens": 253873642.0, + 
"step": 6653 + }, + { + "epoch": 0.8464571937412543, + "ewc_loss": 0.007219440769404173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.219440885819495e-05, + "grad_norm": 3.6592888832092285, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.871601939201355, + "num_tokens": 253911839.0, + "step": 6654 + }, + { + "epoch": 0.8465844040198448, + "ewc_loss": 0.0071993637830019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.199363608378917e-05, + "grad_norm": 3.6621580123901367, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8561511039733887, + "num_tokens": 253944649.0, + "step": 6655 + }, + { + "epoch": 0.8467116142984353, + "ewc_loss": 0.007197476923465729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.197476952569559e-05, + "grad_norm": 24.550989151000977, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.868472695350647, + "num_tokens": 253977739.0, + "step": 6656 + }, + { + "epoch": 0.8468388245770259, + "ewc_loss": 0.011760137043893337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 0.00011760136840166524, + "grad_norm": 5.12494421005249, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8499807119369507, + "num_tokens": 254021216.0, + "step": 6657 + }, + { + "epoch": 0.8469660348556163, + "ewc_loss": 0.009254572913050652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.25457279663533e-05, + "grad_norm": 3.3844492435455322, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8632234334945679, + "num_tokens": 254063209.0, + "step": 6658 + }, + { + "epoch": 0.8470932451342068, + "ewc_loss": 0.007588588632643223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.588588778162375e-05, + "grad_norm": 4.304281711578369, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8623250722885132, + "num_tokens": 254102325.0, + "step": 6659 + }, + { + "epoch": 0.8472204554127973, + "ewc_loss": 0.009539425373077393, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 9.53942580963485e-05, + "grad_norm": 4.247282028198242, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8708109855651855, + "num_tokens": 254136991.0, + "step": 6660 + }, + { + "epoch": 0.8473476656913879, + "ewc_loss": 0.0086127994582057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612799865659326e-05, + "grad_norm": 3.8571114540100098, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8657504320144653, + "num_tokens": 254179520.0, + "step": 6661 + }, + { + "epoch": 0.8474748759699784, + "ewc_loss": 0.007924864999949932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924864621600136e-05, + "grad_norm": 3.9032113552093506, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8666990995407104, + "num_tokens": 254220661.0, + "step": 6662 + }, + { + "epoch": 0.8476020862485689, + "ewc_loss": 0.008160277269780636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.16027750261128e-05, + "grad_norm": 3.755676031112671, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8737812042236328, + "num_tokens": 254261853.0, + "step": 6663 + }, + { + "epoch": 0.8477292965271594, + "ewc_loss": 0.007891945540904999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.891945278970525e-05, + "grad_norm": 3.8938515186309814, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8674211502075195, + "num_tokens": 254301577.0, + "step": 6664 + }, + { + "epoch": 0.8478565068057499, + "ewc_loss": 0.007877105847001076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.877105963416398e-05, + "grad_norm": 3.731776714324951, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8786961436271667, + "num_tokens": 254341792.0, + "step": 6665 + }, + { + "epoch": 0.8479837170843404, + "ewc_loss": 0.007642910350114107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.642910350114107e-05, + "grad_norm": 3.8064980506896973, + "learning_rate": 1e-06, + "loss": 0.4334, 
+ "mean_token_accuracy": 0.8537427186965942, + "num_tokens": 254376755.0, + "step": 6666 + }, + { + "epoch": 0.8481109273629309, + "ewc_loss": 0.00769409304484725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.69409307395108e-05, + "grad_norm": 3.7391722202301025, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8779526948928833, + "num_tokens": 254413565.0, + "step": 6667 + }, + { + "epoch": 0.8482381376415215, + "ewc_loss": 0.0075329807586967945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.532980816904455e-05, + "grad_norm": 3.726318359375, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8564391136169434, + "num_tokens": 254449889.0, + "step": 6668 + }, + { + "epoch": 0.848365347920112, + "ewc_loss": 0.007505727466195822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.505727262469009e-05, + "grad_norm": 3.671893358230591, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8809559345245361, + "num_tokens": 254488129.0, + "step": 6669 + }, + { + "epoch": 0.8484925581987025, + "ewc_loss": 0.007410156074911356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.410156104015186e-05, + "grad_norm": 3.737734794616699, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8756898641586304, + "num_tokens": 254522031.0, + "step": 6670 + }, + { + "epoch": 0.8486197684772929, + "ewc_loss": 0.0074257333762943745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.425733201671392e-05, + "grad_norm": 3.661078453063965, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8773552179336548, + "num_tokens": 254559249.0, + "step": 6671 + }, + { + "epoch": 0.8487469787558835, + "ewc_loss": 0.007315348833799362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.315348921110854e-05, + "grad_norm": 3.643951416015625, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8599802255630493, + "num_tokens": 254603968.0, + "step": 6672 + }, + { + "epoch": 
0.848874189034474, + "ewc_loss": 0.007303270045667887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.303270103875548e-05, + "grad_norm": 3.6003832817077637, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8754885196685791, + "num_tokens": 254642178.0, + "step": 6673 + }, + { + "epoch": 0.8490013993130645, + "ewc_loss": 0.007253395393490791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.253395597217605e-05, + "grad_norm": 3.6414997577667236, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8586426377296448, + "num_tokens": 254679129.0, + "step": 6674 + }, + { + "epoch": 0.849128609591655, + "ewc_loss": 0.007260972633957863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.260972779477015e-05, + "grad_norm": 3.7325522899627686, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8382247686386108, + "num_tokens": 254714952.0, + "step": 6675 + }, + { + "epoch": 0.8492558198702456, + "ewc_loss": 0.007298742420971394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.298742275452241e-05, + "grad_norm": 3.7486026287078857, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8630231618881226, + "num_tokens": 254743323.0, + "step": 6676 + }, + { + "epoch": 0.849383030148836, + "ewc_loss": 0.0072611370123922825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.261137216119096e-05, + "grad_norm": 3.538667678833008, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8726080656051636, + "num_tokens": 254784749.0, + "step": 6677 + }, + { + "epoch": 0.8495102404274265, + "ewc_loss": 0.007145026698708534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.145026756916195e-05, + "grad_norm": 3.6132426261901855, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.871264636516571, + "num_tokens": 254820102.0, + "step": 6678 + }, + { + "epoch": 0.849637450706017, + "ewc_loss": 0.007262250408530235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.262250437634066e-05, + "grad_norm": 3.6111092567443848, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8616787195205688, + "num_tokens": 254859988.0, + "step": 6679 + }, + { + "epoch": 0.8497646609846076, + "ewc_loss": 0.007200070656836033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.200070831459016e-05, + "grad_norm": 3.5472922325134277, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.862102210521698, + "num_tokens": 254906792.0, + "step": 6680 + }, + { + "epoch": 0.8498918712631981, + "ewc_loss": 0.00716979568824172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.169795571826398e-05, + "grad_norm": 3.63354229927063, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8686144948005676, + "num_tokens": 254941552.0, + "step": 6681 + }, + { + "epoch": 0.8500190815417886, + "ewc_loss": 0.007243895437568426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.243895379360765e-05, + "grad_norm": 3.7007856369018555, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8595084547996521, + "num_tokens": 254974653.0, + "step": 6682 + }, + { + "epoch": 0.850146291820379, + "ewc_loss": 0.007220656611025333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.220656698336825e-05, + "grad_norm": 3.5246543884277344, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8508173823356628, + "num_tokens": 255018418.0, + "step": 6683 + }, + { + "epoch": 0.8502735020989696, + "ewc_loss": 0.007126852869987488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.126852869987488e-05, + "grad_norm": 3.6685216426849365, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8721657395362854, + "num_tokens": 255049626.0, + "step": 6684 + }, + { + "epoch": 0.8504007123775601, + "ewc_loss": 0.007261904887855053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.261904829647392e-05, + "grad_norm": 3.5915229320526123, + "learning_rate": 1e-06, + "loss": 0.4143, + 
"mean_token_accuracy": 0.858271598815918, + "num_tokens": 255086323.0, + "step": 6685 + }, + { + "epoch": 0.8505279226561506, + "ewc_loss": 0.007151768542826176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.151768659241498e-05, + "grad_norm": 3.589081048965454, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8738707304000854, + "num_tokens": 255124102.0, + "step": 6686 + }, + { + "epoch": 0.8506551329347412, + "ewc_loss": 0.007190376985818148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.190377073129639e-05, + "grad_norm": 3.6199448108673096, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8715114593505859, + "num_tokens": 255162176.0, + "step": 6687 + }, + { + "epoch": 0.8507823432133317, + "ewc_loss": 0.007221313659101725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.221313717309386e-05, + "grad_norm": 3.637768507003784, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8693943023681641, + "num_tokens": 255196256.0, + "step": 6688 + }, + { + "epoch": 0.8509095534919221, + "ewc_loss": 0.007205287925899029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.205287693068385e-05, + "grad_norm": 3.5845015048980713, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8597416877746582, + "num_tokens": 255236752.0, + "step": 6689 + }, + { + "epoch": 0.8510367637705126, + "ewc_loss": 0.0071749817579984665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.174981874413788e-05, + "grad_norm": 3.6683542728424072, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8686295747756958, + "num_tokens": 255268694.0, + "step": 6690 + }, + { + "epoch": 0.8511639740491032, + "ewc_loss": 0.007250957190990448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.250957423821092e-05, + "grad_norm": 3.5643131732940674, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8726086020469666, + "num_tokens": 255309445.0, + "step": 6691 + }, + { + "epoch": 
0.8512911843276937, + "ewc_loss": 0.00714327534660697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.143275433918461e-05, + "grad_norm": 3.5904507637023926, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8694223165512085, + "num_tokens": 255341126.0, + "step": 6692 + }, + { + "epoch": 0.8514183946062842, + "ewc_loss": 0.007225350476801395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.225350418593735e-05, + "grad_norm": 3.553605318069458, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8758092522621155, + "num_tokens": 255379553.0, + "step": 6693 + }, + { + "epoch": 0.8515456048848747, + "ewc_loss": 0.007187165319919586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.187165465438738e-05, + "grad_norm": 3.5736804008483887, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8671882748603821, + "num_tokens": 255416588.0, + "step": 6694 + }, + { + "epoch": 0.8516728151634652, + "ewc_loss": 0.0072036199271678925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.203620043583214e-05, + "grad_norm": 3.6376266479492188, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8541268110275269, + "num_tokens": 255449574.0, + "step": 6695 + }, + { + "epoch": 0.8518000254420557, + "ewc_loss": 0.007247488014400005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.247488247230649e-05, + "grad_norm": 3.554056167602539, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8644305467605591, + "num_tokens": 255494088.0, + "step": 6696 + }, + { + "epoch": 0.8519272357206462, + "ewc_loss": 0.007174232974648476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.174233178375289e-05, + "grad_norm": 3.622138023376465, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8745624423027039, + "num_tokens": 255529785.0, + "step": 6697 + }, + { + "epoch": 0.8520544459992367, + "ewc_loss": 0.007249323185533285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.249323243740946e-05, + "grad_norm": 3.6181631088256836, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8527086973190308, + "num_tokens": 255566675.0, + "step": 6698 + }, + { + "epoch": 0.8521816562778273, + "ewc_loss": 0.007224136032164097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.224136061267927e-05, + "grad_norm": 3.5647194385528564, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8719589710235596, + "num_tokens": 255606502.0, + "step": 6699 + }, + { + "epoch": 0.8523088665564178, + "ewc_loss": 0.007207730785012245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.207730959635228e-05, + "grad_norm": 3.5830154418945312, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8573788404464722, + "num_tokens": 255649179.0, + "step": 6700 + }, + { + "epoch": 0.8524360768350082, + "ewc_loss": 0.00724255433306098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.242554420372471e-05, + "grad_norm": 3.6114590167999268, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8651531934738159, + "num_tokens": 255688190.0, + "step": 6701 + }, + { + "epoch": 0.8525632871135987, + "ewc_loss": 0.007262649945914745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.262649887707084e-05, + "grad_norm": 3.601846694946289, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8644567728042603, + "num_tokens": 255730401.0, + "step": 6702 + }, + { + "epoch": 0.8526904973921893, + "ewc_loss": 0.0072409347631037235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.240934792207554e-05, + "grad_norm": 3.6175007820129395, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.860859751701355, + "num_tokens": 255767881.0, + "step": 6703 + }, + { + "epoch": 0.8528177076707798, + "ewc_loss": 0.0072546107694506645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.254610682139173e-05, + "grad_norm": 3.585115909576416, + "learning_rate": 1e-06, + "loss": 0.4145, + 
"mean_token_accuracy": 0.8581247329711914, + "num_tokens": 255808347.0, + "step": 6704 + }, + { + "epoch": 0.8529449179493703, + "ewc_loss": 0.007222929038107395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.222928979899734e-05, + "grad_norm": 3.630741596221924, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8466589450836182, + "num_tokens": 255849538.0, + "step": 6705 + }, + { + "epoch": 0.8530721282279609, + "ewc_loss": 0.007260323967784643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.26032376405783e-05, + "grad_norm": 3.595304250717163, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8749507665634155, + "num_tokens": 255890228.0, + "step": 6706 + }, + { + "epoch": 0.8531993385065513, + "ewc_loss": 0.007206134032458067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.206133886938915e-05, + "grad_norm": 3.578376293182373, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8642602562904358, + "num_tokens": 255930189.0, + "step": 6707 + }, + { + "epoch": 0.8533265487851418, + "ewc_loss": 0.007216938771307468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.216938683995977e-05, + "grad_norm": 3.579007148742676, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8731529712677002, + "num_tokens": 255968310.0, + "step": 6708 + }, + { + "epoch": 0.8534537590637323, + "ewc_loss": 0.007214335259050131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.214335346361622e-05, + "grad_norm": 3.5822136402130127, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8780947923660278, + "num_tokens": 256005872.0, + "step": 6709 + }, + { + "epoch": 0.8535809693423229, + "ewc_loss": 0.007203833200037479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.20383322914131e-05, + "grad_norm": 3.577474594116211, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8710181713104248, + "num_tokens": 256045374.0, + "step": 6710 + }, + { + "epoch": 
0.8537081796209134, + "ewc_loss": 0.007178037893027067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.178037776611745e-05, + "grad_norm": 3.57092547416687, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8688508868217468, + "num_tokens": 256084874.0, + "step": 6711 + }, + { + "epoch": 0.8538353898995039, + "ewc_loss": 0.007173164281994104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.173164340201765e-05, + "grad_norm": 3.6116597652435303, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8600488901138306, + "num_tokens": 256122251.0, + "step": 6712 + }, + { + "epoch": 0.8539626001780944, + "ewc_loss": 0.007196180988103151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.196181104518473e-05, + "grad_norm": 3.6154112815856934, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8633613586425781, + "num_tokens": 256165025.0, + "step": 6713 + }, + { + "epoch": 0.8540898104566849, + "ewc_loss": 0.007171728648245335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.171728793764487e-05, + "grad_norm": 3.576230049133301, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8728759288787842, + "num_tokens": 256202220.0, + "step": 6714 + }, + { + "epoch": 0.8542170207352754, + "ewc_loss": 0.007142292335629463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.142292452044785e-05, + "grad_norm": 3.639451742172241, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8687072992324829, + "num_tokens": 256234036.0, + "step": 6715 + }, + { + "epoch": 0.8543442310138659, + "ewc_loss": 0.007202080450952053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.202080450952053e-05, + "grad_norm": 3.604259729385376, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8638463020324707, + "num_tokens": 256275253.0, + "step": 6716 + }, + { + "epoch": 0.8544714412924564, + "ewc_loss": 0.007153637707233429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.153637852752581e-05, + "grad_norm": 3.639129161834717, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8622665405273438, + "num_tokens": 256311296.0, + "step": 6717 + }, + { + "epoch": 0.854598651571047, + "ewc_loss": 0.007184942718595266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.184942660387605e-05, + "grad_norm": 3.5715246200561523, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.872316837310791, + "num_tokens": 256348149.0, + "step": 6718 + }, + { + "epoch": 0.8547258618496375, + "ewc_loss": 0.007130548357963562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.130548328859732e-05, + "grad_norm": 3.564565896987915, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8796568512916565, + "num_tokens": 256387861.0, + "step": 6719 + }, + { + "epoch": 0.8548530721282279, + "ewc_loss": 0.007142035756260157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.142035610741004e-05, + "grad_norm": 3.6002871990203857, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8748488426208496, + "num_tokens": 256427131.0, + "step": 6720 + }, + { + "epoch": 0.8549802824068184, + "ewc_loss": 0.007166699506342411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.166699651861563e-05, + "grad_norm": 3.5607492923736572, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8712239265441895, + "num_tokens": 256465041.0, + "step": 6721 + }, + { + "epoch": 0.855107492685409, + "ewc_loss": 0.007138390559703112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.138390355976298e-05, + "grad_norm": 3.5942275524139404, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8577418923377991, + "num_tokens": 256502576.0, + "step": 6722 + }, + { + "epoch": 0.8552347029639995, + "ewc_loss": 0.007175208069384098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.17520815669559e-05, + "grad_norm": 3.5871403217315674, + "learning_rate": 1e-06, + "loss": 0.3818, + 
"mean_token_accuracy": 0.8700708150863647, + "num_tokens": 256540493.0, + "step": 6723 + }, + { + "epoch": 0.85536191324259, + "ewc_loss": 0.007160812616348267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.160812674555928e-05, + "grad_norm": 3.633420705795288, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8490692377090454, + "num_tokens": 256578115.0, + "step": 6724 + }, + { + "epoch": 0.8554891235211806, + "ewc_loss": 0.007174382917582989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.174383063102141e-05, + "grad_norm": 3.592897653579712, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8738902807235718, + "num_tokens": 256615957.0, + "step": 6725 + }, + { + "epoch": 0.855616333799771, + "ewc_loss": 0.007143350318074226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.143350376281887e-05, + "grad_norm": 3.6376523971557617, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8702153563499451, + "num_tokens": 256649972.0, + "step": 6726 + }, + { + "epoch": 0.8557435440783615, + "ewc_loss": 0.007183493115007877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.18349328963086e-05, + "grad_norm": 3.5840067863464355, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8740139007568359, + "num_tokens": 256686683.0, + "step": 6727 + }, + { + "epoch": 0.855870754356952, + "ewc_loss": 0.007128345780074596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.128345896489918e-05, + "grad_norm": 3.6493613719940186, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.86534184217453, + "num_tokens": 256723587.0, + "step": 6728 + }, + { + "epoch": 0.8559979646355426, + "ewc_loss": 0.00717511959373951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.175119390012696e-05, + "grad_norm": 3.5880038738250732, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8668410778045654, + "num_tokens": 256765245.0, + "step": 6729 + }, + { + "epoch": 0.8561251749141331, 
+ "ewc_loss": 0.007137353532016277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.137353532016277e-05, + "grad_norm": 3.525251626968384, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8934363126754761, + "num_tokens": 256806930.0, + "step": 6730 + }, + { + "epoch": 0.8562523851927236, + "ewc_loss": 0.0071152979508042336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.115297921700403e-05, + "grad_norm": 3.6072261333465576, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8648759722709656, + "num_tokens": 256844568.0, + "step": 6731 + }, + { + "epoch": 0.856379595471314, + "ewc_loss": 0.007185779511928558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.185779395513237e-05, + "grad_norm": 3.668593406677246, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8664583563804626, + "num_tokens": 256877483.0, + "step": 6732 + }, + { + "epoch": 0.8565068057499046, + "ewc_loss": 0.007176095154136419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.176095095928758e-05, + "grad_norm": 3.5371954441070557, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8700728416442871, + "num_tokens": 256921139.0, + "step": 6733 + }, + { + "epoch": 0.8566340160284951, + "ewc_loss": 0.007083510048687458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.083509990479797e-05, + "grad_norm": 3.6361660957336426, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8665444254875183, + "num_tokens": 256957861.0, + "step": 6734 + }, + { + "epoch": 0.8567612263070856, + "ewc_loss": 0.007194725330919027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.194725185399875e-05, + "grad_norm": 3.5866172313690186, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8691525459289551, + "num_tokens": 256998582.0, + "step": 6735 + }, + { + "epoch": 0.8568884365856761, + "ewc_loss": 0.007119177374988794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.119177462300286e-05, + 
"grad_norm": 3.5812411308288574, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8654229640960693, + "num_tokens": 257037638.0, + "step": 6736 + }, + { + "epoch": 0.8570156468642667, + "ewc_loss": 0.007136145140975714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.136144995456561e-05, + "grad_norm": 3.5699715614318848, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8678321838378906, + "num_tokens": 257074427.0, + "step": 6737 + }, + { + "epoch": 0.8571428571428571, + "ewc_loss": 0.00712537532672286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.125375122996047e-05, + "grad_norm": 3.5821948051452637, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8697394132614136, + "num_tokens": 257113826.0, + "step": 6738 + }, + { + "epoch": 0.8572700674214476, + "ewc_loss": 0.007116641849279404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.116641791071743e-05, + "grad_norm": 3.634821891784668, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8542299270629883, + "num_tokens": 257152387.0, + "step": 6739 + }, + { + "epoch": 0.8573972777000382, + "ewc_loss": 0.0071598077192902565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.159807864809409e-05, + "grad_norm": 3.584779739379883, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8685354590415955, + "num_tokens": 257192083.0, + "step": 6740 + }, + { + "epoch": 0.8575244879786287, + "ewc_loss": 0.007100232876837254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.100233051460236e-05, + "grad_norm": 3.580522060394287, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8593917489051819, + "num_tokens": 257235425.0, + "step": 6741 + }, + { + "epoch": 0.8576516982572192, + "ewc_loss": 0.007132105529308319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.132105383789167e-05, + "grad_norm": 3.627342939376831, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8620644211769104, 
+ "num_tokens": 257268621.0, + "step": 6742 + }, + { + "epoch": 0.8577789085358097, + "ewc_loss": 0.007151601370424032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.151601312216371e-05, + "grad_norm": 3.55843186378479, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8709083795547485, + "num_tokens": 257306909.0, + "step": 6743 + }, + { + "epoch": 0.8579061188144002, + "ewc_loss": 0.007093828171491623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.09382802597247e-05, + "grad_norm": 3.5915098190307617, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8808265924453735, + "num_tokens": 257343388.0, + "step": 6744 + }, + { + "epoch": 0.8580333290929907, + "ewc_loss": 0.00714036775752902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.140367961255834e-05, + "grad_norm": 3.6255061626434326, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8729889392852783, + "num_tokens": 257377172.0, + "step": 6745 + }, + { + "epoch": 0.8581605393715812, + "ewc_loss": 0.007140918634831905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.140918751247227e-05, + "grad_norm": 3.618039846420288, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8596241474151611, + "num_tokens": 257410312.0, + "step": 6746 + }, + { + "epoch": 0.8582877496501717, + "ewc_loss": 0.007136449683457613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.136449858080596e-05, + "grad_norm": 3.5699503421783447, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8702471256256104, + "num_tokens": 257449919.0, + "step": 6747 + }, + { + "epoch": 0.8584149599287623, + "ewc_loss": 0.007119984365999699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.119984365999699e-05, + "grad_norm": 3.5772643089294434, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8776698112487793, + "num_tokens": 257488296.0, + "step": 6748 + }, + { + "epoch": 0.8585421702073528, + "ewc_loss": 0.007147552911192179, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.14755296939984e-05, + "grad_norm": 3.6241331100463867, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8743857145309448, + "num_tokens": 257524678.0, + "step": 6749 + }, + { + "epoch": 0.8586693804859432, + "ewc_loss": 0.007179305888712406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.179305976023898e-05, + "grad_norm": 3.6344501972198486, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8720963001251221, + "num_tokens": 257559060.0, + "step": 6750 + }, + { + "epoch": 0.8587965907645337, + "ewc_loss": 0.007177678868174553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.177679071901366e-05, + "grad_norm": 3.5855836868286133, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8771238327026367, + "num_tokens": 257594340.0, + "step": 6751 + }, + { + "epoch": 0.8589238010431243, + "ewc_loss": 0.00716088991612196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.160889799706638e-05, + "grad_norm": 3.5988759994506836, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8787918090820312, + "num_tokens": 257627936.0, + "step": 6752 + }, + { + "epoch": 0.8590510113217148, + "ewc_loss": 0.007192975375801325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.192975317593664e-05, + "grad_norm": 3.655036211013794, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.864130973815918, + "num_tokens": 257660311.0, + "step": 6753 + }, + { + "epoch": 0.8591782216003053, + "ewc_loss": 0.007216545753180981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.216545782284811e-05, + "grad_norm": 3.5962026119232178, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8592712879180908, + "num_tokens": 257697496.0, + "step": 6754 + }, + { + "epoch": 0.8593054318788959, + "ewc_loss": 0.007172164507210255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.172164623625576e-05, + "grad_norm": 3.6173300743103027, + 
"learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8495360612869263, + "num_tokens": 257737242.0, + "step": 6755 + }, + { + "epoch": 0.8594326421574863, + "ewc_loss": 0.007211382035166025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.211382035166025e-05, + "grad_norm": 3.592867374420166, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8679657578468323, + "num_tokens": 257772946.0, + "step": 6756 + }, + { + "epoch": 0.8595598524360768, + "ewc_loss": 0.007195921149104834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.195921352831647e-05, + "grad_norm": 3.6631479263305664, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8598717451095581, + "num_tokens": 257807198.0, + "step": 6757 + }, + { + "epoch": 0.8596870627146673, + "ewc_loss": 0.007250839378684759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.250839553307742e-05, + "grad_norm": 3.5221164226531982, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8739945292472839, + "num_tokens": 257848618.0, + "step": 6758 + }, + { + "epoch": 0.8598142729932579, + "ewc_loss": 0.007154066581279039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.154066406656057e-05, + "grad_norm": 3.5715622901916504, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8800332546234131, + "num_tokens": 257887805.0, + "step": 6759 + }, + { + "epoch": 0.8599414832718484, + "ewc_loss": 0.007244723383337259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.244723383337259e-05, + "grad_norm": 3.56714129447937, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8730757832527161, + "num_tokens": 257929276.0, + "step": 6760 + }, + { + "epoch": 0.8600686935504389, + "ewc_loss": 0.007203260436654091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.203260611277074e-05, + "grad_norm": 3.6809847354888916, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8639435172080994, + "num_tokens": 257958629.0, + 
"step": 6761 + }, + { + "epoch": 0.8601959038290294, + "ewc_loss": 0.007295627612620592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.29562743799761e-05, + "grad_norm": 3.6890783309936523, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8648955821990967, + "num_tokens": 257990303.0, + "step": 6762 + }, + { + "epoch": 0.8603231141076199, + "ewc_loss": 0.007266389671713114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.266389729920775e-05, + "grad_norm": 3.6291420459747314, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8507720232009888, + "num_tokens": 258028704.0, + "step": 6763 + }, + { + "epoch": 0.8604503243862104, + "ewc_loss": 0.007219444029033184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.21944379620254e-05, + "grad_norm": 3.565873622894287, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8773197531700134, + "num_tokens": 258062757.0, + "step": 6764 + }, + { + "epoch": 0.8605775346648009, + "ewc_loss": 0.0072264280170202255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.226427987916395e-05, + "grad_norm": 3.6016201972961426, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8670781850814819, + "num_tokens": 258098574.0, + "step": 6765 + }, + { + "epoch": 0.8607047449433914, + "ewc_loss": 0.007263884413987398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.263884617714211e-05, + "grad_norm": 3.646711826324463, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8750385046005249, + "num_tokens": 258130937.0, + "step": 6766 + }, + { + "epoch": 0.860831955221982, + "ewc_loss": 0.007280696649104357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.280696445377544e-05, + "grad_norm": 3.601062774658203, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.869175910949707, + "num_tokens": 258165142.0, + "step": 6767 + }, + { + "epoch": 0.8609591655005725, + "ewc_loss": 0.007238271180540323, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.238271064125001e-05, + "grad_norm": 3.523242473602295, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8835197687149048, + "num_tokens": 258203986.0, + "step": 6768 + }, + { + "epoch": 0.8610863757791629, + "ewc_loss": 0.00720800319686532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.208003080449998e-05, + "grad_norm": 3.5922021865844727, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8674726486206055, + "num_tokens": 258238692.0, + "step": 6769 + }, + { + "epoch": 0.8612135860577534, + "ewc_loss": 0.007287247572094202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.287247717613354e-05, + "grad_norm": 3.6193175315856934, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8461391925811768, + "num_tokens": 258280942.0, + "step": 6770 + }, + { + "epoch": 0.861340796336344, + "ewc_loss": 0.0072655705735087395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.265570457093418e-05, + "grad_norm": 3.5703787803649902, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8699666261672974, + "num_tokens": 258318835.0, + "step": 6771 + }, + { + "epoch": 0.8614680066149345, + "ewc_loss": 0.0072308792732656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.230879418784752e-05, + "grad_norm": 3.5998926162719727, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8585696816444397, + "num_tokens": 258359461.0, + "step": 6772 + }, + { + "epoch": 0.861595216893525, + "ewc_loss": 0.007270029280334711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.270029163919389e-05, + "grad_norm": 3.616534948348999, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8637200593948364, + "num_tokens": 258394582.0, + "step": 6773 + }, + { + "epoch": 0.8617224271721156, + "ewc_loss": 0.007261047139763832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.26104699424468e-05, + "grad_norm": 3.563823699951172, + "learning_rate": 1e-06, + "loss": 0.3808, 
+ "mean_token_accuracy": 0.8715761303901672, + "num_tokens": 258438526.0, + "step": 6774 + }, + { + "epoch": 0.861849637450706, + "ewc_loss": 0.0072149718180298805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.214971992652863e-05, + "grad_norm": 3.6827266216278076, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.85533607006073, + "num_tokens": 258479332.0, + "step": 6775 + }, + { + "epoch": 0.8619768477292965, + "ewc_loss": 0.0072994050569832325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.299405115190893e-05, + "grad_norm": 3.669201374053955, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8585340976715088, + "num_tokens": 258513202.0, + "step": 6776 + }, + { + "epoch": 0.862104058007887, + "ewc_loss": 0.007241890300065279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.241890125442296e-05, + "grad_norm": 3.61143159866333, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8687866926193237, + "num_tokens": 258549145.0, + "step": 6777 + }, + { + "epoch": 0.8622312682864776, + "ewc_loss": 0.007213575765490532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.213575736386701e-05, + "grad_norm": 3.5764944553375244, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8597691655158997, + "num_tokens": 258586421.0, + "step": 6778 + }, + { + "epoch": 0.8623584785650681, + "ewc_loss": 0.007218660321086645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.218660175567493e-05, + "grad_norm": 3.6285905838012695, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8678873777389526, + "num_tokens": 258620014.0, + "step": 6779 + }, + { + "epoch": 0.8624856888436586, + "ewc_loss": 0.007255856413394213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.255856326082721e-05, + "grad_norm": 3.5630404949188232, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8712716698646545, + "num_tokens": 258656124.0, + "step": 6780 + }, + { + "epoch": 
0.862612899122249, + "ewc_loss": 0.0072017572820186615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.201757398433983e-05, + "grad_norm": 3.5888006687164307, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8573443293571472, + "num_tokens": 258692537.0, + "step": 6781 + }, + { + "epoch": 0.8627401094008396, + "ewc_loss": 0.00726404320448637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.2640432335902e-05, + "grad_norm": 3.567443370819092, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8706226348876953, + "num_tokens": 258732737.0, + "step": 6782 + }, + { + "epoch": 0.8628673196794301, + "ewc_loss": 0.007235521450638771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.235521479742602e-05, + "grad_norm": 3.5976221561431885, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.855340838432312, + "num_tokens": 258770634.0, + "step": 6783 + }, + { + "epoch": 0.8629945299580206, + "ewc_loss": 0.00727760000154376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.277599797816947e-05, + "grad_norm": 3.6064844131469727, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8665353655815125, + "num_tokens": 258809643.0, + "step": 6784 + }, + { + "epoch": 0.8631217402366111, + "ewc_loss": 0.007259928621351719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.25992867955938e-05, + "grad_norm": 3.597538709640503, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8738230466842651, + "num_tokens": 258845336.0, + "step": 6785 + }, + { + "epoch": 0.8632489505152017, + "ewc_loss": 0.007250803988426924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.250803901115432e-05, + "grad_norm": 3.6836438179016113, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.868270754814148, + "num_tokens": 258882206.0, + "step": 6786 + }, + { + "epoch": 0.8633761607937921, + "ewc_loss": 0.007298165932297707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.298166019609198e-05, + "grad_norm": 3.62976336479187, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8739690780639648, + "num_tokens": 258918661.0, + "step": 6787 + }, + { + "epoch": 0.8635033710723826, + "ewc_loss": 0.007224631495773792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.224631553981453e-05, + "grad_norm": 3.5824508666992188, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.863203227519989, + "num_tokens": 258957452.0, + "step": 6788 + }, + { + "epoch": 0.8636305813509731, + "ewc_loss": 0.007229828275740147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.229828042909503e-05, + "grad_norm": 3.5418994426727295, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8583887815475464, + "num_tokens": 259003507.0, + "step": 6789 + }, + { + "epoch": 0.8637577916295637, + "ewc_loss": 0.00721552362665534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.215523510240018e-05, + "grad_norm": 3.589707136154175, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8618580102920532, + "num_tokens": 259042284.0, + "step": 6790 + }, + { + "epoch": 0.8638850019081542, + "ewc_loss": 0.007241633255034685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.241633284138516e-05, + "grad_norm": 3.567474842071533, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8802601099014282, + "num_tokens": 259082929.0, + "step": 6791 + }, + { + "epoch": 0.8640122121867447, + "ewc_loss": 0.007200353313237429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.200353138614446e-05, + "grad_norm": 3.533482074737549, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.880856990814209, + "num_tokens": 259125920.0, + "step": 6792 + }, + { + "epoch": 0.8641394224653351, + "ewc_loss": 0.0071916584856808186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.191658369265497e-05, + "grad_norm": 3.6131844520568848, + "learning_rate": 1e-06, + "loss": 0.3803, + 
"mean_token_accuracy": 0.8697562217712402, + "num_tokens": 259163140.0, + "step": 6793 + }, + { + "epoch": 0.8642666327439257, + "ewc_loss": 0.007248471956700087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.248471956700087e-05, + "grad_norm": 3.5953094959259033, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8649197816848755, + "num_tokens": 259199038.0, + "step": 6794 + }, + { + "epoch": 0.8643938430225162, + "ewc_loss": 0.007192079443484545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.192079647211358e-05, + "grad_norm": 3.620927095413208, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8650556802749634, + "num_tokens": 259235713.0, + "step": 6795 + }, + { + "epoch": 0.8645210533011067, + "ewc_loss": 0.0072133601643145084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.213360368041322e-05, + "grad_norm": 3.571152448654175, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.856528103351593, + "num_tokens": 259278314.0, + "step": 6796 + }, + { + "epoch": 0.8646482635796973, + "ewc_loss": 0.007174681406468153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.174681377364323e-05, + "grad_norm": 3.637375831604004, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8757712841033936, + "num_tokens": 259313039.0, + "step": 6797 + }, + { + "epoch": 0.8647754738582878, + "ewc_loss": 0.007227408699691296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.227408787002787e-05, + "grad_norm": 3.590939998626709, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8742008805274963, + "num_tokens": 259348845.0, + "step": 6798 + }, + { + "epoch": 0.8649026841368782, + "ewc_loss": 0.007174025289714336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.174025085987523e-05, + "grad_norm": 3.6042087078094482, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8618757724761963, + "num_tokens": 259387345.0, + "step": 6799 + }, + { + "epoch": 
0.8650298944154687, + "ewc_loss": 0.007204036228358746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.204036228358746e-05, + "grad_norm": 3.558438777923584, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8666142225265503, + "num_tokens": 259427835.0, + "step": 6800 + }, + { + "epoch": 0.8651571046940593, + "ewc_loss": 0.007162879686802626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.162879774114117e-05, + "grad_norm": 3.6553428173065186, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8715163469314575, + "num_tokens": 259460453.0, + "step": 6801 + }, + { + "epoch": 0.8652843149726498, + "ewc_loss": 0.007236751727759838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.23675184417516e-05, + "grad_norm": 3.6742005348205566, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8732935190200806, + "num_tokens": 259499296.0, + "step": 6802 + }, + { + "epoch": 0.8654115252512403, + "ewc_loss": 0.007215864025056362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.215864025056362e-05, + "grad_norm": 3.591984272003174, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8659188151359558, + "num_tokens": 259537649.0, + "step": 6803 + }, + { + "epoch": 0.8655387355298308, + "ewc_loss": 0.0071569993160665035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.156999345170334e-05, + "grad_norm": 3.538231134414673, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8600319623947144, + "num_tokens": 259583136.0, + "step": 6804 + }, + { + "epoch": 0.8656659458084213, + "ewc_loss": 0.007150894030928612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.150894089136273e-05, + "grad_norm": 3.604572057723999, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8707607984542847, + "num_tokens": 259620422.0, + "step": 6805 + }, + { + "epoch": 0.8657931560870118, + "ewc_loss": 0.0072043417021632195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.204341818578541e-05, + "grad_norm": 3.603731870651245, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8557467460632324, + "num_tokens": 259663948.0, + "step": 6806 + }, + { + "epoch": 0.8659203663656023, + "ewc_loss": 0.007172374054789543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.172374171204865e-05, + "grad_norm": 3.557429552078247, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8684585690498352, + "num_tokens": 259701986.0, + "step": 6807 + }, + { + "epoch": 0.8660475766441929, + "ewc_loss": 0.007153271231800318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.153271144488826e-05, + "grad_norm": 3.5425968170166016, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8808077573776245, + "num_tokens": 259740684.0, + "step": 6808 + }, + { + "epoch": 0.8661747869227834, + "ewc_loss": 0.007163968402892351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.163968257373199e-05, + "grad_norm": 3.6240763664245605, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.851243793964386, + "num_tokens": 259781659.0, + "step": 6809 + }, + { + "epoch": 0.8663019972013739, + "ewc_loss": 0.007206972688436508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.20697280485183e-05, + "grad_norm": 3.632110357284546, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8603774309158325, + "num_tokens": 259818554.0, + "step": 6810 + }, + { + "epoch": 0.8664292074799644, + "ewc_loss": 0.007180860266089439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.180860120570287e-05, + "grad_norm": 3.5946288108825684, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8692799806594849, + "num_tokens": 259856652.0, + "step": 6811 + }, + { + "epoch": 0.8665564177585549, + "ewc_loss": 0.007162729278206825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.162729161791503e-05, + "grad_norm": 3.5702223777770996, + "learning_rate": 1e-06, + "loss": 0.3453, + 
"mean_token_accuracy": 0.8843761682510376, + "num_tokens": 259898255.0, + "step": 6812 + }, + { + "epoch": 0.8666836280371454, + "ewc_loss": 0.007157484535127878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.1574846515432e-05, + "grad_norm": 3.6539673805236816, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8718289136886597, + "num_tokens": 259935305.0, + "step": 6813 + }, + { + "epoch": 0.8668108383157359, + "ewc_loss": 0.007215338759124279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.215338700916618e-05, + "grad_norm": 3.641172409057617, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8705611824989319, + "num_tokens": 259975245.0, + "step": 6814 + }, + { + "epoch": 0.8669380485943264, + "ewc_loss": 0.007161704823374748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.161704706959426e-05, + "grad_norm": 3.5438601970672607, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8717162609100342, + "num_tokens": 260014150.0, + "step": 6815 + }, + { + "epoch": 0.867065258872917, + "ewc_loss": 0.00711058871820569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.110588921932504e-05, + "grad_norm": 3.6346487998962402, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8582691550254822, + "num_tokens": 260054655.0, + "step": 6816 + }, + { + "epoch": 0.8671924691515075, + "ewc_loss": 0.007198250386863947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.198250386863947e-05, + "grad_norm": 3.6461727619171143, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.867354154586792, + "num_tokens": 260091528.0, + "step": 6817 + }, + { + "epoch": 0.8673196794300979, + "ewc_loss": 0.007168613839894533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.168613956309855e-05, + "grad_norm": 3.622364044189453, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8841887712478638, + "num_tokens": 260123069.0, + "step": 6818 + }, + { + "epoch": 
0.8674468897086884, + "ewc_loss": 0.007138984277844429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.138984074117616e-05, + "grad_norm": 3.745485544204712, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8522248268127441, + "num_tokens": 260154482.0, + "step": 6819 + }, + { + "epoch": 0.867574099987279, + "ewc_loss": 0.0072434935718774796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.243493746500462e-05, + "grad_norm": 3.5736122131347656, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.869605541229248, + "num_tokens": 260193983.0, + "step": 6820 + }, + { + "epoch": 0.8677013102658695, + "ewc_loss": 0.007102776784449816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.102776726242155e-05, + "grad_norm": 3.642960786819458, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8697701096534729, + "num_tokens": 260228705.0, + "step": 6821 + }, + { + "epoch": 0.86782852054446, + "ewc_loss": 0.007227997295558453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.227997411973774e-05, + "grad_norm": 3.6891603469848633, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.866166353225708, + "num_tokens": 260262821.0, + "step": 6822 + }, + { + "epoch": 0.8679557308230506, + "ewc_loss": 0.007224902976304293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.224902947200462e-05, + "grad_norm": 3.5834498405456543, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8649977445602417, + "num_tokens": 260298949.0, + "step": 6823 + }, + { + "epoch": 0.868082941101641, + "ewc_loss": 0.007148407399654388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.148407166823745e-05, + "grad_norm": 3.5723495483398438, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8756561279296875, + "num_tokens": 260340028.0, + "step": 6824 + }, + { + "epoch": 0.8682101513802315, + "ewc_loss": 0.007193043828010559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.193043711595237e-05, + "grad_norm": 3.583125352859497, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8677950501441956, + "num_tokens": 260381328.0, + "step": 6825 + }, + { + "epoch": 0.868337361658822, + "ewc_loss": 0.007184267975389957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.18426817911677e-05, + "grad_norm": 3.5714097023010254, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.866551399230957, + "num_tokens": 260423421.0, + "step": 6826 + }, + { + "epoch": 0.8684645719374126, + "ewc_loss": 0.007182622328400612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.182622357504442e-05, + "grad_norm": 3.583101749420166, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8657014966011047, + "num_tokens": 260461985.0, + "step": 6827 + }, + { + "epoch": 0.8685917822160031, + "ewc_loss": 0.007197760511189699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.197760714916512e-05, + "grad_norm": 3.6250383853912354, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8516683578491211, + "num_tokens": 260503041.0, + "step": 6828 + }, + { + "epoch": 0.8687189924945936, + "ewc_loss": 0.007195988204330206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.195988291641697e-05, + "grad_norm": 3.5548930168151855, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8758482933044434, + "num_tokens": 260545553.0, + "step": 6829 + }, + { + "epoch": 0.868846202773184, + "ewc_loss": 0.0071366410702466965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.136641215765849e-05, + "grad_norm": 3.7275125980377197, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8747173547744751, + "num_tokens": 260577160.0, + "step": 6830 + }, + { + "epoch": 0.8689734130517746, + "ewc_loss": 0.007268467918038368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.268467743415385e-05, + "grad_norm": 3.58561635017395, + "learning_rate": 1e-06, + "loss": 0.3529, + 
"mean_token_accuracy": 0.8781575560569763, + "num_tokens": 260614748.0, + "step": 6831 + }, + { + "epoch": 0.8691006233303651, + "ewc_loss": 0.007118675392121077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.118675421224907e-05, + "grad_norm": 3.5757391452789307, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8669374585151672, + "num_tokens": 260651762.0, + "step": 6832 + }, + { + "epoch": 0.8692278336089556, + "ewc_loss": 0.0071779461577534676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.177946099545807e-05, + "grad_norm": 3.6063473224639893, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8558710813522339, + "num_tokens": 260694438.0, + "step": 6833 + }, + { + "epoch": 0.8693550438875461, + "ewc_loss": 0.007187713403254747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.187713345047086e-05, + "grad_norm": 3.61946177482605, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8757503628730774, + "num_tokens": 260730256.0, + "step": 6834 + }, + { + "epoch": 0.8694822541661367, + "ewc_loss": 0.007178598549216986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.178598752943799e-05, + "grad_norm": 3.636401891708374, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8463672399520874, + "num_tokens": 260766190.0, + "step": 6835 + }, + { + "epoch": 0.8696094644447271, + "ewc_loss": 0.007181013468652964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.181013643275946e-05, + "grad_norm": 3.598362445831299, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8806559443473816, + "num_tokens": 260801733.0, + "step": 6836 + }, + { + "epoch": 0.8697366747233176, + "ewc_loss": 0.007160565350204706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.160565291997045e-05, + "grad_norm": 3.522259473800659, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8790018558502197, + "num_tokens": 260843461.0, + "step": 6837 + }, + { + "epoch": 
0.8698638850019081, + "ewc_loss": 0.007142127491533756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.142127287806943e-05, + "grad_norm": 3.6631784439086914, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8655819296836853, + "num_tokens": 260875006.0, + "step": 6838 + }, + { + "epoch": 0.8699910952804987, + "ewc_loss": 0.007256472483277321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.256472599692643e-05, + "grad_norm": 3.5746960639953613, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8607120513916016, + "num_tokens": 260917553.0, + "step": 6839 + }, + { + "epoch": 0.8701183055590892, + "ewc_loss": 0.007143363356590271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.143363473005593e-05, + "grad_norm": 3.651906967163086, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8411448001861572, + "num_tokens": 260955293.0, + "step": 6840 + }, + { + "epoch": 0.8702455158376797, + "ewc_loss": 0.0072488486766815186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.248848851304501e-05, + "grad_norm": 3.7268855571746826, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8689534068107605, + "num_tokens": 260983966.0, + "step": 6841 + }, + { + "epoch": 0.8703727261162701, + "ewc_loss": 0.007277508266270161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.277508120751008e-05, + "grad_norm": 3.553912878036499, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.879581093788147, + "num_tokens": 261019434.0, + "step": 6842 + }, + { + "epoch": 0.8704999363948607, + "ewc_loss": 0.00716864038258791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.168640149757266e-05, + "grad_norm": 3.6104488372802734, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8572054505348206, + "num_tokens": 261053761.0, + "step": 6843 + }, + { + "epoch": 0.8706271466734512, + "ewc_loss": 0.007277211174368858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.27721126168035e-05, + "grad_norm": 3.5850398540496826, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8582669496536255, + "num_tokens": 261093490.0, + "step": 6844 + }, + { + "epoch": 0.8707543569520417, + "ewc_loss": 0.007244112901389599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.244112930493429e-05, + "grad_norm": 3.6228020191192627, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8682006597518921, + "num_tokens": 261129518.0, + "step": 6845 + }, + { + "epoch": 0.8708815672306323, + "ewc_loss": 0.007282422389835119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.282422302523628e-05, + "grad_norm": 3.604401111602783, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8493562936782837, + "num_tokens": 261164282.0, + "step": 6846 + }, + { + "epoch": 0.8710087775092228, + "ewc_loss": 0.007255122065544128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.25512218195945e-05, + "grad_norm": 3.584331512451172, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8474993705749512, + "num_tokens": 261205055.0, + "step": 6847 + }, + { + "epoch": 0.8711359877878132, + "ewc_loss": 0.0072600943967700005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.260094571392983e-05, + "grad_norm": 3.5646684169769287, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8617695569992065, + "num_tokens": 261242329.0, + "step": 6848 + }, + { + "epoch": 0.8712631980664037, + "ewc_loss": 0.007259615696966648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.25961581338197e-05, + "grad_norm": 3.6240897178649902, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8401682376861572, + "num_tokens": 261280330.0, + "step": 6849 + }, + { + "epoch": 0.8713904083449943, + "ewc_loss": 0.007312454748898745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.312454545171931e-05, + "grad_norm": 3.5482242107391357, + "learning_rate": 1e-06, + "loss": 0.3584, + 
"mean_token_accuracy": 0.8781594038009644, + "num_tokens": 261318232.0, + "step": 6850 + }, + { + "epoch": 0.8715176186235848, + "ewc_loss": 0.0072537013329565525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.2537011874374e-05, + "grad_norm": 3.6282055377960205, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8838533163070679, + "num_tokens": 261351483.0, + "step": 6851 + }, + { + "epoch": 0.8716448289021753, + "ewc_loss": 0.007310816552489996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.310816727112979e-05, + "grad_norm": 3.6249239444732666, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8696323037147522, + "num_tokens": 261387058.0, + "step": 6852 + }, + { + "epoch": 0.8717720391807658, + "ewc_loss": 0.007278015837073326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.278015982592478e-05, + "grad_norm": 3.4902262687683105, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8810669183731079, + "num_tokens": 261430110.0, + "step": 6853 + }, + { + "epoch": 0.8718992494593563, + "ewc_loss": 0.007213131058961153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.213131175376475e-05, + "grad_norm": 3.5731470584869385, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8739496469497681, + "num_tokens": 261470707.0, + "step": 6854 + }, + { + "epoch": 0.8720264597379468, + "ewc_loss": 0.007278301753103733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.278301927726716e-05, + "grad_norm": 3.6023478507995605, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8741052150726318, + "num_tokens": 261507146.0, + "step": 6855 + }, + { + "epoch": 0.8721536700165373, + "ewc_loss": 0.007253384683281183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.253384683281183e-05, + "grad_norm": 3.646649122238159, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.852891206741333, + "num_tokens": 261549723.0, + "step": 6856 + }, + { + "epoch": 
0.8722808802951278, + "ewc_loss": 0.007263252977281809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.2632530645933e-05, + "grad_norm": 3.5778422355651855, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8719873428344727, + "num_tokens": 261591030.0, + "step": 6857 + }, + { + "epoch": 0.8724080905737184, + "ewc_loss": 0.007199946325272322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.199946412583813e-05, + "grad_norm": 3.6038310527801514, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8763481378555298, + "num_tokens": 261623919.0, + "step": 6858 + }, + { + "epoch": 0.8725353008523089, + "ewc_loss": 0.007237816229462624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.237816316774115e-05, + "grad_norm": 3.647138833999634, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8832478523254395, + "num_tokens": 261655277.0, + "step": 6859 + }, + { + "epoch": 0.8726625111308994, + "ewc_loss": 0.0072470372542738914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.24703713785857e-05, + "grad_norm": 3.5987424850463867, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8536452651023865, + "num_tokens": 261697462.0, + "step": 6860 + }, + { + "epoch": 0.8727897214094898, + "ewc_loss": 0.0071806455962359905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.180645479820669e-05, + "grad_norm": 3.6133546829223633, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8638323545455933, + "num_tokens": 261739134.0, + "step": 6861 + }, + { + "epoch": 0.8729169316880804, + "ewc_loss": 0.00722680427134037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.226804154925048e-05, + "grad_norm": 3.602139711380005, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.871307909488678, + "num_tokens": 261776295.0, + "step": 6862 + }, + { + "epoch": 0.8730441419666709, + "ewc_loss": 0.007183755282312632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.183755224104971e-05, + "grad_norm": 3.615595579147339, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8721953630447388, + "num_tokens": 261812388.0, + "step": 6863 + }, + { + "epoch": 0.8731713522452614, + "ewc_loss": 0.007209086325019598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.20908647053875e-05, + "grad_norm": 3.5547778606414795, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8693978786468506, + "num_tokens": 261855537.0, + "step": 6864 + }, + { + "epoch": 0.873298562523852, + "ewc_loss": 0.007171136327087879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.171136530814692e-05, + "grad_norm": 3.547440528869629, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8761990070343018, + "num_tokens": 261899922.0, + "step": 6865 + }, + { + "epoch": 0.8734257728024425, + "ewc_loss": 0.007176090031862259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.176090002758428e-05, + "grad_norm": 3.665818214416504, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.878807783126831, + "num_tokens": 261934475.0, + "step": 6866 + }, + { + "epoch": 0.8735529830810329, + "ewc_loss": 0.007258594036102295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.258594268932939e-05, + "grad_norm": 3.611584424972534, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8675447702407837, + "num_tokens": 261973395.0, + "step": 6867 + }, + { + "epoch": 0.8736801933596234, + "ewc_loss": 0.007183181121945381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.183181151049212e-05, + "grad_norm": 3.632894515991211, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8509104251861572, + "num_tokens": 262011644.0, + "step": 6868 + }, + { + "epoch": 0.873807403638214, + "ewc_loss": 0.007213259115815163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.213259232230484e-05, + "grad_norm": 3.6653265953063965, + "learning_rate": 1e-06, + "loss": 0.3985, + 
"mean_token_accuracy": 0.8634112477302551, + "num_tokens": 262044038.0, + "step": 6869 + }, + { + "epoch": 0.8739346139168045, + "ewc_loss": 0.007231392897665501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.231393101392314e-05, + "grad_norm": 3.6340954303741455, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8673694729804993, + "num_tokens": 262081023.0, + "step": 6870 + }, + { + "epoch": 0.874061824195395, + "ewc_loss": 0.007193120662122965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.193120836745948e-05, + "grad_norm": 3.576925754547119, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8613070845603943, + "num_tokens": 262123981.0, + "step": 6871 + }, + { + "epoch": 0.8741890344739855, + "ewc_loss": 0.007177267223596573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.177267252700403e-05, + "grad_norm": 3.618476152420044, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8566573858261108, + "num_tokens": 262161542.0, + "step": 6872 + }, + { + "epoch": 0.874316244752576, + "ewc_loss": 0.007224309258162975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.224309229059145e-05, + "grad_norm": 3.5995681285858154, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8575931787490845, + "num_tokens": 262202482.0, + "step": 6873 + }, + { + "epoch": 0.8744434550311665, + "ewc_loss": 0.007181181106716394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.181180990301073e-05, + "grad_norm": 3.6099510192871094, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8705326318740845, + "num_tokens": 262239499.0, + "step": 6874 + }, + { + "epoch": 0.874570665309757, + "ewc_loss": 0.00721732946112752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.217329402919859e-05, + "grad_norm": 3.573549270629883, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8679916262626648, + "num_tokens": 262281324.0, + "step": 6875 + }, + { + "epoch": 
0.8746978755883476, + "ewc_loss": 0.0071858493611216545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.185849244706333e-05, + "grad_norm": 3.6280136108398438, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8585044145584106, + "num_tokens": 262318772.0, + "step": 6876 + }, + { + "epoch": 0.8748250858669381, + "ewc_loss": 0.0072500319220125675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.250031922012568e-05, + "grad_norm": 3.6072299480438232, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8801440000534058, + "num_tokens": 262353264.0, + "step": 6877 + }, + { + "epoch": 0.8749522961455286, + "ewc_loss": 0.007209395058453083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.209394971141592e-05, + "grad_norm": 3.618908405303955, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8654513359069824, + "num_tokens": 262393360.0, + "step": 6878 + }, + { + "epoch": 0.875079506424119, + "ewc_loss": 0.007228655740618706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.228655886137858e-05, + "grad_norm": 3.6142280101776123, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.878789484500885, + "num_tokens": 262432010.0, + "step": 6879 + }, + { + "epoch": 0.8752067167027096, + "ewc_loss": 0.007216041442006826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.216041558422148e-05, + "grad_norm": 3.6166489124298096, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8671733736991882, + "num_tokens": 262468828.0, + "step": 6880 + }, + { + "epoch": 0.8753339269813001, + "ewc_loss": 0.0072236922569572926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.223692227853462e-05, + "grad_norm": 3.5172646045684814, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8755239844322205, + "num_tokens": 262515141.0, + "step": 6881 + }, + { + "epoch": 0.8754611372598906, + "ewc_loss": 0.007168723735958338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.168723823269829e-05, + "grad_norm": 3.610643148422241, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8661110401153564, + "num_tokens": 262553137.0, + "step": 6882 + }, + { + "epoch": 0.8755883475384811, + "ewc_loss": 0.007265414576977491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.265414751600474e-05, + "grad_norm": 3.567211866378784, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8908302783966064, + "num_tokens": 262593847.0, + "step": 6883 + }, + { + "epoch": 0.8757155578170717, + "ewc_loss": 0.00718500092625618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.185000868048519e-05, + "grad_norm": 3.599870443344116, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8586908578872681, + "num_tokens": 262635048.0, + "step": 6884 + }, + { + "epoch": 0.8758427680956621, + "ewc_loss": 0.00723284762352705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.232847565319389e-05, + "grad_norm": 3.6332712173461914, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8602038025856018, + "num_tokens": 262671200.0, + "step": 6885 + }, + { + "epoch": 0.8759699783742526, + "ewc_loss": 0.0072243004105985165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.224300497910008e-05, + "grad_norm": 3.6077404022216797, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8706154227256775, + "num_tokens": 262708304.0, + "step": 6886 + }, + { + "epoch": 0.8760971886528431, + "ewc_loss": 0.007201958913356066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.201958942459896e-05, + "grad_norm": 3.5974483489990234, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8635762929916382, + "num_tokens": 262750680.0, + "step": 6887 + }, + { + "epoch": 0.8762243989314337, + "ewc_loss": 0.007195487152785063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.19548697816208e-05, + "grad_norm": 3.653425693511963, + "learning_rate": 1e-06, + "loss": 0.4104, + 
"mean_token_accuracy": 0.8611417412757874, + "num_tokens": 262784845.0, + "step": 6888 + }, + { + "epoch": 0.8763516092100242, + "ewc_loss": 0.0072424765676259995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.242476567626e-05, + "grad_norm": 3.628852367401123, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.868466317653656, + "num_tokens": 262822646.0, + "step": 6889 + }, + { + "epoch": 0.8764788194886147, + "ewc_loss": 0.0071952263824641705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.195226498879492e-05, + "grad_norm": 3.5614066123962402, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8638109564781189, + "num_tokens": 262863663.0, + "step": 6890 + }, + { + "epoch": 0.8766060297672051, + "ewc_loss": 0.0071893539279699326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.189354073489085e-05, + "grad_norm": 3.607759714126587, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.854108452796936, + "num_tokens": 262903726.0, + "step": 6891 + }, + { + "epoch": 0.8767332400457957, + "ewc_loss": 0.007228977978229523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.228978211060166e-05, + "grad_norm": 3.614556312561035, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8557615876197815, + "num_tokens": 262942748.0, + "step": 6892 + }, + { + "epoch": 0.8768604503243862, + "ewc_loss": 0.0072133480571210384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.213347998913378e-05, + "grad_norm": 3.696277141571045, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.847338855266571, + "num_tokens": 262977639.0, + "step": 6893 + }, + { + "epoch": 0.8769876606029767, + "ewc_loss": 0.007256516255438328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.256516255438328e-05, + "grad_norm": 3.5749824047088623, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8703976273536682, + "num_tokens": 263015970.0, + "step": 6894 + }, + { + "epoch": 
0.8771148708815673, + "ewc_loss": 0.007164386101067066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.164385897340253e-05, + "grad_norm": 3.513502836227417, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.876265823841095, + "num_tokens": 263058618.0, + "step": 6895 + }, + { + "epoch": 0.8772420811601578, + "ewc_loss": 0.007177351508289576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.177351653808728e-05, + "grad_norm": 3.64104962348938, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8525251150131226, + "num_tokens": 263098628.0, + "step": 6896 + }, + { + "epoch": 0.8773692914387482, + "ewc_loss": 0.007273136638104916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.273136725416407e-05, + "grad_norm": 3.6126153469085693, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8708802461624146, + "num_tokens": 263136793.0, + "step": 6897 + }, + { + "epoch": 0.8774965017173387, + "ewc_loss": 0.007188575342297554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.188575546024367e-05, + "grad_norm": 3.6144304275512695, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.868941068649292, + "num_tokens": 263170996.0, + "step": 6898 + }, + { + "epoch": 0.8776237119959293, + "ewc_loss": 0.007209520321339369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.209520117612556e-05, + "grad_norm": 3.7389438152313232, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8564451932907104, + "num_tokens": 263199961.0, + "step": 6899 + }, + { + "epoch": 0.8777509222745198, + "ewc_loss": 0.007291409652680159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.291409565368667e-05, + "grad_norm": 3.567779541015625, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8662434220314026, + "num_tokens": 263241541.0, + "step": 6900 + }, + { + "epoch": 0.8778781325531103, + "ewc_loss": 0.007150838617235422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.150838791858405e-05, + "grad_norm": 3.579089879989624, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8795356750488281, + "num_tokens": 263278778.0, + "step": 6901 + }, + { + "epoch": 0.8780053428317008, + "ewc_loss": 0.007214162964373827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.214162906166166e-05, + "grad_norm": 3.643766403198242, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8695189952850342, + "num_tokens": 263310155.0, + "step": 6902 + }, + { + "epoch": 0.8781325531102913, + "ewc_loss": 0.007253394927829504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.253394869621843e-05, + "grad_norm": 3.598100423812866, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8643954992294312, + "num_tokens": 263355793.0, + "step": 6903 + }, + { + "epoch": 0.8782597633888818, + "ewc_loss": 0.0071872081607580185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.187208393588662e-05, + "grad_norm": 3.5499155521392822, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8773515224456787, + "num_tokens": 263394446.0, + "step": 6904 + }, + { + "epoch": 0.8783869736674723, + "ewc_loss": 0.007178833242505789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.178833038778976e-05, + "grad_norm": 3.574625253677368, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8703691959381104, + "num_tokens": 263438479.0, + "step": 6905 + }, + { + "epoch": 0.8785141839460628, + "ewc_loss": 0.007226881105452776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.226881280075759e-05, + "grad_norm": 3.6296629905700684, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8540226817131042, + "num_tokens": 263478514.0, + "step": 6906 + }, + { + "epoch": 0.8786413942246534, + "ewc_loss": 0.007226444780826569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.226444722618908e-05, + "grad_norm": 3.6193528175354004, + "learning_rate": 1e-06, + "loss": 0.4665, + 
"mean_token_accuracy": 0.8494371175765991, + "num_tokens": 263516734.0, + "step": 6907 + }, + { + "epoch": 0.8787686045032439, + "ewc_loss": 0.007204605732113123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.204605935839936e-05, + "grad_norm": 3.554906129837036, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8566683530807495, + "num_tokens": 263560105.0, + "step": 6908 + }, + { + "epoch": 0.8788958147818343, + "ewc_loss": 0.00717676617205143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.176765939220786e-05, + "grad_norm": 3.628661632537842, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8699937462806702, + "num_tokens": 263597033.0, + "step": 6909 + }, + { + "epoch": 0.8790230250604248, + "ewc_loss": 0.007245162036269903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.245162123581395e-05, + "grad_norm": 3.6536834239959717, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8727988004684448, + "num_tokens": 263631106.0, + "step": 6910 + }, + { + "epoch": 0.8791502353390154, + "ewc_loss": 0.0072379764169454575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.237976387841627e-05, + "grad_norm": 3.582843542098999, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8618861436843872, + "num_tokens": 263668227.0, + "step": 6911 + }, + { + "epoch": 0.8792774456176059, + "ewc_loss": 0.007183112669736147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.183112757047638e-05, + "grad_norm": 3.6351232528686523, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8705235719680786, + "num_tokens": 263704335.0, + "step": 6912 + }, + { + "epoch": 0.8794046558961964, + "ewc_loss": 0.007247218396514654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.247218309203163e-05, + "grad_norm": 3.5690910816192627, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8732600212097168, + "num_tokens": 263742532.0, + "step": 6913 + }, + { + "epoch": 
0.879531866174787, + "ewc_loss": 0.007176951505243778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.176951476139948e-05, + "grad_norm": 3.588895559310913, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8684325218200684, + "num_tokens": 263782321.0, + "step": 6914 + }, + { + "epoch": 0.8796590764533775, + "ewc_loss": 0.007223535794764757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.223535794764757e-05, + "grad_norm": 3.6181716918945312, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8757580518722534, + "num_tokens": 263818037.0, + "step": 6915 + }, + { + "epoch": 0.8797862867319679, + "ewc_loss": 0.0072204130701720715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.22041295375675e-05, + "grad_norm": 3.6470401287078857, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8531162142753601, + "num_tokens": 263854901.0, + "step": 6916 + }, + { + "epoch": 0.8799134970105584, + "ewc_loss": 0.007238375023007393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.238375110318884e-05, + "grad_norm": 3.5588817596435547, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8722462058067322, + "num_tokens": 263896241.0, + "step": 6917 + }, + { + "epoch": 0.880040707289149, + "ewc_loss": 0.007181163411587477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.181163528002799e-05, + "grad_norm": 3.6170427799224854, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8684225678443909, + "num_tokens": 263934673.0, + "step": 6918 + }, + { + "epoch": 0.8801679175677395, + "ewc_loss": 0.007237104699015617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.237104728119448e-05, + "grad_norm": 3.611278772354126, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8587812185287476, + "num_tokens": 263973456.0, + "step": 6919 + }, + { + "epoch": 0.88029512784633, + "ewc_loss": 0.00720159150660038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.20159150660038e-05, + "grad_norm": 3.674455404281616, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8498537540435791, + "num_tokens": 264006050.0, + "step": 6920 + }, + { + "epoch": 0.8804223381249205, + "ewc_loss": 0.0072560496628284454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.256049866555259e-05, + "grad_norm": 3.6045141220092773, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8773345351219177, + "num_tokens": 264042736.0, + "step": 6921 + }, + { + "epoch": 0.880549548403511, + "ewc_loss": 0.007200588006526232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.200588152045384e-05, + "grad_norm": 3.5890591144561768, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8598352074623108, + "num_tokens": 264084433.0, + "step": 6922 + }, + { + "epoch": 0.8806767586821015, + "ewc_loss": 0.007212752941995859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.212752825580537e-05, + "grad_norm": 3.6287171840667725, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.875537633895874, + "num_tokens": 264118331.0, + "step": 6923 + }, + { + "epoch": 0.880803968960692, + "ewc_loss": 0.007229772862046957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.229772745631635e-05, + "grad_norm": 3.6411993503570557, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8692643046379089, + "num_tokens": 264152015.0, + "step": 6924 + }, + { + "epoch": 0.8809311792392825, + "ewc_loss": 0.0072272843681275845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.227284368127584e-05, + "grad_norm": 3.6230335235595703, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8727599382400513, + "num_tokens": 264187085.0, + "step": 6925 + }, + { + "epoch": 0.8810583895178731, + "ewc_loss": 0.007199409417808056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.199409446911886e-05, + "grad_norm": 3.590177059173584, + "learning_rate": 1e-06, + "loss": 0.4002, + 
"mean_token_accuracy": 0.8637725710868835, + "num_tokens": 264225558.0, + "step": 6926 + }, + { + "epoch": 0.8811855997964636, + "ewc_loss": 0.007219563703984022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.219563849503174e-05, + "grad_norm": 3.6111817359924316, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8589865565299988, + "num_tokens": 264264220.0, + "step": 6927 + }, + { + "epoch": 0.881312810075054, + "ewc_loss": 0.007233445066958666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.233444921439514e-05, + "grad_norm": 3.6073224544525146, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8712782859802246, + "num_tokens": 264302097.0, + "step": 6928 + }, + { + "epoch": 0.8814400203536445, + "ewc_loss": 0.007242016494274139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.242016727104783e-05, + "grad_norm": 3.606152057647705, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8700413703918457, + "num_tokens": 264341966.0, + "step": 6929 + }, + { + "epoch": 0.8815672306322351, + "ewc_loss": 0.007228050380945206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.228050526464358e-05, + "grad_norm": 3.6188173294067383, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8443335294723511, + "num_tokens": 264384298.0, + "step": 6930 + }, + { + "epoch": 0.8816944409108256, + "ewc_loss": 0.007239894941449165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.239895057864487e-05, + "grad_norm": 3.561631917953491, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8685762882232666, + "num_tokens": 264422513.0, + "step": 6931 + }, + { + "epoch": 0.8818216511894161, + "ewc_loss": 0.007198088336735964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.19808813300915e-05, + "grad_norm": 3.647014856338501, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8789746761322021, + "num_tokens": 264461819.0, + "step": 6932 + }, + { + "epoch": 
0.8819488614680067, + "ewc_loss": 0.007282079663127661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.28207960492e-05, + "grad_norm": 3.5799834728240967, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8566647171974182, + "num_tokens": 264502618.0, + "step": 6933 + }, + { + "epoch": 0.8820760717465971, + "ewc_loss": 0.007197062950581312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.197062950581312e-05, + "grad_norm": 3.656343936920166, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8597433567047119, + "num_tokens": 264534382.0, + "step": 6934 + }, + { + "epoch": 0.8822032820251876, + "ewc_loss": 0.007274797186255455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.274797098943964e-05, + "grad_norm": 3.579012870788574, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8579694628715515, + "num_tokens": 264575297.0, + "step": 6935 + }, + { + "epoch": 0.8823304923037781, + "ewc_loss": 0.007209145929664373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.209146133391187e-05, + "grad_norm": 3.5625433921813965, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8736648559570312, + "num_tokens": 264611822.0, + "step": 6936 + }, + { + "epoch": 0.8824577025823687, + "ewc_loss": 0.00722919125109911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.229191396618262e-05, + "grad_norm": 3.566279888153076, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8619949817657471, + "num_tokens": 264657483.0, + "step": 6937 + }, + { + "epoch": 0.8825849128609592, + "ewc_loss": 0.007229779846966267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.22978002158925e-05, + "grad_norm": 3.64495587348938, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8725475072860718, + "num_tokens": 264690633.0, + "step": 6938 + }, + { + "epoch": 0.8827121231395497, + "ewc_loss": 0.007284160703420639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.284160528797656e-05, + 
"grad_norm": 3.537243366241455, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8825933337211609, + "num_tokens": 264733268.0, + "step": 6939 + }, + { + "epoch": 0.8828393334181401, + "ewc_loss": 0.007177861873060465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.177861698437482e-05, + "grad_norm": 3.5911638736724854, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8604739308357239, + "num_tokens": 264774769.0, + "step": 6940 + }, + { + "epoch": 0.8829665436967307, + "ewc_loss": 0.007243224419653416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.243224536068738e-05, + "grad_norm": 3.6866068840026855, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8608991503715515, + "num_tokens": 264806658.0, + "step": 6941 + }, + { + "epoch": 0.8830937539753212, + "ewc_loss": 0.007269057910889387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.269057823577896e-05, + "grad_norm": 3.644595146179199, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8551681637763977, + "num_tokens": 264841054.0, + "step": 6942 + }, + { + "epoch": 0.8832209642539117, + "ewc_loss": 0.007207862567156553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.207862654468045e-05, + "grad_norm": 3.5961313247680664, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8615318536758423, + "num_tokens": 264880241.0, + "step": 6943 + }, + { + "epoch": 0.8833481745325023, + "ewc_loss": 0.0072126202285289764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.212620403151959e-05, + "grad_norm": 3.590769052505493, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8690336346626282, + "num_tokens": 264922449.0, + "step": 6944 + }, + { + "epoch": 0.8834753848110928, + "ewc_loss": 0.0072195203974843025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.21952019375749e-05, + "grad_norm": 3.6185100078582764, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 
0.8665248155593872, + "num_tokens": 264961605.0, + "step": 6945 + }, + { + "epoch": 0.8836025950896832, + "ewc_loss": 0.007213647942990065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.213647768367082e-05, + "grad_norm": 3.6948442459106445, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8748593330383301, + "num_tokens": 264992924.0, + "step": 6946 + }, + { + "epoch": 0.8837298053682737, + "ewc_loss": 0.007264613639563322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.264613668667153e-05, + "grad_norm": 3.6298348903656006, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8645515441894531, + "num_tokens": 265031849.0, + "step": 6947 + }, + { + "epoch": 0.8838570156468643, + "ewc_loss": 0.007212040014564991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.212039781734347e-05, + "grad_norm": 3.604361057281494, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8789852857589722, + "num_tokens": 265069714.0, + "step": 6948 + }, + { + "epoch": 0.8839842259254548, + "ewc_loss": 0.007224102038890123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.22410186426714e-05, + "grad_norm": 3.661616325378418, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8632769584655762, + "num_tokens": 265103325.0, + "step": 6949 + }, + { + "epoch": 0.8841114362040453, + "ewc_loss": 0.0072558061219751835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.255806121975183e-05, + "grad_norm": 3.6377182006835938, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.845991849899292, + "num_tokens": 265139133.0, + "step": 6950 + }, + { + "epoch": 0.8842386464826358, + "ewc_loss": 0.007227757945656776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.227758032968268e-05, + "grad_norm": 3.574846029281616, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8436071872711182, + "num_tokens": 265183536.0, + "step": 6951 + }, + { + "epoch": 0.8843658567612263, + "ewc_loss": 
0.007212595082819462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.21259493730031e-05, + "grad_norm": 3.5904297828674316, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8581905364990234, + "num_tokens": 265226288.0, + "step": 6952 + }, + { + "epoch": 0.8844930670398168, + "ewc_loss": 0.007241479121148586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.241479033837095e-05, + "grad_norm": 3.6140239238739014, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.86759352684021, + "num_tokens": 265265846.0, + "step": 6953 + }, + { + "epoch": 0.8846202773184073, + "ewc_loss": 0.00724067585542798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.240675768116489e-05, + "grad_norm": 3.587089776992798, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8804030418395996, + "num_tokens": 265300612.0, + "step": 6954 + }, + { + "epoch": 0.8847474875969978, + "ewc_loss": 0.007228444796055555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.228444883367047e-05, + "grad_norm": 3.568894863128662, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8754753470420837, + "num_tokens": 265341440.0, + "step": 6955 + }, + { + "epoch": 0.8848746978755884, + "ewc_loss": 0.0071967588737607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.196758815553039e-05, + "grad_norm": 3.561375379562378, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8575787544250488, + "num_tokens": 265384854.0, + "step": 6956 + }, + { + "epoch": 0.8850019081541789, + "ewc_loss": 0.007205009926110506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.205009751487523e-05, + "grad_norm": 3.6245293617248535, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8600929975509644, + "num_tokens": 265420611.0, + "step": 6957 + }, + { + "epoch": 0.8851291184327693, + "ewc_loss": 0.00723826140165329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.238261605380103e-05, + "grad_norm": 3.6059820652008057, 
+ "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8662430047988892, + "num_tokens": 265457720.0, + "step": 6958 + }, + { + "epoch": 0.8852563287113598, + "ewc_loss": 0.007213330361992121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.213330536615103e-05, + "grad_norm": 3.6073896884918213, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8740158677101135, + "num_tokens": 265493434.0, + "step": 6959 + }, + { + "epoch": 0.8853835389899504, + "ewc_loss": 0.007226038724184036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.226038724184036e-05, + "grad_norm": 3.622602939605713, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8680593371391296, + "num_tokens": 265529015.0, + "step": 6960 + }, + { + "epoch": 0.8855107492685409, + "ewc_loss": 0.007224719505757093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.224719593068585e-05, + "grad_norm": 3.6146156787872314, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8516151905059814, + "num_tokens": 265568698.0, + "step": 6961 + }, + { + "epoch": 0.8856379595471314, + "ewc_loss": 0.007202974520623684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.202974666142836e-05, + "grad_norm": 3.6222615242004395, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8675522208213806, + "num_tokens": 265605631.0, + "step": 6962 + }, + { + "epoch": 0.885765169825722, + "ewc_loss": 0.007209788542240858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.209788600448519e-05, + "grad_norm": 3.5651776790618896, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8685429096221924, + "num_tokens": 265647071.0, + "step": 6963 + }, + { + "epoch": 0.8858923801043125, + "ewc_loss": 0.007178124971687794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.178125088103116e-05, + "grad_norm": 3.6188762187957764, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8661720752716064, + "num_tokens": 265688680.0, + 
"step": 6964 + }, + { + "epoch": 0.8860195903829029, + "ewc_loss": 0.007216664496809244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.216664380393922e-05, + "grad_norm": 3.5719165802001953, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8720505237579346, + "num_tokens": 265727707.0, + "step": 6965 + }, + { + "epoch": 0.8861468006614934, + "ewc_loss": 0.007159193977713585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.159193773986772e-05, + "grad_norm": 3.605905055999756, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8644157648086548, + "num_tokens": 265765860.0, + "step": 6966 + }, + { + "epoch": 0.886274010940084, + "ewc_loss": 0.007201736327260733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.201736298156902e-05, + "grad_norm": 3.6186928749084473, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8701149225234985, + "num_tokens": 265802988.0, + "step": 6967 + }, + { + "epoch": 0.8864012212186745, + "ewc_loss": 0.007197834085673094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.197834202088416e-05, + "grad_norm": 3.6157822608947754, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8626421689987183, + "num_tokens": 265840614.0, + "step": 6968 + }, + { + "epoch": 0.886528431497265, + "ewc_loss": 0.0072000715881586075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.200071559054777e-05, + "grad_norm": 3.6145479679107666, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8413746356964111, + "num_tokens": 265883032.0, + "step": 6969 + }, + { + "epoch": 0.8866556417758555, + "ewc_loss": 0.007201479282230139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.201479456853122e-05, + "grad_norm": 3.6332240104675293, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8776998519897461, + "num_tokens": 265918309.0, + "step": 6970 + }, + { + "epoch": 0.886782852054446, + "ewc_loss": 0.007204707711935043, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.204707799246535e-05, + "grad_norm": 3.5950570106506348, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8692864775657654, + "num_tokens": 265956406.0, + "step": 6971 + }, + { + "epoch": 0.8869100623330365, + "ewc_loss": 0.007179992739111185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.179992826422676e-05, + "grad_norm": 3.705549716949463, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8757854700088501, + "num_tokens": 265988413.0, + "step": 6972 + }, + { + "epoch": 0.887037272611627, + "ewc_loss": 0.007260376121848822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.260376150952652e-05, + "grad_norm": 3.6330394744873047, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8663774132728577, + "num_tokens": 266022036.0, + "step": 6973 + }, + { + "epoch": 0.8871644828902175, + "ewc_loss": 0.007190906908363104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.190906762843952e-05, + "grad_norm": 3.684840440750122, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8655816316604614, + "num_tokens": 266053735.0, + "step": 6974 + }, + { + "epoch": 0.8872916931688081, + "ewc_loss": 0.007234117016196251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.234117219923064e-05, + "grad_norm": 3.5466058254241943, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8815884590148926, + "num_tokens": 266094198.0, + "step": 6975 + }, + { + "epoch": 0.8874189034473986, + "ewc_loss": 0.007144430186599493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.144430128391832e-05, + "grad_norm": 3.614537000656128, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8746604323387146, + "num_tokens": 266132213.0, + "step": 6976 + }, + { + "epoch": 0.887546113725989, + "ewc_loss": 0.007229187525808811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.229187758639455e-05, + "grad_norm": 3.6187610626220703, + "learning_rate": 1e-06, + "loss": 0.3846, + 
"mean_token_accuracy": 0.8668299913406372, + "num_tokens": 266168001.0, + "step": 6977 + }, + { + "epoch": 0.8876733240045795, + "ewc_loss": 0.007206683978438377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.206683949334547e-05, + "grad_norm": 3.6621978282928467, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8392906188964844, + "num_tokens": 266208771.0, + "step": 6978 + }, + { + "epoch": 0.8878005342831701, + "ewc_loss": 0.007236642763018608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.236642704810947e-05, + "grad_norm": 3.5697944164276123, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8741921782493591, + "num_tokens": 266248090.0, + "step": 6979 + }, + { + "epoch": 0.8879277445617606, + "ewc_loss": 0.007175620179623365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.175619975896552e-05, + "grad_norm": 3.6983323097229004, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8571018576622009, + "num_tokens": 266283748.0, + "step": 6980 + }, + { + "epoch": 0.8880549548403511, + "ewc_loss": 0.007275850046426058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.275849930010736e-05, + "grad_norm": 3.6757256984710693, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8737988471984863, + "num_tokens": 266316613.0, + "step": 6981 + }, + { + "epoch": 0.8881821651189417, + "ewc_loss": 0.007225062232464552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.225062290672213e-05, + "grad_norm": 3.5481324195861816, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8700088858604431, + "num_tokens": 266357515.0, + "step": 6982 + }, + { + "epoch": 0.8883093753975321, + "ewc_loss": 0.007173954509198666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.173954509198666e-05, + "grad_norm": 3.607957601547241, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8789932727813721, + "num_tokens": 266393593.0, + "step": 6983 + }, + { + "epoch": 
0.8884365856761226, + "ewc_loss": 0.007252569776028395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.252569776028395e-05, + "grad_norm": 3.5929062366485596, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8771858215332031, + "num_tokens": 266436104.0, + "step": 6984 + }, + { + "epoch": 0.8885637959547131, + "ewc_loss": 0.007207395508885384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.207395537989214e-05, + "grad_norm": 3.6078267097473145, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8784738779067993, + "num_tokens": 266472809.0, + "step": 6985 + }, + { + "epoch": 0.8886910062333037, + "ewc_loss": 0.007226916961371899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.226916932268068e-05, + "grad_norm": 3.5704383850097656, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.876278281211853, + "num_tokens": 266512726.0, + "step": 6986 + }, + { + "epoch": 0.8888182165118942, + "ewc_loss": 0.007208907976746559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.208908209577203e-05, + "grad_norm": 3.6604859828948975, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8547255396842957, + "num_tokens": 266546098.0, + "step": 6987 + }, + { + "epoch": 0.8889454267904847, + "ewc_loss": 0.007266341708600521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.266341708600521e-05, + "grad_norm": 3.5864696502685547, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8587350249290466, + "num_tokens": 266588478.0, + "step": 6988 + }, + { + "epoch": 0.8890726370690751, + "ewc_loss": 0.007212077733129263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.212077616713941e-05, + "grad_norm": 3.6035220623016357, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8578607439994812, + "num_tokens": 266628975.0, + "step": 6989 + }, + { + "epoch": 0.8891998473476657, + "ewc_loss": 0.007245881482958794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.245881715789437e-05, + "grad_norm": 3.574859380722046, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8754771947860718, + "num_tokens": 266667533.0, + "step": 6990 + }, + { + "epoch": 0.8893270576262562, + "ewc_loss": 0.007214791607111692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.214791548904032e-05, + "grad_norm": 3.6646838188171387, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8480398654937744, + "num_tokens": 266706990.0, + "step": 6991 + }, + { + "epoch": 0.8894542679048467, + "ewc_loss": 0.00727335549890995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.273355731740594e-05, + "grad_norm": 3.590257406234741, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8558489084243774, + "num_tokens": 266747314.0, + "step": 6992 + }, + { + "epoch": 0.8895814781834372, + "ewc_loss": 0.007204155437648296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.204155554063618e-05, + "grad_norm": 3.580092668533325, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8572882413864136, + "num_tokens": 266791031.0, + "step": 6993 + }, + { + "epoch": 0.8897086884620278, + "ewc_loss": 0.007207963615655899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.207963790278882e-05, + "grad_norm": 3.676222562789917, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8670064210891724, + "num_tokens": 266823236.0, + "step": 6994 + }, + { + "epoch": 0.8898358987406182, + "ewc_loss": 0.007269965019077063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.269965135492384e-05, + "grad_norm": 3.552877426147461, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8659440279006958, + "num_tokens": 266867993.0, + "step": 6995 + }, + { + "epoch": 0.8899631090192087, + "ewc_loss": 0.007162404712289572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.162404654081911e-05, + "grad_norm": 3.592745780944824, + "learning_rate": 1e-06, + "loss": 0.4491, + 
"mean_token_accuracy": 0.8490738868713379, + "num_tokens": 266908573.0, + "step": 6996 + }, + { + "epoch": 0.8900903192977992, + "ewc_loss": 0.007241198793053627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.241198909468949e-05, + "grad_norm": 3.582319736480713, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8646519780158997, + "num_tokens": 266951167.0, + "step": 6997 + }, + { + "epoch": 0.8902175295763898, + "ewc_loss": 0.007201316300779581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.201316475402564e-05, + "grad_norm": 3.6196372509002686, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8667092323303223, + "num_tokens": 266988976.0, + "step": 6998 + }, + { + "epoch": 0.8903447398549803, + "ewc_loss": 0.0072416276670992374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.241627463372424e-05, + "grad_norm": 3.677441120147705, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8672912120819092, + "num_tokens": 267022450.0, + "step": 6999 + }, + { + "epoch": 0.8904719501335708, + "ewc_loss": 0.007252637296915054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.252637442434207e-05, + "grad_norm": 3.640532970428467, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8599085807800293, + "num_tokens": 267059933.0, + "step": 7000 + }, + { + "epoch": 0.8905991604121613, + "ewc_loss": 0.0072184959426522255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.218495738925412e-05, + "grad_norm": 3.651130437850952, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8576602339744568, + "num_tokens": 267092518.0, + "step": 7001 + }, + { + "epoch": 0.8907263706907518, + "ewc_loss": 0.007238314487040043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.238314719870687e-05, + "grad_norm": 3.6303486824035645, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8618078231811523, + "num_tokens": 267130008.0, + "step": 7002 + }, + { + "epoch": 
0.8908535809693423, + "ewc_loss": 0.0072263143956661224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.226314482977614e-05, + "grad_norm": 3.55963397026062, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8691084980964661, + "num_tokens": 267171039.0, + "step": 7003 + }, + { + "epoch": 0.8909807912479328, + "ewc_loss": 0.007185890804976225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.185890717664734e-05, + "grad_norm": 3.5988287925720215, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8704267740249634, + "num_tokens": 267211197.0, + "step": 7004 + }, + { + "epoch": 0.8911080015265234, + "ewc_loss": 0.007232715841382742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.232715870486572e-05, + "grad_norm": 3.864971399307251, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8717061877250671, + "num_tokens": 267247471.0, + "step": 7005 + }, + { + "epoch": 0.8912352118051139, + "ewc_loss": 0.0073752556927502155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.375255518127233e-05, + "grad_norm": 3.627937078475952, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8415704965591431, + "num_tokens": 267287276.0, + "step": 7006 + }, + { + "epoch": 0.8913624220837043, + "ewc_loss": 0.007143653929233551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.143653783714399e-05, + "grad_norm": 3.6035118103027344, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8637409806251526, + "num_tokens": 267323827.0, + "step": 7007 + }, + { + "epoch": 0.8914896323622948, + "ewc_loss": 0.007237330079078674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.237330282805488e-05, + "grad_norm": 3.5545175075531006, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8752298355102539, + "num_tokens": 267364053.0, + "step": 7008 + }, + { + "epoch": 0.8916168426408854, + "ewc_loss": 0.007218083366751671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.218083192128688e-05, + "grad_norm": 3.6062376499176025, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8502165079116821, + "num_tokens": 267404393.0, + "step": 7009 + }, + { + "epoch": 0.8917440529194759, + "ewc_loss": 0.007249018177390099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.249018381116912e-05, + "grad_norm": 3.6281633377075195, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8672006130218506, + "num_tokens": 267444589.0, + "step": 7010 + }, + { + "epoch": 0.8918712631980664, + "ewc_loss": 0.007248478475958109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.24847850506194e-05, + "grad_norm": 3.5827128887176514, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8666279911994934, + "num_tokens": 267484460.0, + "step": 7011 + }, + { + "epoch": 0.891998473476657, + "ewc_loss": 0.007214666344225407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.214666402433068e-05, + "grad_norm": 3.6919538974761963, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8593317866325378, + "num_tokens": 267517390.0, + "step": 7012 + }, + { + "epoch": 0.8921256837552475, + "ewc_loss": 0.0073031894862651825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.30318934074603e-05, + "grad_norm": 3.6206960678100586, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8673080205917358, + "num_tokens": 267555813.0, + "step": 7013 + }, + { + "epoch": 0.8922528940338379, + "ewc_loss": 0.007223781198263168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.223780994536355e-05, + "grad_norm": 3.6174569129943848, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8647348880767822, + "num_tokens": 267595818.0, + "step": 7014 + }, + { + "epoch": 0.8923801043124284, + "ewc_loss": 0.0072605493478477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.26054931874387e-05, + "grad_norm": 3.663963794708252, + "learning_rate": 1e-06, + "loss": 0.4742, + 
"mean_token_accuracy": 0.8416553735733032, + "num_tokens": 267633106.0, + "step": 7015 + }, + { + "epoch": 0.892507314591019, + "ewc_loss": 0.007299564313143492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.299564458662644e-05, + "grad_norm": 3.640730142593384, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8682172298431396, + "num_tokens": 267670166.0, + "step": 7016 + }, + { + "epoch": 0.8926345248696095, + "ewc_loss": 0.007245159707963467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.24515994079411e-05, + "grad_norm": 3.635528564453125, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8635174036026001, + "num_tokens": 267707284.0, + "step": 7017 + }, + { + "epoch": 0.8927617351482, + "ewc_loss": 0.0072551011107862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.255101081682369e-05, + "grad_norm": 3.6303162574768066, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8654588460922241, + "num_tokens": 267742349.0, + "step": 7018 + }, + { + "epoch": 0.8928889454267905, + "ewc_loss": 0.007272374350577593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.27237420505844e-05, + "grad_norm": 3.641355276107788, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8707132339477539, + "num_tokens": 267776003.0, + "step": 7019 + }, + { + "epoch": 0.893016155705381, + "ewc_loss": 0.007273833733052015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.273833762155846e-05, + "grad_norm": 3.5688226222991943, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8627010583877563, + "num_tokens": 267815887.0, + "step": 7020 + }, + { + "epoch": 0.8931433659839715, + "ewc_loss": 0.007244679611176252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.244679727591574e-05, + "grad_norm": 3.612534523010254, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8772936463356018, + "num_tokens": 267855874.0, + "step": 7021 + }, + { + "epoch": 0.893270576262562, + 
"ewc_loss": 0.007296736817806959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.296737021533772e-05, + "grad_norm": 3.5928027629852295, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8759621381759644, + "num_tokens": 267897901.0, + "step": 7022 + }, + { + "epoch": 0.8933977865411525, + "ewc_loss": 0.007263752166181803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.263752195285633e-05, + "grad_norm": 3.5715036392211914, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8773154020309448, + "num_tokens": 267938967.0, + "step": 7023 + }, + { + "epoch": 0.8935249968197431, + "ewc_loss": 0.007253675721585751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.25367572158575e-05, + "grad_norm": 3.614856243133545, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8634951114654541, + "num_tokens": 267975196.0, + "step": 7024 + }, + { + "epoch": 0.8936522070983336, + "ewc_loss": 0.007262231316417456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.262231520144269e-05, + "grad_norm": 3.5644350051879883, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8795392513275146, + "num_tokens": 268014407.0, + "step": 7025 + }, + { + "epoch": 0.893779417376924, + "ewc_loss": 0.0072234878316521645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.223487773444504e-05, + "grad_norm": 3.723815679550171, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8815628290176392, + "num_tokens": 268043892.0, + "step": 7026 + }, + { + "epoch": 0.8939066276555145, + "ewc_loss": 0.007339460775256157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.339460717048496e-05, + "grad_norm": 3.6373565196990967, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8662319183349609, + "num_tokens": 268079651.0, + "step": 7027 + }, + { + "epoch": 0.8940338379341051, + "ewc_loss": 0.00722452811896801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.224528235383332e-05, + "grad_norm": 
3.629171133041382, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8699143528938293, + "num_tokens": 268114021.0, + "step": 7028 + }, + { + "epoch": 0.8941610482126956, + "ewc_loss": 0.007251828908920288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.251829083543271e-05, + "grad_norm": 3.5775113105773926, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8646397590637207, + "num_tokens": 268155315.0, + "step": 7029 + }, + { + "epoch": 0.8942882584912861, + "ewc_loss": 0.0072341603226959705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.234160148072988e-05, + "grad_norm": 3.5723206996917725, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8637162446975708, + "num_tokens": 268199312.0, + "step": 7030 + }, + { + "epoch": 0.8944154687698767, + "ewc_loss": 0.007214803248643875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.214803190436214e-05, + "grad_norm": 3.587934732437134, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8727482557296753, + "num_tokens": 268236955.0, + "step": 7031 + }, + { + "epoch": 0.8945426790484671, + "ewc_loss": 0.007232894189655781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.23289413144812e-05, + "grad_norm": 3.6381590366363525, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8638225793838501, + "num_tokens": 268272183.0, + "step": 7032 + }, + { + "epoch": 0.8946698893270576, + "ewc_loss": 0.007255394943058491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.255395030369982e-05, + "grad_norm": 3.696425199508667, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8567463159561157, + "num_tokens": 268305407.0, + "step": 7033 + }, + { + "epoch": 0.8947970996056481, + "ewc_loss": 0.007271701004356146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.271701178979129e-05, + "grad_norm": 3.6473586559295654, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.878604531288147, + 
"num_tokens": 268340443.0, + "step": 7034 + }, + { + "epoch": 0.8949243098842387, + "ewc_loss": 0.007250042166560888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.250042108353227e-05, + "grad_norm": 3.6169257164001465, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8842117190361023, + "num_tokens": 268375597.0, + "step": 7035 + }, + { + "epoch": 0.8950515201628292, + "ewc_loss": 0.007253246381878853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.253246440086514e-05, + "grad_norm": 3.6327013969421387, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8611390590667725, + "num_tokens": 268412872.0, + "step": 7036 + }, + { + "epoch": 0.8951787304414197, + "ewc_loss": 0.00727966520935297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.279665442183614e-05, + "grad_norm": 3.61667537689209, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8682003021240234, + "num_tokens": 268452021.0, + "step": 7037 + }, + { + "epoch": 0.8953059407200101, + "ewc_loss": 0.007257265504449606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.257265679072589e-05, + "grad_norm": 3.6392011642456055, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8756133317947388, + "num_tokens": 268486354.0, + "step": 7038 + }, + { + "epoch": 0.8954331509986007, + "ewc_loss": 0.007268914487212896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.268914487212896e-05, + "grad_norm": 3.6131229400634766, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.861460268497467, + "num_tokens": 268519918.0, + "step": 7039 + }, + { + "epoch": 0.8955603612771912, + "ewc_loss": 0.007257244549691677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.257244578795508e-05, + "grad_norm": 3.6367549896240234, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8584980368614197, + "num_tokens": 268554259.0, + "step": 7040 + }, + { + "epoch": 0.8956875715557817, + "ewc_loss": 0.007282489910721779, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.28248996892944e-05, + "grad_norm": 3.587486505508423, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.860807478427887, + "num_tokens": 268595785.0, + "step": 7041 + }, + { + "epoch": 0.8958147818343722, + "ewc_loss": 0.007255229167640209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.255229138536379e-05, + "grad_norm": 3.5590221881866455, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8508126735687256, + "num_tokens": 268640805.0, + "step": 7042 + }, + { + "epoch": 0.8959419921129628, + "ewc_loss": 0.007249982561916113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.249982445500791e-05, + "grad_norm": 3.5637145042419434, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8715365529060364, + "num_tokens": 268682446.0, + "step": 7043 + }, + { + "epoch": 0.8960692023915532, + "ewc_loss": 0.007263578940182924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.263579027494416e-05, + "grad_norm": 3.6121346950531006, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8750051259994507, + "num_tokens": 268720262.0, + "step": 7044 + }, + { + "epoch": 0.8961964126701437, + "ewc_loss": 0.007263575214892626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.263575389515609e-05, + "grad_norm": 3.593355894088745, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8534445762634277, + "num_tokens": 268760519.0, + "step": 7045 + }, + { + "epoch": 0.8963236229487342, + "ewc_loss": 0.007242227904498577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.242227729875594e-05, + "grad_norm": 3.629924774169922, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8610242009162903, + "num_tokens": 268796621.0, + "step": 7046 + }, + { + "epoch": 0.8964508332273248, + "ewc_loss": 0.00727069191634655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.270692003658041e-05, + "grad_norm": 3.635361671447754, + "learning_rate": 
1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8749557733535767, + "num_tokens": 268830527.0, + "step": 7047 + }, + { + "epoch": 0.8965780435059153, + "ewc_loss": 0.007253091316670179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.253091462189332e-05, + "grad_norm": 3.639847993850708, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8703174591064453, + "num_tokens": 268862112.0, + "step": 7048 + }, + { + "epoch": 0.8967052537845058, + "ewc_loss": 0.007251463830471039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.251463830471039e-05, + "grad_norm": 3.577630043029785, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8645241260528564, + "num_tokens": 268901089.0, + "step": 7049 + }, + { + "epoch": 0.8968324640630962, + "ewc_loss": 0.007224973291158676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.22497352398932e-05, + "grad_norm": 3.576765775680542, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8685427904129028, + "num_tokens": 268942766.0, + "step": 7050 + }, + { + "epoch": 0.8969596743416868, + "ewc_loss": 0.007235802244395018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.235802331706509e-05, + "grad_norm": 3.652557373046875, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8723618984222412, + "num_tokens": 268978642.0, + "step": 7051 + }, + { + "epoch": 0.8970868846202773, + "ewc_loss": 0.007282060571014881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.282060687430203e-05, + "grad_norm": 3.605058431625366, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8732602596282959, + "num_tokens": 269016048.0, + "step": 7052 + }, + { + "epoch": 0.8972140948988678, + "ewc_loss": 0.007219563238322735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.219563121907413e-05, + "grad_norm": 3.6363308429718018, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8682313561439514, + "num_tokens": 269054381.0, + "step": 7053 + }, + { 
+ "epoch": 0.8973413051774584, + "ewc_loss": 0.00725913280621171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.259132689796388e-05, + "grad_norm": 3.615093946456909, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8714457154273987, + "num_tokens": 269090672.0, + "step": 7054 + }, + { + "epoch": 0.8974685154560489, + "ewc_loss": 0.00722119165584445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.221191481221467e-05, + "grad_norm": 3.577881336212158, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.87021803855896, + "num_tokens": 269130231.0, + "step": 7055 + }, + { + "epoch": 0.8975957257346393, + "ewc_loss": 0.007226869463920593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.226869638543576e-05, + "grad_norm": 3.6006858348846436, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8714795708656311, + "num_tokens": 269166282.0, + "step": 7056 + }, + { + "epoch": 0.8977229360132298, + "ewc_loss": 0.0072544775903224945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.254477532114834e-05, + "grad_norm": 3.6487205028533936, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8705147504806519, + "num_tokens": 269205326.0, + "step": 7057 + }, + { + "epoch": 0.8978501462918204, + "ewc_loss": 0.0072571346536278725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.257134711835533e-05, + "grad_norm": 3.615689992904663, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8680200576782227, + "num_tokens": 269243903.0, + "step": 7058 + }, + { + "epoch": 0.8979773565704109, + "ewc_loss": 0.007216625846922398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.216625817818567e-05, + "grad_norm": 3.575263023376465, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8607988357543945, + "num_tokens": 269288330.0, + "step": 7059 + }, + { + "epoch": 0.8981045668490014, + "ewc_loss": 0.007214597892016172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.214598008431494e-05, + "grad_norm": 3.591261863708496, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8581578731536865, + "num_tokens": 269328731.0, + "step": 7060 + }, + { + "epoch": 0.898231777127592, + "ewc_loss": 0.00725047942250967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.25047939340584e-05, + "grad_norm": 3.5976366996765137, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8681560158729553, + "num_tokens": 269369576.0, + "step": 7061 + }, + { + "epoch": 0.8983589874061825, + "ewc_loss": 0.007235292345285416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.235292287077755e-05, + "grad_norm": 3.637929916381836, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8652179837226868, + "num_tokens": 269407734.0, + "step": 7062 + }, + { + "epoch": 0.8984861976847729, + "ewc_loss": 0.007261938415467739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.261938299052417e-05, + "grad_norm": 3.6667959690093994, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8477708101272583, + "num_tokens": 269444042.0, + "step": 7063 + }, + { + "epoch": 0.8986134079633634, + "ewc_loss": 0.007251677569001913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.251677743624896e-05, + "grad_norm": 3.576504945755005, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8899643421173096, + "num_tokens": 269480979.0, + "step": 7064 + }, + { + "epoch": 0.898740618241954, + "ewc_loss": 0.00720652611926198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.206526061054319e-05, + "grad_norm": 3.6299984455108643, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8699827790260315, + "num_tokens": 269515594.0, + "step": 7065 + }, + { + "epoch": 0.8988678285205445, + "ewc_loss": 0.0072693233378231525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.269323396030813e-05, + "grad_norm": 3.582279682159424, + "learning_rate": 1e-06, + "loss": 0.382, + 
"mean_token_accuracy": 0.8698318004608154, + "num_tokens": 269556974.0, + "step": 7066 + }, + { + "epoch": 0.898995038799135, + "ewc_loss": 0.007202161941677332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.202161941677332e-05, + "grad_norm": 3.647031784057617, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8612484931945801, + "num_tokens": 269591088.0, + "step": 7067 + }, + { + "epoch": 0.8991222490777255, + "ewc_loss": 0.007261520717293024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.261520659085363e-05, + "grad_norm": 3.622347116470337, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8756376504898071, + "num_tokens": 269627921.0, + "step": 7068 + }, + { + "epoch": 0.899249459356316, + "ewc_loss": 0.007238014135509729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.238014222821221e-05, + "grad_norm": 3.625168800354004, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.851726770401001, + "num_tokens": 269666586.0, + "step": 7069 + }, + { + "epoch": 0.8993766696349065, + "ewc_loss": 0.0072326818481087685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.232681673485786e-05, + "grad_norm": 3.6611759662628174, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8698220252990723, + "num_tokens": 269701205.0, + "step": 7070 + }, + { + "epoch": 0.899503879913497, + "ewc_loss": 0.007261559832841158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.26155994925648e-05, + "grad_norm": 3.6251776218414307, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8798484206199646, + "num_tokens": 269739982.0, + "step": 7071 + }, + { + "epoch": 0.8996310901920875, + "ewc_loss": 0.007224914617836475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.224914588732645e-05, + "grad_norm": 3.635529041290283, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8681073784828186, + "num_tokens": 269773048.0, + "step": 7072 + }, + { + "epoch": 
0.8997583004706781, + "ewc_loss": 0.007243139203637838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.243139407364652e-05, + "grad_norm": 3.6321449279785156, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8652885556221008, + "num_tokens": 269811343.0, + "step": 7073 + }, + { + "epoch": 0.8998855107492686, + "ewc_loss": 0.007241204846650362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.24120473023504e-05, + "grad_norm": 3.555536985397339, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8703807592391968, + "num_tokens": 269851601.0, + "step": 7074 + }, + { + "epoch": 0.900012721027859, + "ewc_loss": 0.007204386871308088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.204386929515749e-05, + "grad_norm": 3.5819573402404785, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8750474452972412, + "num_tokens": 269891396.0, + "step": 7075 + }, + { + "epoch": 0.9001399313064495, + "ewc_loss": 0.00724837789312005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.248378096846864e-05, + "grad_norm": 3.6440494060516357, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8673490285873413, + "num_tokens": 269930292.0, + "step": 7076 + }, + { + "epoch": 0.9002671415850401, + "ewc_loss": 0.0072526875883340836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.252687646541744e-05, + "grad_norm": 3.6658663749694824, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8409274220466614, + "num_tokens": 269968464.0, + "step": 7077 + }, + { + "epoch": 0.9003943518636306, + "ewc_loss": 0.007262059021741152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.262059079948813e-05, + "grad_norm": 3.587553024291992, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8542838096618652, + "num_tokens": 270011569.0, + "step": 7078 + }, + { + "epoch": 0.9005215621422211, + "ewc_loss": 0.00719499820843339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.194998033810407e-05, + "grad_norm": 3.5696303844451904, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.86319500207901, + "num_tokens": 270050608.0, + "step": 7079 + }, + { + "epoch": 0.9006487724208116, + "ewc_loss": 0.007222323678433895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.222323620226234e-05, + "grad_norm": 3.540682077407837, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8527451157569885, + "num_tokens": 270096100.0, + "step": 7080 + }, + { + "epoch": 0.9007759826994021, + "ewc_loss": 0.00722619378939271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.226193702081218e-05, + "grad_norm": 3.574620008468628, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8782788515090942, + "num_tokens": 270140915.0, + "step": 7081 + }, + { + "epoch": 0.9009031929779926, + "ewc_loss": 0.007229117676615715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.229117909446359e-05, + "grad_norm": 3.5363962650299072, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8675016760826111, + "num_tokens": 270184233.0, + "step": 7082 + }, + { + "epoch": 0.9010304032565831, + "ewc_loss": 0.007202621549367905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.202621782198548e-05, + "grad_norm": 3.6208813190460205, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8663288354873657, + "num_tokens": 270223889.0, + "step": 7083 + }, + { + "epoch": 0.9011576135351737, + "ewc_loss": 0.007237719371914864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.237719546537846e-05, + "grad_norm": 3.5651638507843018, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.871458888053894, + "num_tokens": 270266357.0, + "step": 7084 + }, + { + "epoch": 0.9012848238137642, + "ewc_loss": 0.007173273712396622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.173273479565978e-05, + "grad_norm": 3.643317222595215, + "learning_rate": 1e-06, + "loss": 0.3698, + 
"mean_token_accuracy": 0.8747215270996094, + "num_tokens": 270298252.0, + "step": 7085 + }, + { + "epoch": 0.9014120340923547, + "ewc_loss": 0.007242283783853054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.242283754749224e-05, + "grad_norm": 3.6016554832458496, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.861181378364563, + "num_tokens": 270343187.0, + "step": 7086 + }, + { + "epoch": 0.9015392443709451, + "ewc_loss": 0.007171106059104204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.171105971792713e-05, + "grad_norm": 3.602001190185547, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8568457961082458, + "num_tokens": 270385094.0, + "step": 7087 + }, + { + "epoch": 0.9016664546495357, + "ewc_loss": 0.00719214603304863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.192145858425647e-05, + "grad_norm": 3.6566832065582275, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8548077344894409, + "num_tokens": 270421772.0, + "step": 7088 + }, + { + "epoch": 0.9017936649281262, + "ewc_loss": 0.0072151245549321175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.215124787762761e-05, + "grad_norm": 3.582036018371582, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8483781218528748, + "num_tokens": 270465033.0, + "step": 7089 + }, + { + "epoch": 0.9019208752067167, + "ewc_loss": 0.007150702178478241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.150702003855258e-05, + "grad_norm": 3.6157066822052, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8887708187103271, + "num_tokens": 270501370.0, + "step": 7090 + }, + { + "epoch": 0.9020480854853072, + "ewc_loss": 0.0072152274660766125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.215227378765121e-05, + "grad_norm": 3.609813690185547, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8717811107635498, + "num_tokens": 270543571.0, + "step": 7091 + }, + { + "epoch": 
0.9021752957638978, + "ewc_loss": 0.0071805501356720924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.180550164775923e-05, + "grad_norm": 3.6162095069885254, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8765706419944763, + "num_tokens": 270580635.0, + "step": 7092 + }, + { + "epoch": 0.9023025060424882, + "ewc_loss": 0.007181522436439991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.181522232713178e-05, + "grad_norm": 3.6472113132476807, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8801581859588623, + "num_tokens": 270616534.0, + "step": 7093 + }, + { + "epoch": 0.9024297163210787, + "ewc_loss": 0.007211745250970125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.211745105450973e-05, + "grad_norm": 3.6411917209625244, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8480659127235413, + "num_tokens": 270653998.0, + "step": 7094 + }, + { + "epoch": 0.9025569265996692, + "ewc_loss": 0.007188522256910801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.188522431533784e-05, + "grad_norm": 3.6704158782958984, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8689722418785095, + "num_tokens": 270688023.0, + "step": 7095 + }, + { + "epoch": 0.9026841368782598, + "ewc_loss": 0.007213436998426914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.21343676559627e-05, + "grad_norm": 3.6828501224517822, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8495556116104126, + "num_tokens": 270721298.0, + "step": 7096 + }, + { + "epoch": 0.9028113471568503, + "ewc_loss": 0.007203682791441679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.203682616818696e-05, + "grad_norm": 3.5918776988983154, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8734269142150879, + "num_tokens": 270762182.0, + "step": 7097 + }, + { + "epoch": 0.9029385574354408, + "ewc_loss": 0.007161763031035662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.16176291462034e-05, + "grad_norm": 3.601001739501953, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8819671869277954, + "num_tokens": 270799577.0, + "step": 7098 + }, + { + "epoch": 0.9030657677140312, + "ewc_loss": 0.007192901335656643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.192901102825999e-05, + "grad_norm": 3.569758653640747, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8676878809928894, + "num_tokens": 270840218.0, + "step": 7099 + }, + { + "epoch": 0.9031929779926218, + "ewc_loss": 0.007176727522164583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.176727376645431e-05, + "grad_norm": 3.62544322013855, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8887200951576233, + "num_tokens": 270872982.0, + "step": 7100 + }, + { + "epoch": 0.9033201882712123, + "ewc_loss": 0.007226444780826569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.226444722618908e-05, + "grad_norm": 3.6505320072174072, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8722971677780151, + "num_tokens": 270908381.0, + "step": 7101 + }, + { + "epoch": 0.9034473985498028, + "ewc_loss": 0.007207122165709734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.207121961982921e-05, + "grad_norm": 3.568148374557495, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8645116686820984, + "num_tokens": 270949217.0, + "step": 7102 + }, + { + "epoch": 0.9035746088283934, + "ewc_loss": 0.007170235738158226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.170235767262056e-05, + "grad_norm": 3.641940116882324, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8530102968215942, + "num_tokens": 270989667.0, + "step": 7103 + }, + { + "epoch": 0.9037018191069839, + "ewc_loss": 0.007235317025333643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.235317025333643e-05, + "grad_norm": 3.6245834827423096, + "learning_rate": 1e-06, + "loss": 0.4251, + 
"mean_token_accuracy": 0.8577336072921753, + "num_tokens": 271030395.0, + "step": 7104 + }, + { + "epoch": 0.9038290293855743, + "ewc_loss": 0.007195638492703438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.195638318080455e-05, + "grad_norm": 3.6554980278015137, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.850212574005127, + "num_tokens": 271068239.0, + "step": 7105 + }, + { + "epoch": 0.9039562396641648, + "ewc_loss": 0.007225698791444302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.225698936963454e-05, + "grad_norm": 3.63716721534729, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8600088953971863, + "num_tokens": 271110294.0, + "step": 7106 + }, + { + "epoch": 0.9040834499427554, + "ewc_loss": 0.007213403470814228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.213403296191245e-05, + "grad_norm": 3.633700132369995, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8841893672943115, + "num_tokens": 271143288.0, + "step": 7107 + }, + { + "epoch": 0.9042106602213459, + "ewc_loss": 0.007236005272716284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.236005330923945e-05, + "grad_norm": 3.5933163166046143, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8754001259803772, + "num_tokens": 271181183.0, + "step": 7108 + }, + { + "epoch": 0.9043378704999364, + "ewc_loss": 0.0071738664992153645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.173866470111534e-05, + "grad_norm": 3.6002233028411865, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8655204772949219, + "num_tokens": 271216201.0, + "step": 7109 + }, + { + "epoch": 0.9044650807785269, + "ewc_loss": 0.007224934175610542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.224934233818203e-05, + "grad_norm": 3.60268235206604, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8741568922996521, + "num_tokens": 271250158.0, + "step": 7110 + }, + { + "epoch": 
0.9045922910571175, + "ewc_loss": 0.0072224074974656105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.222407293738797e-05, + "grad_norm": 3.579615592956543, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8805713653564453, + "num_tokens": 271289994.0, + "step": 7111 + }, + { + "epoch": 0.9047195013357079, + "ewc_loss": 0.007199742831289768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.199742685770616e-05, + "grad_norm": 3.6527364253997803, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8731828927993774, + "num_tokens": 271325782.0, + "step": 7112 + }, + { + "epoch": 0.9048467116142984, + "ewc_loss": 0.0072552356868982315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.255235686898232e-05, + "grad_norm": 3.666633367538452, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8536710739135742, + "num_tokens": 271361604.0, + "step": 7113 + }, + { + "epoch": 0.9049739218928889, + "ewc_loss": 0.0072451019659638405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.245101733133197e-05, + "grad_norm": 3.6159908771514893, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8769281506538391, + "num_tokens": 271398520.0, + "step": 7114 + }, + { + "epoch": 0.9051011321714795, + "ewc_loss": 0.007202999200671911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.202999404398724e-05, + "grad_norm": 3.5506033897399902, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8618512153625488, + "num_tokens": 271445138.0, + "step": 7115 + }, + { + "epoch": 0.90522834245007, + "ewc_loss": 0.0071951900608837605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.195190119091421e-05, + "grad_norm": 3.664802312850952, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8724924325942993, + "num_tokens": 271479147.0, + "step": 7116 + }, + { + "epoch": 0.9053555527286605, + "ewc_loss": 0.007291045039892197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.291045039892197e-05, + "grad_norm": 3.6370387077331543, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8704484105110168, + "num_tokens": 271512477.0, + "step": 7117 + }, + { + "epoch": 0.905482763007251, + "ewc_loss": 0.0072283451445400715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.228345202747732e-05, + "grad_norm": 3.626115560531616, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8542271852493286, + "num_tokens": 271552355.0, + "step": 7118 + }, + { + "epoch": 0.9056099732858415, + "ewc_loss": 0.007233748212456703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.233748328872025e-05, + "grad_norm": 3.5819051265716553, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8646410703659058, + "num_tokens": 271595825.0, + "step": 7119 + }, + { + "epoch": 0.905737183564432, + "ewc_loss": 0.007220720872282982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.22072072676383e-05, + "grad_norm": 3.6924655437469482, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8554813861846924, + "num_tokens": 271627384.0, + "step": 7120 + }, + { + "epoch": 0.9058643938430225, + "ewc_loss": 0.007310848217457533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.31084801373072e-05, + "grad_norm": 3.5738418102264404, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8759661912918091, + "num_tokens": 271666711.0, + "step": 7121 + }, + { + "epoch": 0.9059916041216131, + "ewc_loss": 0.007205452304333448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.205452129710466e-05, + "grad_norm": 3.6148271560668945, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8726497888565063, + "num_tokens": 271703145.0, + "step": 7122 + }, + { + "epoch": 0.9061188144002036, + "ewc_loss": 0.007270927540957928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.270927744684741e-05, + "grad_norm": 3.6290128231048584, + "learning_rate": 1e-06, + "loss": 0.4308, + 
"mean_token_accuracy": 0.8563454151153564, + "num_tokens": 271743131.0, + "step": 7123 + }, + { + "epoch": 0.906246024678794, + "ewc_loss": 0.007258482277393341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.25848221918568e-05, + "grad_norm": 3.6802818775177, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8720176219940186, + "num_tokens": 271778245.0, + "step": 7124 + }, + { + "epoch": 0.9063732349573845, + "ewc_loss": 0.007286239415407181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.286239269888029e-05, + "grad_norm": 3.6047468185424805, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8588119149208069, + "num_tokens": 271817279.0, + "step": 7125 + }, + { + "epoch": 0.9065004452359751, + "ewc_loss": 0.007234969176352024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.234969234559685e-05, + "grad_norm": 3.6027069091796875, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8734910488128662, + "num_tokens": 271857136.0, + "step": 7126 + }, + { + "epoch": 0.9066276555145656, + "ewc_loss": 0.007255140226334333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.255140371853486e-05, + "grad_norm": 3.6460726261138916, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8758789300918579, + "num_tokens": 271891651.0, + "step": 7127 + }, + { + "epoch": 0.9067548657931561, + "ewc_loss": 0.007278795819729567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.278795965248719e-05, + "grad_norm": 3.646852970123291, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8724154233932495, + "num_tokens": 271925805.0, + "step": 7128 + }, + { + "epoch": 0.9068820760717466, + "ewc_loss": 0.007243664003908634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.243664003908634e-05, + "grad_norm": 3.622753858566284, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8729029893875122, + "num_tokens": 271964954.0, + "step": 7129 + }, + { + "epoch": 
0.9070092863503371, + "ewc_loss": 0.007232634350657463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.232634379761294e-05, + "grad_norm": 3.6378726959228516, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8661705851554871, + "num_tokens": 272000331.0, + "step": 7130 + }, + { + "epoch": 0.9071364966289276, + "ewc_loss": 0.007267474662512541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.26747457520105e-05, + "grad_norm": 3.6327192783355713, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.874205470085144, + "num_tokens": 272037853.0, + "step": 7131 + }, + { + "epoch": 0.9072637069075181, + "ewc_loss": 0.007243871223181486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.243871368700638e-05, + "grad_norm": 3.571777105331421, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8775633573532104, + "num_tokens": 272077663.0, + "step": 7132 + }, + { + "epoch": 0.9073909171861086, + "ewc_loss": 0.007221683394163847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.221683335956186e-05, + "grad_norm": 3.6060397624969482, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8625515103340149, + "num_tokens": 272116942.0, + "step": 7133 + }, + { + "epoch": 0.9075181274646992, + "ewc_loss": 0.0072563569992780685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.256356911966577e-05, + "grad_norm": 3.6051535606384277, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8618428111076355, + "num_tokens": 272151319.0, + "step": 7134 + }, + { + "epoch": 0.9076453377432897, + "ewc_loss": 0.007264282554388046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.264282612595707e-05, + "grad_norm": 3.6486940383911133, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.855900764465332, + "num_tokens": 272190817.0, + "step": 7135 + }, + { + "epoch": 0.9077725480218801, + "ewc_loss": 0.00726474029943347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.264740270329639e-05, + "grad_norm": 3.6059787273406982, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8622480630874634, + "num_tokens": 272230461.0, + "step": 7136 + }, + { + "epoch": 0.9078997583004706, + "ewc_loss": 0.007237604819238186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.237604586407542e-05, + "grad_norm": 3.665255069732666, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8611054420471191, + "num_tokens": 272268510.0, + "step": 7137 + }, + { + "epoch": 0.9080269685790612, + "ewc_loss": 0.007294453680515289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.294453826034442e-05, + "grad_norm": 3.573681592941284, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8684143424034119, + "num_tokens": 272312430.0, + "step": 7138 + }, + { + "epoch": 0.9081541788576517, + "ewc_loss": 0.007211833726614714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.211833872133866e-05, + "grad_norm": 3.627370595932007, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8674642443656921, + "num_tokens": 272348437.0, + "step": 7139 + }, + { + "epoch": 0.9082813891362422, + "ewc_loss": 0.007280142046511173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.280142017407343e-05, + "grad_norm": 3.7061548233032227, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8611643314361572, + "num_tokens": 272378902.0, + "step": 7140 + }, + { + "epoch": 0.9084085994148328, + "ewc_loss": 0.00729959225282073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.299592107301578e-05, + "grad_norm": 3.571164608001709, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8804507851600647, + "num_tokens": 272421257.0, + "step": 7141 + }, + { + "epoch": 0.9085358096934232, + "ewc_loss": 0.007182709872722626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.182709668995813e-05, + "grad_norm": 3.600196599960327, + "learning_rate": 1e-06, + "loss": 0.3952, + 
"mean_token_accuracy": 0.8659325242042542, + "num_tokens": 272459800.0, + "step": 7142 + }, + { + "epoch": 0.9086630199720137, + "ewc_loss": 0.0072484128177165985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.248413021443412e-05, + "grad_norm": 3.578029155731201, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8867709636688232, + "num_tokens": 272501245.0, + "step": 7143 + }, + { + "epoch": 0.9087902302506042, + "ewc_loss": 0.007207304704934359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.207304588519037e-05, + "grad_norm": 3.673245668411255, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8686233758926392, + "num_tokens": 272535873.0, + "step": 7144 + }, + { + "epoch": 0.9089174405291948, + "ewc_loss": 0.007270216941833496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.270216883625835e-05, + "grad_norm": 3.636713743209839, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8658400177955627, + "num_tokens": 272575213.0, + "step": 7145 + }, + { + "epoch": 0.9090446508077853, + "ewc_loss": 0.007214759476482868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.214759534690529e-05, + "grad_norm": 3.6267712116241455, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8693388104438782, + "num_tokens": 272611960.0, + "step": 7146 + }, + { + "epoch": 0.9091718610863758, + "ewc_loss": 0.007227544207125902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.227544119814411e-05, + "grad_norm": 3.612387180328369, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8687774538993835, + "num_tokens": 272650895.0, + "step": 7147 + }, + { + "epoch": 0.9092990713649662, + "ewc_loss": 0.007220180239528418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.220180123113096e-05, + "grad_norm": 3.621816396713257, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8685336112976074, + "num_tokens": 272690617.0, + "step": 7148 + }, + { + "epoch": 
0.9094262816435568, + "ewc_loss": 0.007230835035443306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.230835035443306e-05, + "grad_norm": 3.6405465602874756, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8581750392913818, + "num_tokens": 272732526.0, + "step": 7149 + }, + { + "epoch": 0.9095534919221473, + "ewc_loss": 0.007229524664580822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.229524635476992e-05, + "grad_norm": 3.6306474208831787, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8652145862579346, + "num_tokens": 272769882.0, + "step": 7150 + }, + { + "epoch": 0.9096807022007378, + "ewc_loss": 0.00721187237650156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.211872434709221e-05, + "grad_norm": 3.60800838470459, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8701024055480957, + "num_tokens": 272809614.0, + "step": 7151 + }, + { + "epoch": 0.9098079124793284, + "ewc_loss": 0.007199644576758146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.199644460342824e-05, + "grad_norm": 3.622664451599121, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8603702187538147, + "num_tokens": 272844819.0, + "step": 7152 + }, + { + "epoch": 0.9099351227579189, + "ewc_loss": 0.007234523072838783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.234523218357936e-05, + "grad_norm": 3.600769281387329, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8736685514450073, + "num_tokens": 272882798.0, + "step": 7153 + }, + { + "epoch": 0.9100623330365093, + "ewc_loss": 0.007194963749498129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.19496383680962e-05, + "grad_norm": 3.600888252258301, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8708657026290894, + "num_tokens": 272917472.0, + "step": 7154 + }, + { + "epoch": 0.9101895433150998, + "ewc_loss": 0.007220210041850805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.220209954539314e-05, + "grad_norm": 3.6011297702789307, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8567492365837097, + "num_tokens": 272959947.0, + "step": 7155 + }, + { + "epoch": 0.9103167535936904, + "ewc_loss": 0.007211171090602875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.211171032395214e-05, + "grad_norm": 3.6073415279388428, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8499417901039124, + "num_tokens": 272998194.0, + "step": 7156 + }, + { + "epoch": 0.9104439638722809, + "ewc_loss": 0.007204902824014425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.204902794910595e-05, + "grad_norm": 3.6156160831451416, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8847917318344116, + "num_tokens": 273034688.0, + "step": 7157 + }, + { + "epoch": 0.9105711741508714, + "ewc_loss": 0.007212310563772917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.212310447357595e-05, + "grad_norm": 3.6161012649536133, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8787193894386292, + "num_tokens": 273071575.0, + "step": 7158 + }, + { + "epoch": 0.9106983844294619, + "ewc_loss": 0.007209251634776592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.209251634776592e-05, + "grad_norm": 3.7155141830444336, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8422996997833252, + "num_tokens": 273106106.0, + "step": 7159 + }, + { + "epoch": 0.9108255947080524, + "ewc_loss": 0.007264137268066406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.264137093443424e-05, + "grad_norm": 3.6261792182922363, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8612238764762878, + "num_tokens": 273142959.0, + "step": 7160 + }, + { + "epoch": 0.9109528049866429, + "ewc_loss": 0.0071815671399235725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.181567343650386e-05, + "grad_norm": 3.6282835006713867, + "learning_rate": 1e-06, + "loss": 0.4331, + 
"mean_token_accuracy": 0.8516088724136353, + "num_tokens": 273182781.0, + "step": 7161 + }, + { + "epoch": 0.9110800152652334, + "ewc_loss": 0.007217241916805506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.217242091428488e-05, + "grad_norm": 3.5623815059661865, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8734458684921265, + "num_tokens": 273225738.0, + "step": 7162 + }, + { + "epoch": 0.9112072255438239, + "ewc_loss": 0.007190051954239607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.190051837824285e-05, + "grad_norm": 3.5445139408111572, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8747583627700806, + "num_tokens": 273269166.0, + "step": 7163 + }, + { + "epoch": 0.9113344358224145, + "ewc_loss": 0.007204912137240171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.204912253655493e-05, + "grad_norm": 3.607781410217285, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8617945313453674, + "num_tokens": 273309093.0, + "step": 7164 + }, + { + "epoch": 0.911461646101005, + "ewc_loss": 0.007244533393532038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.244533480843529e-05, + "grad_norm": 3.601186513900757, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.872455894947052, + "num_tokens": 273346857.0, + "step": 7165 + }, + { + "epoch": 0.9115888563795955, + "ewc_loss": 0.0071881297044456005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.188129529822618e-05, + "grad_norm": 3.5770654678344727, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8626004457473755, + "num_tokens": 273385720.0, + "step": 7166 + }, + { + "epoch": 0.9117160666581859, + "ewc_loss": 0.007208754774183035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.208754686871544e-05, + "grad_norm": 3.572725296020508, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8689271211624146, + "num_tokens": 273431079.0, + "step": 7167 + }, + { + "epoch": 
0.9118432769367765, + "ewc_loss": 0.007200208958238363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.200209074653685e-05, + "grad_norm": 3.569174289703369, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8687574863433838, + "num_tokens": 273477316.0, + "step": 7168 + }, + { + "epoch": 0.911970487215367, + "ewc_loss": 0.007187580224126577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.187580195022747e-05, + "grad_norm": 3.650153875350952, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8715515732765198, + "num_tokens": 273512831.0, + "step": 7169 + }, + { + "epoch": 0.9120976974939575, + "ewc_loss": 0.007227763067930937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.227763126138598e-05, + "grad_norm": 3.6364786624908447, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8604200482368469, + "num_tokens": 273551927.0, + "step": 7170 + }, + { + "epoch": 0.912224907772548, + "ewc_loss": 0.007196469232439995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.196469232439995e-05, + "grad_norm": 3.6933438777923584, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8712524175643921, + "num_tokens": 273583053.0, + "step": 7171 + }, + { + "epoch": 0.9123521180511386, + "ewc_loss": 0.007231409661471844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.231409836094826e-05, + "grad_norm": 3.6144473552703857, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8581944108009338, + "num_tokens": 273617953.0, + "step": 7172 + }, + { + "epoch": 0.912479328329729, + "ewc_loss": 0.0071928007528185844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.192800694610924e-05, + "grad_norm": 3.681380271911621, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8556358218193054, + "num_tokens": 273649959.0, + "step": 7173 + }, + { + "epoch": 0.9126065386083195, + "ewc_loss": 0.0072487154975533485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.248715701280162e-05, + "grad_norm": 3.654865026473999, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8573359847068787, + "num_tokens": 273682666.0, + "step": 7174 + }, + { + "epoch": 0.9127337488869101, + "ewc_loss": 0.007237328682094812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.237328827613965e-05, + "grad_norm": 3.6097116470336914, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8856676816940308, + "num_tokens": 273718641.0, + "step": 7175 + }, + { + "epoch": 0.9128609591655006, + "ewc_loss": 0.0072255367413163185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.225536683108658e-05, + "grad_norm": 3.5583417415618896, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.876814067363739, + "num_tokens": 273760502.0, + "step": 7176 + }, + { + "epoch": 0.9129881694440911, + "ewc_loss": 0.007224781438708305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.224781438708305e-05, + "grad_norm": 3.648470640182495, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8618828058242798, + "num_tokens": 273795068.0, + "step": 7177 + }, + { + "epoch": 0.9131153797226816, + "ewc_loss": 0.0072958157397806644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.295815885299817e-05, + "grad_norm": 3.7356579303741455, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8502376675605774, + "num_tokens": 273825099.0, + "step": 7178 + }, + { + "epoch": 0.9132425900012721, + "ewc_loss": 0.007330173626542091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.330173684749752e-05, + "grad_norm": 3.5902066230773926, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8733638525009155, + "num_tokens": 273862566.0, + "step": 7179 + }, + { + "epoch": 0.9133698002798626, + "ewc_loss": 0.0072329966351389885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.23299672245048e-05, + "grad_norm": 3.6345338821411133, + "learning_rate": 1e-06, + "loss": 0.3874, + 
"mean_token_accuracy": 0.8688838481903076, + "num_tokens": 273896107.0, + "step": 7180 + }, + { + "epoch": 0.9134970105584531, + "ewc_loss": 0.007314776070415974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.314776303246617e-05, + "grad_norm": 3.5780529975891113, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8725528717041016, + "num_tokens": 273934066.0, + "step": 7181 + }, + { + "epoch": 0.9136242208370436, + "ewc_loss": 0.0072659095749258995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.265909516718239e-05, + "grad_norm": 3.594456911087036, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8654305934906006, + "num_tokens": 273971843.0, + "step": 7182 + }, + { + "epoch": 0.9137514311156342, + "ewc_loss": 0.007302786223590374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.302786252694204e-05, + "grad_norm": 3.6642401218414307, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8604986071586609, + "num_tokens": 274001646.0, + "step": 7183 + }, + { + "epoch": 0.9138786413942247, + "ewc_loss": 0.007346670608967543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.34667046344839e-05, + "grad_norm": 3.641550302505493, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8514449596405029, + "num_tokens": 274042656.0, + "step": 7184 + }, + { + "epoch": 0.9140058516728151, + "ewc_loss": 0.0073082335293293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.308233762159944e-05, + "grad_norm": 3.569929599761963, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8550751209259033, + "num_tokens": 274084609.0, + "step": 7185 + }, + { + "epoch": 0.9141330619514056, + "ewc_loss": 0.007282644975930452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.282644946826622e-05, + "grad_norm": 3.5897982120513916, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8704513311386108, + "num_tokens": 274127503.0, + "step": 7186 + }, + { + "epoch": 
0.9142602722299962, + "ewc_loss": 0.0073248944245278835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.324894249904901e-05, + "grad_norm": 3.586914539337158, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8713392615318298, + "num_tokens": 274164126.0, + "step": 7187 + }, + { + "epoch": 0.9143874825085867, + "ewc_loss": 0.007305814418941736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.305814506253228e-05, + "grad_norm": 3.6412010192871094, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8675572872161865, + "num_tokens": 274198104.0, + "step": 7188 + }, + { + "epoch": 0.9145146927871772, + "ewc_loss": 0.007342006545513868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.342006574617699e-05, + "grad_norm": 3.6586668491363525, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.85642409324646, + "num_tokens": 274233487.0, + "step": 7189 + }, + { + "epoch": 0.9146419030657678, + "ewc_loss": 0.007313774898648262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.313775131478906e-05, + "grad_norm": 3.6019062995910645, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8719629049301147, + "num_tokens": 274267317.0, + "step": 7190 + }, + { + "epoch": 0.9147691133443582, + "ewc_loss": 0.007303254213184118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.303254096768796e-05, + "grad_norm": 3.6531646251678467, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8615500926971436, + "num_tokens": 274304169.0, + "step": 7191 + }, + { + "epoch": 0.9148963236229487, + "ewc_loss": 0.007340418174862862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.340418233070523e-05, + "grad_norm": 3.617894411087036, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8684282302856445, + "num_tokens": 274343265.0, + "step": 7192 + }, + { + "epoch": 0.9150235339015392, + "ewc_loss": 0.007309474982321262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.309475040528923e-05, + "grad_norm": 3.6539573669433594, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8521701097488403, + "num_tokens": 274380756.0, + "step": 7193 + }, + { + "epoch": 0.9151507441801298, + "ewc_loss": 0.0073328446596860886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.332844688789919e-05, + "grad_norm": 3.5769948959350586, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8777496814727783, + "num_tokens": 274422035.0, + "step": 7194 + }, + { + "epoch": 0.9152779544587203, + "ewc_loss": 0.0072747645899653435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.2747643571347e-05, + "grad_norm": 3.642664909362793, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8571823835372925, + "num_tokens": 274460875.0, + "step": 7195 + }, + { + "epoch": 0.9154051647373108, + "ewc_loss": 0.007327117957174778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.327117782551795e-05, + "grad_norm": 3.677703619003296, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8704726696014404, + "num_tokens": 274495807.0, + "step": 7196 + }, + { + "epoch": 0.9155323750159012, + "ewc_loss": 0.007325228303670883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.325228216359392e-05, + "grad_norm": 3.61336088180542, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8751908540725708, + "num_tokens": 274533098.0, + "step": 7197 + }, + { + "epoch": 0.9156595852944918, + "ewc_loss": 0.0072664725594222546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.266472675837576e-05, + "grad_norm": 3.7005481719970703, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.862335205078125, + "num_tokens": 274565282.0, + "step": 7198 + }, + { + "epoch": 0.9157867955730823, + "ewc_loss": 0.007345505058765411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.345504855038598e-05, + "grad_norm": 3.583735466003418, + "learning_rate": 1e-06, + "loss": 0.3787, + 
"mean_token_accuracy": 0.8708740472793579, + "num_tokens": 274605401.0, + "step": 7199 + }, + { + "epoch": 0.9159140058516728, + "ewc_loss": 0.007229616865515709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.229617040138692e-05, + "grad_norm": 3.6998934745788574, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8613038063049316, + "num_tokens": 274645018.0, + "step": 7200 + }, + { + "epoch": 0.9160412161302633, + "ewc_loss": 0.007348925340920687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.348925282713026e-05, + "grad_norm": 3.617259979248047, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8757975697517395, + "num_tokens": 274681541.0, + "step": 7201 + }, + { + "epoch": 0.9161684264088539, + "ewc_loss": 0.007274684961885214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.274685049196705e-05, + "grad_norm": 3.7127487659454346, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8700342178344727, + "num_tokens": 274716883.0, + "step": 7202 + }, + { + "epoch": 0.9162956366874443, + "ewc_loss": 0.007348321378231049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.348321378231049e-05, + "grad_norm": 3.65440034866333, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8613837957382202, + "num_tokens": 274753038.0, + "step": 7203 + }, + { + "epoch": 0.9164228469660348, + "ewc_loss": 0.007285342086106539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.2853421443142e-05, + "grad_norm": 3.588207960128784, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8745448589324951, + "num_tokens": 274790760.0, + "step": 7204 + }, + { + "epoch": 0.9165500572446253, + "ewc_loss": 0.007260907907038927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.260908023454249e-05, + "grad_norm": 3.586648941040039, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.860403299331665, + "num_tokens": 274830872.0, + "step": 7205 + }, + { + "epoch": 
0.9166772675232159, + "ewc_loss": 0.0072996043600142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.299604476429522e-05, + "grad_norm": 3.6493542194366455, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8542363047599792, + "num_tokens": 274869832.0, + "step": 7206 + }, + { + "epoch": 0.9168044778018064, + "ewc_loss": 0.007322630845010281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.322630699491128e-05, + "grad_norm": 3.6572067737579346, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8588743209838867, + "num_tokens": 274903853.0, + "step": 7207 + }, + { + "epoch": 0.9169316880803969, + "ewc_loss": 0.007299933582544327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.299933349713683e-05, + "grad_norm": 3.5885231494903564, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8663716316223145, + "num_tokens": 274942911.0, + "step": 7208 + }, + { + "epoch": 0.9170588983589874, + "ewc_loss": 0.00726623460650444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.266234752023593e-05, + "grad_norm": 3.5949058532714844, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8744953274726868, + "num_tokens": 274986427.0, + "step": 7209 + }, + { + "epoch": 0.9171861086375779, + "ewc_loss": 0.007295615039765835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.295615068869665e-05, + "grad_norm": 3.637784957885742, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8695727586746216, + "num_tokens": 275021557.0, + "step": 7210 + }, + { + "epoch": 0.9173133189161684, + "ewc_loss": 0.007306043058633804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.306042971322313e-05, + "grad_norm": 3.6570475101470947, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.85671067237854, + "num_tokens": 275061308.0, + "step": 7211 + }, + { + "epoch": 0.9174405291947589, + "ewc_loss": 0.007313946262001991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.313946116482839e-05, + "grad_norm": 3.5979082584381104, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8608311414718628, + "num_tokens": 275101497.0, + "step": 7212 + }, + { + "epoch": 0.9175677394733495, + "ewc_loss": 0.007258221041411161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.258221012307331e-05, + "grad_norm": 3.7371463775634766, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8546142578125, + "num_tokens": 275138317.0, + "step": 7213 + }, + { + "epoch": 0.91769494975194, + "ewc_loss": 0.007376761641353369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.376761641353369e-05, + "grad_norm": 3.5956532955169678, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8668370246887207, + "num_tokens": 275180119.0, + "step": 7214 + }, + { + "epoch": 0.9178221600305305, + "ewc_loss": 0.007233130745589733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.233130600070581e-05, + "grad_norm": 3.696000814437866, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8654940128326416, + "num_tokens": 275212083.0, + "step": 7215 + }, + { + "epoch": 0.9179493703091209, + "ewc_loss": 0.007343803066760302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.34380300855264e-05, + "grad_norm": 3.657984972000122, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8578635454177856, + "num_tokens": 275247872.0, + "step": 7216 + }, + { + "epoch": 0.9180765805877115, + "ewc_loss": 0.007279993034899235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.279992860276252e-05, + "grad_norm": 3.583099842071533, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8710668087005615, + "num_tokens": 275287022.0, + "step": 7217 + }, + { + "epoch": 0.918203790866302, + "ewc_loss": 0.007249148562550545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.249148620758206e-05, + "grad_norm": 3.672555685043335, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 
0.8688144683837891, + "num_tokens": 275322259.0, + "step": 7218 + }, + { + "epoch": 0.9183310011448925, + "ewc_loss": 0.007333570159971714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.333570101764053e-05, + "grad_norm": 3.6385200023651123, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8809242248535156, + "num_tokens": 275357410.0, + "step": 7219 + }, + { + "epoch": 0.918458211423483, + "ewc_loss": 0.007277890108525753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.277890108525753e-05, + "grad_norm": 3.672550678253174, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8783374428749084, + "num_tokens": 275388720.0, + "step": 7220 + }, + { + "epoch": 0.9185854217020736, + "ewc_loss": 0.007317764684557915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.317764539038762e-05, + "grad_norm": 3.623365640640259, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8739603757858276, + "num_tokens": 275427699.0, + "step": 7221 + }, + { + "epoch": 0.918712631980664, + "ewc_loss": 0.007294069044291973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.294068927876651e-05, + "grad_norm": 3.5920939445495605, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8692967891693115, + "num_tokens": 275472220.0, + "step": 7222 + }, + { + "epoch": 0.9188398422592545, + "ewc_loss": 0.007289351895451546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.289351924555376e-05, + "grad_norm": 3.67228102684021, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.855724573135376, + "num_tokens": 275506693.0, + "step": 7223 + }, + { + "epoch": 0.918967052537845, + "ewc_loss": 0.00733979232609272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.339792500715703e-05, + "grad_norm": 3.615891695022583, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8708919286727905, + "num_tokens": 275547295.0, + "step": 7224 + }, + { + "epoch": 0.9190942628164356, + "ewc_loss": 
0.007274709176272154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.274709059856832e-05, + "grad_norm": 3.5720410346984863, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8633915185928345, + "num_tokens": 275591552.0, + "step": 7225 + }, + { + "epoch": 0.9192214730950261, + "ewc_loss": 0.007266166154295206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.266166358022019e-05, + "grad_norm": 3.673100709915161, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8423339128494263, + "num_tokens": 275632197.0, + "step": 7226 + }, + { + "epoch": 0.9193486833736166, + "ewc_loss": 0.007336833514273167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.336833368754014e-05, + "grad_norm": 3.615056276321411, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8611904382705688, + "num_tokens": 275670680.0, + "step": 7227 + }, + { + "epoch": 0.919475893652207, + "ewc_loss": 0.007245434448122978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.245434244396165e-05, + "grad_norm": 3.600816249847412, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8643239736557007, + "num_tokens": 275709315.0, + "step": 7228 + }, + { + "epoch": 0.9196031039307976, + "ewc_loss": 0.007266136351972818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.266136526595801e-05, + "grad_norm": 3.705054759979248, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8667218685150146, + "num_tokens": 275740405.0, + "step": 7229 + }, + { + "epoch": 0.9197303142093881, + "ewc_loss": 0.007323363330215216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.323363388422877e-05, + "grad_norm": 3.5932226181030273, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8842239379882812, + "num_tokens": 275780915.0, + "step": 7230 + }, + { + "epoch": 0.9198575244879786, + "ewc_loss": 0.007216949015855789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.216948870336637e-05, + "grad_norm": 
3.629696846008301, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8619717955589294, + "num_tokens": 275817832.0, + "step": 7231 + }, + { + "epoch": 0.9199847347665692, + "ewc_loss": 0.00729123642668128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.29123639757745e-05, + "grad_norm": 3.601334571838379, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8703446388244629, + "num_tokens": 275857621.0, + "step": 7232 + }, + { + "epoch": 0.9201119450451597, + "ewc_loss": 0.007247927598655224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.247927715070546e-05, + "grad_norm": 3.6536073684692383, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8633419275283813, + "num_tokens": 275898917.0, + "step": 7233 + }, + { + "epoch": 0.9202391553237501, + "ewc_loss": 0.007268007379025221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.268007175298408e-05, + "grad_norm": 3.6880083084106445, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8709661960601807, + "num_tokens": 275930151.0, + "step": 7234 + }, + { + "epoch": 0.9203663656023406, + "ewc_loss": 0.0072720786556601524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.272078801179305e-05, + "grad_norm": 3.60146164894104, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8738492727279663, + "num_tokens": 275966901.0, + "step": 7235 + }, + { + "epoch": 0.9204935758809312, + "ewc_loss": 0.007203354500234127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.203354471130297e-05, + "grad_norm": 3.5708816051483154, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8883625864982605, + "num_tokens": 276002845.0, + "step": 7236 + }, + { + "epoch": 0.9206207861595217, + "ewc_loss": 0.007240588311105967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.240588456625119e-05, + "grad_norm": 3.658949136734009, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8666373491287231, + 
"num_tokens": 276038476.0, + "step": 7237 + }, + { + "epoch": 0.9207479964381122, + "ewc_loss": 0.00728572765365243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.285727770067751e-05, + "grad_norm": 3.6315903663635254, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8626700639724731, + "num_tokens": 276077867.0, + "step": 7238 + }, + { + "epoch": 0.9208752067167028, + "ewc_loss": 0.0072380308993160725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.238030957523733e-05, + "grad_norm": 3.5692813396453857, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.856923520565033, + "num_tokens": 276123289.0, + "step": 7239 + }, + { + "epoch": 0.9210024169952932, + "ewc_loss": 0.007205515634268522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.205515430541709e-05, + "grad_norm": 3.686553955078125, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8552255630493164, + "num_tokens": 276157357.0, + "step": 7240 + }, + { + "epoch": 0.9211296272738837, + "ewc_loss": 0.00729855801910162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.298558193724602e-05, + "grad_norm": 3.6046857833862305, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8871155381202698, + "num_tokens": 276195548.0, + "step": 7241 + }, + { + "epoch": 0.9212568375524742, + "ewc_loss": 0.0072103459388017654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.210345938801765e-05, + "grad_norm": 3.6636228561401367, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8688708543777466, + "num_tokens": 276233533.0, + "step": 7242 + }, + { + "epoch": 0.9213840478310648, + "ewc_loss": 0.007272999733686447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.27299993741326e-05, + "grad_norm": 3.6320981979370117, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8695621490478516, + "num_tokens": 276274310.0, + "step": 7243 + }, + { + "epoch": 0.9215112581096553, + "ewc_loss": 0.0072214026004076, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.221402483992279e-05, + "grad_norm": 3.632843255996704, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8650339245796204, + "num_tokens": 276310272.0, + "step": 7244 + }, + { + "epoch": 0.9216384683882458, + "ewc_loss": 0.007225493900477886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.225493754958734e-05, + "grad_norm": 3.5888566970825195, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8608608841896057, + "num_tokens": 276350334.0, + "step": 7245 + }, + { + "epoch": 0.9217656786668362, + "ewc_loss": 0.007214625831693411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.214625657070428e-05, + "grad_norm": 3.6119909286499023, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8617361783981323, + "num_tokens": 276393913.0, + "step": 7246 + }, + { + "epoch": 0.9218928889454268, + "ewc_loss": 0.007224667351692915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.224667206173763e-05, + "grad_norm": 3.6384060382843018, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.862375020980835, + "num_tokens": 276430368.0, + "step": 7247 + }, + { + "epoch": 0.9220200992240173, + "ewc_loss": 0.007241907063871622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.241906860144809e-05, + "grad_norm": 3.618875503540039, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8612995743751526, + "num_tokens": 276470930.0, + "step": 7248 + }, + { + "epoch": 0.9221473095026078, + "ewc_loss": 0.007204513531178236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.204513531178236e-05, + "grad_norm": 3.639789581298828, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8711117506027222, + "num_tokens": 276509359.0, + "step": 7249 + }, + { + "epoch": 0.9222745197811983, + "ewc_loss": 0.007225632201880217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.225631998153403e-05, + "grad_norm": 3.607612133026123, + "learning_rate": 
1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8597748279571533, + "num_tokens": 276546980.0, + "step": 7250 + }, + { + "epoch": 0.9224017300597889, + "ewc_loss": 0.0072035458870232105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.20354582881555e-05, + "grad_norm": 3.650930166244507, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8546395301818848, + "num_tokens": 276586212.0, + "step": 7251 + }, + { + "epoch": 0.9225289403383793, + "ewc_loss": 0.007223667576909065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.223667489597574e-05, + "grad_norm": 3.605203866958618, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8667684197425842, + "num_tokens": 276623946.0, + "step": 7252 + }, + { + "epoch": 0.9226561506169698, + "ewc_loss": 0.00719718961045146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.197189552243799e-05, + "grad_norm": 3.633328914642334, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8608747720718384, + "num_tokens": 276667567.0, + "step": 7253 + }, + { + "epoch": 0.9227833608955603, + "ewc_loss": 0.007201886270195246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.201886182883754e-05, + "grad_norm": 3.5823917388916016, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8599956035614014, + "num_tokens": 276708479.0, + "step": 7254 + }, + { + "epoch": 0.9229105711741509, + "ewc_loss": 0.007175853475928307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.175853534135967e-05, + "grad_norm": 3.669589042663574, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8606166243553162, + "num_tokens": 276744627.0, + "step": 7255 + }, + { + "epoch": 0.9230377814527414, + "ewc_loss": 0.007251901086419821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.251901115523651e-05, + "grad_norm": 3.622375965118408, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8676373958587646, + "num_tokens": 276782830.0, + "step": 7256 + }, + 
{ + "epoch": 0.9231649917313319, + "ewc_loss": 0.007184047717601061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.184047717601061e-05, + "grad_norm": 3.6309731006622314, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8650712966918945, + "num_tokens": 276825745.0, + "step": 7257 + }, + { + "epoch": 0.9232922020099223, + "ewc_loss": 0.0072027877904474735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.202787674032152e-05, + "grad_norm": 3.6045618057250977, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8705431222915649, + "num_tokens": 276866743.0, + "step": 7258 + }, + { + "epoch": 0.9234194122885129, + "ewc_loss": 0.007183420471847057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.183420530054718e-05, + "grad_norm": 3.6441843509674072, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8638626337051392, + "num_tokens": 276903884.0, + "step": 7259 + }, + { + "epoch": 0.9235466225671034, + "ewc_loss": 0.007212834432721138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.212834316305816e-05, + "grad_norm": 3.6448252201080322, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8733764886856079, + "num_tokens": 276939142.0, + "step": 7260 + }, + { + "epoch": 0.9236738328456939, + "ewc_loss": 0.007213391829282045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.213391654659063e-05, + "grad_norm": 3.646192789077759, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.868295431137085, + "num_tokens": 276979041.0, + "step": 7261 + }, + { + "epoch": 0.9238010431242845, + "ewc_loss": 0.0072002112865448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.20021125744097e-05, + "grad_norm": 3.6221187114715576, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8789716958999634, + "num_tokens": 277015590.0, + "step": 7262 + }, + { + "epoch": 0.923928253402875, + "ewc_loss": 0.0072082593105733395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.208259194158018e-05, + "grad_norm": 3.6794965267181396, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8611994981765747, + "num_tokens": 277055745.0, + "step": 7263 + }, + { + "epoch": 0.9240554636814655, + "ewc_loss": 0.007231356110423803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.231355994008482e-05, + "grad_norm": 3.6047351360321045, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.862543523311615, + "num_tokens": 277098856.0, + "step": 7264 + }, + { + "epoch": 0.9241826739600559, + "ewc_loss": 0.007162024732679129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.16202484909445e-05, + "grad_norm": 3.6014490127563477, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8779093027114868, + "num_tokens": 277139165.0, + "step": 7265 + }, + { + "epoch": 0.9243098842386465, + "ewc_loss": 0.0071872747503221035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.187274604802951e-05, + "grad_norm": 3.6484179496765137, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8727656006813049, + "num_tokens": 277171879.0, + "step": 7266 + }, + { + "epoch": 0.924437094517237, + "ewc_loss": 0.007218108046799898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.218107930384576e-05, + "grad_norm": 3.6001763343811035, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8654120564460754, + "num_tokens": 277211719.0, + "step": 7267 + }, + { + "epoch": 0.9245643047958275, + "ewc_loss": 0.007150229997932911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.150229794206098e-05, + "grad_norm": 3.6038119792938232, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.857951283454895, + "num_tokens": 277252675.0, + "step": 7268 + }, + { + "epoch": 0.924691515074418, + "ewc_loss": 0.007196616381406784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.196616206783801e-05, + "grad_norm": 3.6669108867645264, + "learning_rate": 1e-06, + "loss": 0.4224, + 
"mean_token_accuracy": 0.8544695377349854, + "num_tokens": 277287529.0, + "step": 7269 + }, + { + "epoch": 0.9248187253530086, + "ewc_loss": 0.0072423843666911125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.242384162964299e-05, + "grad_norm": 3.6129674911499023, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8646986484527588, + "num_tokens": 277328392.0, + "step": 7270 + }, + { + "epoch": 0.924945935631599, + "ewc_loss": 0.007191743701696396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.191743497969583e-05, + "grad_norm": 3.628333806991577, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8742853403091431, + "num_tokens": 277368412.0, + "step": 7271 + }, + { + "epoch": 0.9250731459101895, + "ewc_loss": 0.007229480892419815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.229480979731306e-05, + "grad_norm": 3.603644609451294, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8715802431106567, + "num_tokens": 277404373.0, + "step": 7272 + }, + { + "epoch": 0.92520035618878, + "ewc_loss": 0.007207192480564117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.207192538771778e-05, + "grad_norm": 3.618928909301758, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.864046037197113, + "num_tokens": 277444988.0, + "step": 7273 + }, + { + "epoch": 0.9253275664673706, + "ewc_loss": 0.007241794839501381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.24179481039755e-05, + "grad_norm": 3.627108097076416, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8749815225601196, + "num_tokens": 277487501.0, + "step": 7274 + }, + { + "epoch": 0.9254547767459611, + "ewc_loss": 0.007231198716908693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.231198833324015e-05, + "grad_norm": 3.6038615703582764, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8642922639846802, + "num_tokens": 277529056.0, + "step": 7275 + }, + { + "epoch": 
0.9255819870245516, + "ewc_loss": 0.00722704641520977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.2270464443136e-05, + "grad_norm": 3.6229233741760254, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8797954320907593, + "num_tokens": 277563022.0, + "step": 7276 + }, + { + "epoch": 0.925709197303142, + "ewc_loss": 0.007241074461489916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.241074490593746e-05, + "grad_norm": 3.5762155055999756, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8737587928771973, + "num_tokens": 277603879.0, + "step": 7277 + }, + { + "epoch": 0.9258364075817326, + "ewc_loss": 0.0072188894264400005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.21888936823234e-05, + "grad_norm": 3.6069769859313965, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8622350692749023, + "num_tokens": 277643711.0, + "step": 7278 + }, + { + "epoch": 0.9259636178603231, + "ewc_loss": 0.00724527332931757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.245273445732892e-05, + "grad_norm": 3.688843011856079, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8661078214645386, + "num_tokens": 277680667.0, + "step": 7279 + }, + { + "epoch": 0.9260908281389136, + "ewc_loss": 0.007288982160389423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.288982305908576e-05, + "grad_norm": 3.715271472930908, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8655372858047485, + "num_tokens": 277715548.0, + "step": 7280 + }, + { + "epoch": 0.9262180384175042, + "ewc_loss": 0.0072690933011472225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.269093475770205e-05, + "grad_norm": 3.709690809249878, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8647360801696777, + "num_tokens": 277745287.0, + "step": 7281 + }, + { + "epoch": 0.9263452486960947, + "ewc_loss": 0.007262891624122858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.262891449499875e-05, + "grad_norm": 3.6567986011505127, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8593112826347351, + "num_tokens": 277781484.0, + "step": 7282 + }, + { + "epoch": 0.9264724589746851, + "ewc_loss": 0.00723872147500515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.23872144590132e-05, + "grad_norm": 3.559917688369751, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8697750568389893, + "num_tokens": 277818429.0, + "step": 7283 + }, + { + "epoch": 0.9265996692532756, + "ewc_loss": 0.007215333636850119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.215333607746288e-05, + "grad_norm": 3.6091887950897217, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8733501434326172, + "num_tokens": 277853077.0, + "step": 7284 + }, + { + "epoch": 0.9267268795318662, + "ewc_loss": 0.007286802399903536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.286802429007366e-05, + "grad_norm": 3.5657339096069336, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8839428424835205, + "num_tokens": 277893997.0, + "step": 7285 + }, + { + "epoch": 0.9268540898104567, + "ewc_loss": 0.007228604052215815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.228604226838797e-05, + "grad_norm": 3.6218879222869873, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8563978672027588, + "num_tokens": 277934680.0, + "step": 7286 + }, + { + "epoch": 0.9269813000890472, + "ewc_loss": 0.007296693976968527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.296694093383849e-05, + "grad_norm": 3.590270757675171, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8597010374069214, + "num_tokens": 277980370.0, + "step": 7287 + }, + { + "epoch": 0.9271085103676378, + "ewc_loss": 0.00724839698523283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.24839701433666e-05, + "grad_norm": 3.614346504211426, + "learning_rate": 1e-06, + "loss": 0.3502, + 
"mean_token_accuracy": 0.8783828616142273, + "num_tokens": 278017257.0, + "step": 7288 + }, + { + "epoch": 0.9272357206462282, + "ewc_loss": 0.007270323112607002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.270323112607002e-05, + "grad_norm": 3.6341614723205566, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8666244745254517, + "num_tokens": 278054835.0, + "step": 7289 + }, + { + "epoch": 0.9273629309248187, + "ewc_loss": 0.007272862363606691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.272862421814352e-05, + "grad_norm": 3.6426122188568115, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8653952479362488, + "num_tokens": 278089510.0, + "step": 7290 + }, + { + "epoch": 0.9274901412034092, + "ewc_loss": 0.00725729763507843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.257297693286091e-05, + "grad_norm": 3.625988245010376, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.867129921913147, + "num_tokens": 278126179.0, + "step": 7291 + }, + { + "epoch": 0.9276173514819998, + "ewc_loss": 0.007257980760186911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.257980905706063e-05, + "grad_norm": 3.6645565032958984, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8452683091163635, + "num_tokens": 278170837.0, + "step": 7292 + }, + { + "epoch": 0.9277445617605903, + "ewc_loss": 0.007284736260771751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.284736057044938e-05, + "grad_norm": 3.583791971206665, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8713860511779785, + "num_tokens": 278207410.0, + "step": 7293 + }, + { + "epoch": 0.9278717720391808, + "ewc_loss": 0.007206721697002649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.20672178431414e-05, + "grad_norm": 3.6468639373779297, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8861632943153381, + "num_tokens": 278240495.0, + "step": 7294 + }, + { + "epoch": 
0.9279989823177712, + "ewc_loss": 0.007277824450284243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.277824624907225e-05, + "grad_norm": 3.6080472469329834, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8584301471710205, + "num_tokens": 278278449.0, + "step": 7295 + }, + { + "epoch": 0.9281261925963618, + "ewc_loss": 0.007238161750137806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.238161924760789e-05, + "grad_norm": 3.5711135864257812, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8770651817321777, + "num_tokens": 278321434.0, + "step": 7296 + }, + { + "epoch": 0.9282534028749523, + "ewc_loss": 0.007233383134007454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.233383075799793e-05, + "grad_norm": 3.6427416801452637, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8607494831085205, + "num_tokens": 278363730.0, + "step": 7297 + }, + { + "epoch": 0.9283806131535428, + "ewc_loss": 0.0072736069560050964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.273606752278283e-05, + "grad_norm": 3.678954839706421, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8671778440475464, + "num_tokens": 278395493.0, + "step": 7298 + }, + { + "epoch": 0.9285078234321333, + "ewc_loss": 0.007276860065758228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.276859832927585e-05, + "grad_norm": 3.6475653648376465, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8689696788787842, + "num_tokens": 278433024.0, + "step": 7299 + }, + { + "epoch": 0.9286350337107239, + "ewc_loss": 0.007227800320833921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.22780023352243e-05, + "grad_norm": 3.7528722286224365, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8672149777412415, + "num_tokens": 278471311.0, + "step": 7300 + }, + { + "epoch": 0.9287622439893143, + "ewc_loss": 0.007308973930776119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.308973727049306e-05, + "grad_norm": 3.589308261871338, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8848536014556885, + "num_tokens": 278509151.0, + "step": 7301 + }, + { + "epoch": 0.9288894542679048, + "ewc_loss": 0.007167622912675142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.167622970882803e-05, + "grad_norm": 3.5923855304718018, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8736572861671448, + "num_tokens": 278548743.0, + "step": 7302 + }, + { + "epoch": 0.9290166645464953, + "ewc_loss": 0.007231422234326601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.23142220522277e-05, + "grad_norm": 3.5731184482574463, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8771092295646667, + "num_tokens": 278590722.0, + "step": 7303 + }, + { + "epoch": 0.9291438748250859, + "ewc_loss": 0.007204866968095303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.204867142718285e-05, + "grad_norm": 3.628471851348877, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8551967144012451, + "num_tokens": 278628150.0, + "step": 7304 + }, + { + "epoch": 0.9292710851036764, + "ewc_loss": 0.007242146413773298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.242146239150316e-05, + "grad_norm": 3.578481674194336, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8518980741500854, + "num_tokens": 278672543.0, + "step": 7305 + }, + { + "epoch": 0.9293982953822669, + "ewc_loss": 0.0071862321346998215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.186231960076839e-05, + "grad_norm": 3.6425867080688477, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8746223449707031, + "num_tokens": 278707234.0, + "step": 7306 + }, + { + "epoch": 0.9295255056608573, + "ewc_loss": 0.007254971656948328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.254971569636837e-05, + "grad_norm": 3.763414144515991, + "learning_rate": 1e-06, + "loss": 0.3796, + 
"mean_token_accuracy": 0.8685182332992554, + "num_tokens": 278734678.0, + "step": 7307 + }, + { + "epoch": 0.9296527159394479, + "ewc_loss": 0.007303315214812756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.303315214812756e-05, + "grad_norm": 3.5433313846588135, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8760523796081543, + "num_tokens": 278778284.0, + "step": 7308 + }, + { + "epoch": 0.9297799262180384, + "ewc_loss": 0.0071425470523536205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.142547110561281e-05, + "grad_norm": 3.635141134262085, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8654545545578003, + "num_tokens": 278815363.0, + "step": 7309 + }, + { + "epoch": 0.9299071364966289, + "ewc_loss": 0.007291681133210659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.291680958587676e-05, + "grad_norm": 3.6006288528442383, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8547970652580261, + "num_tokens": 278860777.0, + "step": 7310 + }, + { + "epoch": 0.9300343467752195, + "ewc_loss": 0.007223622873425484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.223623106256127e-05, + "grad_norm": 3.6635236740112305, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8812923431396484, + "num_tokens": 278890552.0, + "step": 7311 + }, + { + "epoch": 0.93016155705381, + "ewc_loss": 0.007270887028425932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.270886999322101e-05, + "grad_norm": 3.7044575214385986, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8574138879776001, + "num_tokens": 278924741.0, + "step": 7312 + }, + { + "epoch": 0.9302887673324005, + "ewc_loss": 0.007282149512320757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.282149454113096e-05, + "grad_norm": 3.5923171043395996, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8761402368545532, + "num_tokens": 278965185.0, + "step": 7313 + }, + { + "epoch": 
0.9304159776109909, + "ewc_loss": 0.00720860855653882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.208608440123498e-05, + "grad_norm": 3.678493022918701, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.857138991355896, + "num_tokens": 278998961.0, + "step": 7314 + }, + { + "epoch": 0.9305431878895815, + "ewc_loss": 0.0073196059092879295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.319606083910912e-05, + "grad_norm": 3.7387077808380127, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8540562391281128, + "num_tokens": 279032689.0, + "step": 7315 + }, + { + "epoch": 0.930670398168172, + "ewc_loss": 0.007323091384023428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.323091267608106e-05, + "grad_norm": 3.595662832260132, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8530011773109436, + "num_tokens": 279080305.0, + "step": 7316 + }, + { + "epoch": 0.9307976084467625, + "ewc_loss": 0.007228089030832052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.228089089039713e-05, + "grad_norm": 3.561948299407959, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8684136867523193, + "num_tokens": 279120581.0, + "step": 7317 + }, + { + "epoch": 0.930924818725353, + "ewc_loss": 0.007276997901499271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.276998076122254e-05, + "grad_norm": 3.5958006381988525, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.880129337310791, + "num_tokens": 279165050.0, + "step": 7318 + }, + { + "epoch": 0.9310520290039436, + "ewc_loss": 0.007290284615010023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.290284702321514e-05, + "grad_norm": 3.5837018489837646, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8703453540802002, + "num_tokens": 279209393.0, + "step": 7319 + }, + { + "epoch": 0.931179239282534, + "ewc_loss": 0.007265937048941851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.265937165357172e-05, + "grad_norm": 3.6611897945404053, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8545401692390442, + "num_tokens": 279250569.0, + "step": 7320 + }, + { + "epoch": 0.9313064495611245, + "ewc_loss": 0.00731702521443367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.317025301745161e-05, + "grad_norm": 3.584784507751465, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8743629455566406, + "num_tokens": 279291154.0, + "step": 7321 + }, + { + "epoch": 0.931433659839715, + "ewc_loss": 0.007227993104606867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.227993046399206e-05, + "grad_norm": 3.61566424369812, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8666526079177856, + "num_tokens": 279326967.0, + "step": 7322 + }, + { + "epoch": 0.9315608701183056, + "ewc_loss": 0.007291295565664768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.291295332834125e-05, + "grad_norm": 3.618659734725952, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.878863513469696, + "num_tokens": 279367129.0, + "step": 7323 + }, + { + "epoch": 0.9316880803968961, + "ewc_loss": 0.0072609842754900455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.260984421009198e-05, + "grad_norm": 3.610506057739258, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8726520538330078, + "num_tokens": 279403651.0, + "step": 7324 + }, + { + "epoch": 0.9318152906754866, + "ewc_loss": 0.0072615863755345345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.261586142703891e-05, + "grad_norm": 3.71042799949646, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8597266674041748, + "num_tokens": 279439166.0, + "step": 7325 + }, + { + "epoch": 0.931942500954077, + "ewc_loss": 0.007312856148928404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.312856178032234e-05, + "grad_norm": 3.62463116645813, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 
0.8663119673728943, + "num_tokens": 279473489.0, + "step": 7326 + }, + { + "epoch": 0.9320697112326676, + "ewc_loss": 0.007225490640848875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.225490844575688e-05, + "grad_norm": 3.615300416946411, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8705713748931885, + "num_tokens": 279511709.0, + "step": 7327 + }, + { + "epoch": 0.9321969215112581, + "ewc_loss": 0.0072578019462525845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.257801917148754e-05, + "grad_norm": 3.6234912872314453, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8546319007873535, + "num_tokens": 279551638.0, + "step": 7328 + }, + { + "epoch": 0.9323241317898486, + "ewc_loss": 0.007261928636580706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.261928840307519e-05, + "grad_norm": 3.6033616065979004, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8794018030166626, + "num_tokens": 279587304.0, + "step": 7329 + }, + { + "epoch": 0.9324513420684392, + "ewc_loss": 0.007248310372233391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.248310430441052e-05, + "grad_norm": 3.6105475425720215, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8596090078353882, + "num_tokens": 279628138.0, + "step": 7330 + }, + { + "epoch": 0.9325785523470297, + "ewc_loss": 0.007270062807947397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.270062633324414e-05, + "grad_norm": 3.6619763374328613, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8721802830696106, + "num_tokens": 279663064.0, + "step": 7331 + }, + { + "epoch": 0.9327057626256201, + "ewc_loss": 0.007284815888851881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.284816092578694e-05, + "grad_norm": 3.6449713706970215, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8596897721290588, + "num_tokens": 279704937.0, + "step": 7332 + }, + { + "epoch": 0.9328329729042106, + "ewc_loss": 
0.007253320887684822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.253320654854178e-05, + "grad_norm": 3.6319260597229004, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.871131181716919, + "num_tokens": 279738215.0, + "step": 7333 + }, + { + "epoch": 0.9329601831828012, + "ewc_loss": 0.007264784071594477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.264783926075324e-05, + "grad_norm": 3.67254376411438, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8512165546417236, + "num_tokens": 279771996.0, + "step": 7334 + }, + { + "epoch": 0.9330873934613917, + "ewc_loss": 0.007296761963516474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.29676175978966e-05, + "grad_norm": 3.6044411659240723, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8804299831390381, + "num_tokens": 279810037.0, + "step": 7335 + }, + { + "epoch": 0.9332146037399822, + "ewc_loss": 0.007257833145558834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.257833203766495e-05, + "grad_norm": 3.6377737522125244, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8609453439712524, + "num_tokens": 279847394.0, + "step": 7336 + }, + { + "epoch": 0.9333418140185727, + "ewc_loss": 0.007301077246665955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.301077130250633e-05, + "grad_norm": 3.70127272605896, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8544574975967407, + "num_tokens": 279879057.0, + "step": 7337 + }, + { + "epoch": 0.9334690242971632, + "ewc_loss": 0.007332067005336285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.332066888920963e-05, + "grad_norm": 3.6968295574188232, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8625814914703369, + "num_tokens": 279911775.0, + "step": 7338 + }, + { + "epoch": 0.9335962345757537, + "ewc_loss": 0.007314003072679043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.31400286895223e-05, + "grad_norm": 
3.6158313751220703, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8698797821998596, + "num_tokens": 279947661.0, + "step": 7339 + }, + { + "epoch": 0.9337234448543442, + "ewc_loss": 0.007312675938010216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.312675734283403e-05, + "grad_norm": 3.7175934314727783, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8687522411346436, + "num_tokens": 279977507.0, + "step": 7340 + }, + { + "epoch": 0.9338506551329347, + "ewc_loss": 0.0073906127363443375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.39061288186349e-05, + "grad_norm": 3.5987861156463623, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8633502125740051, + "num_tokens": 280016749.0, + "step": 7341 + }, + { + "epoch": 0.9339778654115253, + "ewc_loss": 0.00730268657207489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.30268657207489e-05, + "grad_norm": 3.6470730304718018, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8683253526687622, + "num_tokens": 280056092.0, + "step": 7342 + }, + { + "epoch": 0.9341050756901158, + "ewc_loss": 0.007369421422481537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.36942165531218e-05, + "grad_norm": 3.66206955909729, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.878298282623291, + "num_tokens": 280089803.0, + "step": 7343 + }, + { + "epoch": 0.9342322859687062, + "ewc_loss": 0.00736922537907958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.369225204456598e-05, + "grad_norm": 3.670322895050049, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8555551767349243, + "num_tokens": 280126190.0, + "step": 7344 + }, + { + "epoch": 0.9343594962472968, + "ewc_loss": 0.007362473756074905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.362473843386397e-05, + "grad_norm": 3.5907349586486816, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8769313097000122, + 
"num_tokens": 280162743.0, + "step": 7345 + }, + { + "epoch": 0.9344867065258873, + "ewc_loss": 0.0073165372014045715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.31653708498925e-05, + "grad_norm": 3.608530044555664, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.865791916847229, + "num_tokens": 280203676.0, + "step": 7346 + }, + { + "epoch": 0.9346139168044778, + "ewc_loss": 0.007356276735663414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.356276910286397e-05, + "grad_norm": 3.6285336017608643, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8835934400558472, + "num_tokens": 280241765.0, + "step": 7347 + }, + { + "epoch": 0.9347411270830683, + "ewc_loss": 0.007339779287576675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.339779403991997e-05, + "grad_norm": 3.671812057495117, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8468185663223267, + "num_tokens": 280280103.0, + "step": 7348 + }, + { + "epoch": 0.9348683373616589, + "ewc_loss": 0.0073832315392792225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.3832314228639e-05, + "grad_norm": 3.7797036170959473, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8509896397590637, + "num_tokens": 280314674.0, + "step": 7349 + }, + { + "epoch": 0.9349955476402493, + "ewc_loss": 0.0074045974761247635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.40459727239795e-05, + "grad_norm": 3.5947563648223877, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8684751987457275, + "num_tokens": 280353890.0, + "step": 7350 + }, + { + "epoch": 0.9351227579188398, + "ewc_loss": 0.007261520717293024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.261520659085363e-05, + "grad_norm": 3.599691390991211, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8586255311965942, + "num_tokens": 280396403.0, + "step": 7351 + }, + { + "epoch": 0.9352499681974303, + "ewc_loss": 0.007324341218918562, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.324341277126223e-05, + "grad_norm": 3.6444995403289795, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8674454092979431, + "num_tokens": 280436154.0, + "step": 7352 + }, + { + "epoch": 0.9353771784760209, + "ewc_loss": 0.007341362535953522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.341362652368844e-05, + "grad_norm": 3.5689339637756348, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8734806180000305, + "num_tokens": 280476480.0, + "step": 7353 + }, + { + "epoch": 0.9355043887546114, + "ewc_loss": 0.007284241262823343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.284241291927174e-05, + "grad_norm": 3.734532356262207, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8712327480316162, + "num_tokens": 280505976.0, + "step": 7354 + }, + { + "epoch": 0.9356315990332019, + "ewc_loss": 0.007416905835270882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.416906009893864e-05, + "grad_norm": 3.593756675720215, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8633441925048828, + "num_tokens": 280548365.0, + "step": 7355 + }, + { + "epoch": 0.9357588093117923, + "ewc_loss": 0.007279274053871632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.279273995663971e-05, + "grad_norm": 3.6014201641082764, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8848038911819458, + "num_tokens": 280588074.0, + "step": 7356 + }, + { + "epoch": 0.9358860195903829, + "ewc_loss": 0.007335000671446323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.335000555031002e-05, + "grad_norm": 3.632831573486328, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8762834668159485, + "num_tokens": 280626915.0, + "step": 7357 + }, + { + "epoch": 0.9360132298689734, + "ewc_loss": 0.007330911234021187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.33091146685183e-05, + "grad_norm": 3.7331125736236572, + 
"learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8631364703178406, + "num_tokens": 280660088.0, + "step": 7358 + }, + { + "epoch": 0.9361404401475639, + "ewc_loss": 0.00737012317404151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.370123057626188e-05, + "grad_norm": 3.6188950538635254, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.864845871925354, + "num_tokens": 280701326.0, + "step": 7359 + }, + { + "epoch": 0.9362676504261545, + "ewc_loss": 0.0072829388082027435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.282938895514235e-05, + "grad_norm": 3.617149829864502, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8764135837554932, + "num_tokens": 280740256.0, + "step": 7360 + }, + { + "epoch": 0.936394860704745, + "ewc_loss": 0.007313216105103493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.313216337934136e-05, + "grad_norm": 3.7108662128448486, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8804854154586792, + "num_tokens": 280773515.0, + "step": 7361 + }, + { + "epoch": 0.9365220709833355, + "ewc_loss": 0.007352071348577738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.352071406785399e-05, + "grad_norm": 3.6708950996398926, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8415971398353577, + "num_tokens": 280817207.0, + "step": 7362 + }, + { + "epoch": 0.9366492812619259, + "ewc_loss": 0.007297410164028406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.297410047613084e-05, + "grad_norm": 3.629757881164551, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8544822335243225, + "num_tokens": 280860253.0, + "step": 7363 + }, + { + "epoch": 0.9367764915405165, + "ewc_loss": 0.007279559969902039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.279559940798208e-05, + "grad_norm": 3.695931911468506, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8639230728149414, + "num_tokens": 280892634.0, + 
"step": 7364 + }, + { + "epoch": 0.936903701819107, + "ewc_loss": 0.007340185344219208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.340185402426869e-05, + "grad_norm": 3.6401305198669434, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8756252527236938, + "num_tokens": 280930912.0, + "step": 7365 + }, + { + "epoch": 0.9370309120976975, + "ewc_loss": 0.007268529385328293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.268529589055106e-05, + "grad_norm": 3.596301794052124, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8701239824295044, + "num_tokens": 280971588.0, + "step": 7366 + }, + { + "epoch": 0.937158122376288, + "ewc_loss": 0.0072595058009028435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.259505946421996e-05, + "grad_norm": 3.6759092807769775, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8747682571411133, + "num_tokens": 281009898.0, + "step": 7367 + }, + { + "epoch": 0.9372853326548786, + "ewc_loss": 0.007298332639038563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.298332639038563e-05, + "grad_norm": 3.6403636932373047, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8613319993019104, + "num_tokens": 281046726.0, + "step": 7368 + }, + { + "epoch": 0.937412542933469, + "ewc_loss": 0.007244256790727377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.24425699445419e-05, + "grad_norm": 3.5924243927001953, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8815199732780457, + "num_tokens": 281085163.0, + "step": 7369 + }, + { + "epoch": 0.9375397532120595, + "ewc_loss": 0.007243759464472532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.24375931895338e-05, + "grad_norm": 3.6156020164489746, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.872313380241394, + "num_tokens": 281128660.0, + "step": 7370 + }, + { + "epoch": 0.93766696349065, + "ewc_loss": 0.007250122260302305, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.250122143886983e-05, + "grad_norm": 3.677700996398926, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8506532311439514, + "num_tokens": 281163780.0, + "step": 7371 + }, + { + "epoch": 0.9377941737692406, + "ewc_loss": 0.007297669071704149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.297669071704149e-05, + "grad_norm": 3.6097447872161865, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8757104873657227, + "num_tokens": 281204247.0, + "step": 7372 + }, + { + "epoch": 0.9379213840478311, + "ewc_loss": 0.007219628430902958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.21962860552594e-05, + "grad_norm": 3.5903494358062744, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8908909559249878, + "num_tokens": 281239518.0, + "step": 7373 + }, + { + "epoch": 0.9380485943264216, + "ewc_loss": 0.007240572944283485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.240573177114129e-05, + "grad_norm": 3.6413047313690186, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8762218952178955, + "num_tokens": 281275916.0, + "step": 7374 + }, + { + "epoch": 0.938175804605012, + "ewc_loss": 0.007281776983290911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.28177692508325e-05, + "grad_norm": 3.662069797515869, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.878778874874115, + "num_tokens": 281307615.0, + "step": 7375 + }, + { + "epoch": 0.9383030148836026, + "ewc_loss": 0.007264040410518646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.264040323207155e-05, + "grad_norm": 3.6526763439178467, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8726348876953125, + "num_tokens": 281342677.0, + "step": 7376 + }, + { + "epoch": 0.9384302251621931, + "ewc_loss": 0.0072455573827028275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.245557208079845e-05, + "grad_norm": 3.685183525085449, + "learning_rate": 1e-06, + "loss": 
0.4118, + "mean_token_accuracy": 0.8625040054321289, + "num_tokens": 281379545.0, + "step": 7377 + }, + { + "epoch": 0.9385574354407836, + "ewc_loss": 0.00728798471391201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.287984772119671e-05, + "grad_norm": 3.607586145401001, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8506757020950317, + "num_tokens": 281424169.0, + "step": 7378 + }, + { + "epoch": 0.9386846457193742, + "ewc_loss": 0.007226283196359873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.226283196359873e-05, + "grad_norm": 3.6070258617401123, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8871962428092957, + "num_tokens": 281461306.0, + "step": 7379 + }, + { + "epoch": 0.9388118559979647, + "ewc_loss": 0.007252118084579706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.252117939060554e-05, + "grad_norm": 3.607583999633789, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8808843493461609, + "num_tokens": 281498753.0, + "step": 7380 + }, + { + "epoch": 0.9389390662765551, + "ewc_loss": 0.007254092022776604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.254091906361282e-05, + "grad_norm": 3.623995780944824, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8768965005874634, + "num_tokens": 281537163.0, + "step": 7381 + }, + { + "epoch": 0.9390662765551456, + "ewc_loss": 0.007256553508341312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.25655336282216e-05, + "grad_norm": 3.6296777725219727, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8582397699356079, + "num_tokens": 281577926.0, + "step": 7382 + }, + { + "epoch": 0.9391934868337362, + "ewc_loss": 0.007247437257319689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.24743731552735e-05, + "grad_norm": 3.569319009780884, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.886375904083252, + "num_tokens": 281621283.0, + "step": 7383 + }, + { + "epoch": 
0.9393206971123267, + "ewc_loss": 0.007212448865175247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.212448690552264e-05, + "grad_norm": 3.648042678833008, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8710712790489197, + "num_tokens": 281658336.0, + "step": 7384 + }, + { + "epoch": 0.9394479073909172, + "ewc_loss": 0.0072767892852425575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.276789256138727e-05, + "grad_norm": 3.631267547607422, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8707259893417358, + "num_tokens": 281695137.0, + "step": 7385 + }, + { + "epoch": 0.9395751176695077, + "ewc_loss": 0.0072381747886538506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.238175021484494e-05, + "grad_norm": 3.593409776687622, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8818284273147583, + "num_tokens": 281733570.0, + "step": 7386 + }, + { + "epoch": 0.9397023279480982, + "ewc_loss": 0.007220630533993244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.220630504889414e-05, + "grad_norm": 3.6535916328430176, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8686342239379883, + "num_tokens": 281771163.0, + "step": 7387 + }, + { + "epoch": 0.9398295382266887, + "ewc_loss": 0.0072556170634925365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.255616947077215e-05, + "grad_norm": 3.5872292518615723, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8719350695610046, + "num_tokens": 281813715.0, + "step": 7388 + }, + { + "epoch": 0.9399567485052792, + "ewc_loss": 0.007195988204330206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.195988291641697e-05, + "grad_norm": 3.5984413623809814, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8663830161094666, + "num_tokens": 281856921.0, + "step": 7389 + }, + { + "epoch": 0.9400839587838697, + "ewc_loss": 0.007224234286695719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.224234286695719e-05, + "grad_norm": 3.6671106815338135, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8586669564247131, + "num_tokens": 281893273.0, + "step": 7390 + }, + { + "epoch": 0.9402111690624603, + "ewc_loss": 0.007256404031068087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.25640420569107e-05, + "grad_norm": 3.672197103500366, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.872991681098938, + "num_tokens": 281927452.0, + "step": 7391 + }, + { + "epoch": 0.9403383793410508, + "ewc_loss": 0.00722130062058568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.22130062058568e-05, + "grad_norm": 3.5496716499328613, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.892886221408844, + "num_tokens": 281965359.0, + "step": 7392 + }, + { + "epoch": 0.9404655896196412, + "ewc_loss": 0.007166493684053421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.166493742261082e-05, + "grad_norm": 3.698399066925049, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8659204244613647, + "num_tokens": 281999209.0, + "step": 7393 + }, + { + "epoch": 0.9405927998982317, + "ewc_loss": 0.00730808824300766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.30808824300766e-05, + "grad_norm": 3.593231201171875, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8583964109420776, + "num_tokens": 282042129.0, + "step": 7394 + }, + { + "epoch": 0.9407200101768223, + "ewc_loss": 0.007177701685577631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.17770162736997e-05, + "grad_norm": 3.598862648010254, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8597895503044128, + "num_tokens": 282087941.0, + "step": 7395 + }, + { + "epoch": 0.9408472204554128, + "ewc_loss": 0.007229398004710674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.229398033814505e-05, + "grad_norm": 3.665536403656006, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 
0.873305082321167, + "num_tokens": 282123130.0, + "step": 7396 + }, + { + "epoch": 0.9409744307340033, + "ewc_loss": 0.0072819688357412815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.281969010364264e-05, + "grad_norm": 3.6404922008514404, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8638201951980591, + "num_tokens": 282161142.0, + "step": 7397 + }, + { + "epoch": 0.9411016410125939, + "ewc_loss": 0.0072149415500462055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.214941433630884e-05, + "grad_norm": 3.6041765213012695, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8910890817642212, + "num_tokens": 282196363.0, + "step": 7398 + }, + { + "epoch": 0.9412288512911843, + "ewc_loss": 0.007234171032905579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.234171062009409e-05, + "grad_norm": 3.6951444149017334, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8498967289924622, + "num_tokens": 282233202.0, + "step": 7399 + }, + { + "epoch": 0.9413560615697748, + "ewc_loss": 0.007285873871296644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.285874016815796e-05, + "grad_norm": 3.6063456535339355, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8695722222328186, + "num_tokens": 282270976.0, + "step": 7400 + }, + { + "epoch": 0.9414832718483653, + "ewc_loss": 0.007220377214252949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.22037730156444e-05, + "grad_norm": 3.592452049255371, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8791943788528442, + "num_tokens": 282312223.0, + "step": 7401 + }, + { + "epoch": 0.9416104821269559, + "ewc_loss": 0.0072438218630850315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.243821892188862e-05, + "grad_norm": 3.6514222621917725, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8524119853973389, + "num_tokens": 282350418.0, + "step": 7402 + }, + { + "epoch": 0.9417376924055464, + 
"ewc_loss": 0.00727757578715682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.27757578715682e-05, + "grad_norm": 3.7581167221069336, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8698593378067017, + "num_tokens": 282380751.0, + "step": 7403 + }, + { + "epoch": 0.9418649026841369, + "ewc_loss": 0.007322005927562714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.32200569473207e-05, + "grad_norm": 3.596442461013794, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8807607889175415, + "num_tokens": 282420248.0, + "step": 7404 + }, + { + "epoch": 0.9419921129627273, + "ewc_loss": 0.007200692314654589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.200692198239267e-05, + "grad_norm": 3.644737720489502, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8721407651901245, + "num_tokens": 282457951.0, + "step": 7405 + }, + { + "epoch": 0.9421193232413179, + "ewc_loss": 0.007296059746295214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.296059629879892e-05, + "grad_norm": 3.601483106613159, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8659541606903076, + "num_tokens": 282507379.0, + "step": 7406 + }, + { + "epoch": 0.9422465335199084, + "ewc_loss": 0.007248870097100735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.248869951581582e-05, + "grad_norm": 3.632641315460205, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8561466932296753, + "num_tokens": 282546905.0, + "step": 7407 + }, + { + "epoch": 0.9423737437984989, + "ewc_loss": 0.0072695184499025345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.269518391694874e-05, + "grad_norm": 3.609238862991333, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8392782211303711, + "num_tokens": 282591368.0, + "step": 7408 + }, + { + "epoch": 0.9425009540770894, + "ewc_loss": 0.00725831463932991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.258314872160554e-05, + "grad_norm": 
3.79958438873291, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8661197423934937, + "num_tokens": 282617331.0, + "step": 7409 + }, + { + "epoch": 0.94262816435568, + "ewc_loss": 0.007382556330412626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.382556213997304e-05, + "grad_norm": 3.6437900066375732, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8818551301956177, + "num_tokens": 282652346.0, + "step": 7410 + }, + { + "epoch": 0.9427553746342705, + "ewc_loss": 0.0072348290123045444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.234828808577731e-05, + "grad_norm": 3.6207895278930664, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8712396621704102, + "num_tokens": 282691564.0, + "step": 7411 + }, + { + "epoch": 0.9428825849128609, + "ewc_loss": 0.007285670842975378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.285671017598361e-05, + "grad_norm": 3.6671323776245117, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8779166340827942, + "num_tokens": 282728186.0, + "step": 7412 + }, + { + "epoch": 0.9430097951914514, + "ewc_loss": 0.0073309834115207195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.330983498832211e-05, + "grad_norm": 3.5965576171875, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8621132373809814, + "num_tokens": 282770491.0, + "step": 7413 + }, + { + "epoch": 0.943137005470042, + "ewc_loss": 0.007273796480149031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.273796654772013e-05, + "grad_norm": 3.6166932582855225, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8626316785812378, + "num_tokens": 282809824.0, + "step": 7414 + }, + { + "epoch": 0.9432642157486325, + "ewc_loss": 0.0073185451328754425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.318545249290764e-05, + "grad_norm": 3.6290764808654785, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8717590570449829, + 
"num_tokens": 282846735.0, + "step": 7415 + }, + { + "epoch": 0.943391426027223, + "ewc_loss": 0.0073221358470618725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.322135934373364e-05, + "grad_norm": 3.6354992389678955, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8877708911895752, + "num_tokens": 282882242.0, + "step": 7416 + }, + { + "epoch": 0.9435186363058136, + "ewc_loss": 0.0073122549802064896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.312255183933303e-05, + "grad_norm": 3.677506685256958, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8551511764526367, + "num_tokens": 282918774.0, + "step": 7417 + }, + { + "epoch": 0.943645846584404, + "ewc_loss": 0.007362291216850281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.362291216850281e-05, + "grad_norm": 3.65427827835083, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8608828186988831, + "num_tokens": 282957766.0, + "step": 7418 + }, + { + "epoch": 0.9437730568629945, + "ewc_loss": 0.007303307764232159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.303307938855141e-05, + "grad_norm": 3.637425184249878, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8569545149803162, + "num_tokens": 282995980.0, + "step": 7419 + }, + { + "epoch": 0.943900267141585, + "ewc_loss": 0.007323261350393295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.323261525016278e-05, + "grad_norm": 3.6584246158599854, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.869403064250946, + "num_tokens": 283033148.0, + "step": 7420 + }, + { + "epoch": 0.9440274774201756, + "ewc_loss": 0.007342780940234661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.342780736507848e-05, + "grad_norm": 3.631441354751587, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8726682066917419, + "num_tokens": 283069781.0, + "step": 7421 + }, + { + "epoch": 0.9441546876987661, + "ewc_loss": 0.007296604570001364, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.296604599105194e-05, + "grad_norm": 3.65061092376709, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8614261150360107, + "num_tokens": 283107270.0, + "step": 7422 + }, + { + "epoch": 0.9442818979773566, + "ewc_loss": 0.007330597843974829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.330597873078659e-05, + "grad_norm": 3.641772985458374, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8807796239852905, + "num_tokens": 283143571.0, + "step": 7423 + }, + { + "epoch": 0.944409108255947, + "ewc_loss": 0.007301638834178448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.301638834178448e-05, + "grad_norm": 3.6170923709869385, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.872685432434082, + "num_tokens": 283179995.0, + "step": 7424 + }, + { + "epoch": 0.9445363185345376, + "ewc_loss": 0.007297269068658352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.297268894035369e-05, + "grad_norm": 3.6095097064971924, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8681716918945312, + "num_tokens": 283218370.0, + "step": 7425 + }, + { + "epoch": 0.9446635288131281, + "ewc_loss": 0.007306265644729137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.306265615625307e-05, + "grad_norm": 3.8179359436035156, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8634573817253113, + "num_tokens": 283247879.0, + "step": 7426 + }, + { + "epoch": 0.9447907390917186, + "ewc_loss": 0.0074258241802453995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.425824151141569e-05, + "grad_norm": 3.639183759689331, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.882915735244751, + "num_tokens": 283285027.0, + "step": 7427 + }, + { + "epoch": 0.9449179493703092, + "ewc_loss": 0.007260054349899292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.260054553626105e-05, + "grad_norm": 3.598906993865967, + "learning_rate": 
1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8718241453170776, + "num_tokens": 283324934.0, + "step": 7428 + }, + { + "epoch": 0.9450451596488997, + "ewc_loss": 0.007303270976990461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.303270831471309e-05, + "grad_norm": 3.661777973175049, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8762206435203552, + "num_tokens": 283357540.0, + "step": 7429 + }, + { + "epoch": 0.9451723699274901, + "ewc_loss": 0.007359655108302832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.359655137406662e-05, + "grad_norm": 3.6938252449035645, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8658443093299866, + "num_tokens": 283393470.0, + "step": 7430 + }, + { + "epoch": 0.9452995802060806, + "ewc_loss": 0.007335402071475983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.335402187891304e-05, + "grad_norm": 3.6325325965881348, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8680580258369446, + "num_tokens": 283429916.0, + "step": 7431 + }, + { + "epoch": 0.9454267904846712, + "ewc_loss": 0.007291821297258139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.29182138456963e-05, + "grad_norm": 3.6147491931915283, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8583354353904724, + "num_tokens": 283467997.0, + "step": 7432 + }, + { + "epoch": 0.9455540007632617, + "ewc_loss": 0.007304484955966473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.304485188797116e-05, + "grad_norm": 3.769191265106201, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8355141282081604, + "num_tokens": 283504741.0, + "step": 7433 + }, + { + "epoch": 0.9456812110418522, + "ewc_loss": 0.007391723804175854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.391723920591176e-05, + "grad_norm": 3.6892173290252686, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.877041220664978, + "num_tokens": 283534535.0, + "step": 7434 + }, + 
{ + "epoch": 0.9458084213204427, + "ewc_loss": 0.007308379281312227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.308379281312227e-05, + "grad_norm": 3.6615777015686035, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8788191080093384, + "num_tokens": 283571600.0, + "step": 7435 + }, + { + "epoch": 0.9459356315990332, + "ewc_loss": 0.007327353581786156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.327353523578495e-05, + "grad_norm": 3.682070016860962, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8552095890045166, + "num_tokens": 283605662.0, + "step": 7436 + }, + { + "epoch": 0.9460628418776237, + "ewc_loss": 0.007352878339588642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.352878310484812e-05, + "grad_norm": 3.671173572540283, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.867484986782074, + "num_tokens": 283642868.0, + "step": 7437 + }, + { + "epoch": 0.9461900521562142, + "ewc_loss": 0.007328723557293415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.328723586397246e-05, + "grad_norm": 3.6678435802459717, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.858993649482727, + "num_tokens": 283675604.0, + "step": 7438 + }, + { + "epoch": 0.9463172624348047, + "ewc_loss": 0.0073621333576738834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.362133328570053e-05, + "grad_norm": 3.628359794616699, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8878815174102783, + "num_tokens": 283709459.0, + "step": 7439 + }, + { + "epoch": 0.9464444727133953, + "ewc_loss": 0.007325821090489626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.325821206904948e-05, + "grad_norm": 3.599818229675293, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8679261207580566, + "num_tokens": 283748020.0, + "step": 7440 + }, + { + "epoch": 0.9465716829919858, + "ewc_loss": 0.007332876790314913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.332876703003421e-05, + "grad_norm": 3.7333290576934814, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8618450164794922, + "num_tokens": 283778514.0, + "step": 7441 + }, + { + "epoch": 0.9466988932705762, + "ewc_loss": 0.007419060915708542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.419061148539186e-05, + "grad_norm": 3.616586923599243, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8712782859802246, + "num_tokens": 283819764.0, + "step": 7442 + }, + { + "epoch": 0.9468261035491667, + "ewc_loss": 0.007300696801394224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.300696597667411e-05, + "grad_norm": 3.6583251953125, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8539053201675415, + "num_tokens": 283853897.0, + "step": 7443 + }, + { + "epoch": 0.9469533138277573, + "ewc_loss": 0.0073770759627223015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.377075962722301e-05, + "grad_norm": 3.60998272895813, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.88144451379776, + "num_tokens": 283891826.0, + "step": 7444 + }, + { + "epoch": 0.9470805241063478, + "ewc_loss": 0.007330147549510002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.330147491302341e-05, + "grad_norm": 3.624232530593872, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8656268119812012, + "num_tokens": 283931690.0, + "step": 7445 + }, + { + "epoch": 0.9472077343849383, + "ewc_loss": 0.007367369718849659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.367369835264981e-05, + "grad_norm": 3.6379613876342773, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8671207427978516, + "num_tokens": 283971107.0, + "step": 7446 + }, + { + "epoch": 0.9473349446635289, + "ewc_loss": 0.0073660812340676785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.366081263171509e-05, + "grad_norm": 3.680095911026001, + "learning_rate": 1e-06, + "loss": 0.3596, + 
"mean_token_accuracy": 0.8756625652313232, + "num_tokens": 284005286.0, + "step": 7447 + }, + { + "epoch": 0.9474621549421193, + "ewc_loss": 0.007353431079536676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.35343128326349e-05, + "grad_norm": 3.590869903564453, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8832916021347046, + "num_tokens": 284044089.0, + "step": 7448 + }, + { + "epoch": 0.9475893652207098, + "ewc_loss": 0.007300285622477531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.30028550606221e-05, + "grad_norm": 3.665149450302124, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8647983074188232, + "num_tokens": 284081172.0, + "step": 7449 + }, + { + "epoch": 0.9477165754993003, + "ewc_loss": 0.007377541624009609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.377541624009609e-05, + "grad_norm": 3.7065937519073486, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8747608661651611, + "num_tokens": 284113362.0, + "step": 7450 + }, + { + "epoch": 0.9478437857778909, + "ewc_loss": 0.007362960837781429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.362960604950786e-05, + "grad_norm": 3.5736207962036133, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.877617359161377, + "num_tokens": 284155272.0, + "step": 7451 + }, + { + "epoch": 0.9479709960564814, + "ewc_loss": 0.007282522041350603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.282521983142942e-05, + "grad_norm": 3.6861963272094727, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8756453990936279, + "num_tokens": 284188218.0, + "step": 7452 + }, + { + "epoch": 0.9480982063350719, + "ewc_loss": 0.007388197351247072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.388197263935581e-05, + "grad_norm": 3.625023365020752, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8716280460357666, + "num_tokens": 284228838.0, + "step": 7453 + }, + { + "epoch": 
0.9482254166136623, + "ewc_loss": 0.007320149801671505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.320149597944692e-05, + "grad_norm": 3.6510555744171143, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8647308945655823, + "num_tokens": 284265570.0, + "step": 7454 + }, + { + "epoch": 0.9483526268922529, + "ewc_loss": 0.007352587766945362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.352587999776006e-05, + "grad_norm": 3.6177542209625244, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8586604595184326, + "num_tokens": 284307062.0, + "step": 7455 + }, + { + "epoch": 0.9484798371708434, + "ewc_loss": 0.007311408873647451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.311408990062773e-05, + "grad_norm": 3.661128520965576, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8681922554969788, + "num_tokens": 284340488.0, + "step": 7456 + }, + { + "epoch": 0.9486070474494339, + "ewc_loss": 0.007341473363339901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.341473246924579e-05, + "grad_norm": 3.6574416160583496, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.868166983127594, + "num_tokens": 284382399.0, + "step": 7457 + }, + { + "epoch": 0.9487342577280244, + "ewc_loss": 0.007316801697015762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.316801929846406e-05, + "grad_norm": 3.7105579376220703, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8565305471420288, + "num_tokens": 284416391.0, + "step": 7458 + }, + { + "epoch": 0.948861468006615, + "ewc_loss": 0.007352782413363457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.352782267844304e-05, + "grad_norm": 3.616584300994873, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8676173686981201, + "num_tokens": 284457049.0, + "step": 7459 + }, + { + "epoch": 0.9489886782852054, + "ewc_loss": 0.007268521469086409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.26852158550173e-05, + "grad_norm": 3.644404411315918, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8658627867698669, + "num_tokens": 284494426.0, + "step": 7460 + }, + { + "epoch": 0.9491158885637959, + "ewc_loss": 0.007340834941715002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.340835145441815e-05, + "grad_norm": 3.6253395080566406, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8872986435890198, + "num_tokens": 284533349.0, + "step": 7461 + }, + { + "epoch": 0.9492430988423864, + "ewc_loss": 0.007307928986847401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.307928899535909e-05, + "grad_norm": 3.642781972885132, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8761123418807983, + "num_tokens": 284567777.0, + "step": 7462 + }, + { + "epoch": 0.949370309120977, + "ewc_loss": 0.007316428702324629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.316428673220798e-05, + "grad_norm": 3.646242380142212, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8633009195327759, + "num_tokens": 284609120.0, + "step": 7463 + }, + { + "epoch": 0.9494975193995675, + "ewc_loss": 0.007321187295019627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.321187149500474e-05, + "grad_norm": 3.6360743045806885, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8607966899871826, + "num_tokens": 284649221.0, + "step": 7464 + }, + { + "epoch": 0.949624729678158, + "ewc_loss": 0.007290946785360575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.290946814464405e-05, + "grad_norm": 3.6601974964141846, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8726186752319336, + "num_tokens": 284682595.0, + "step": 7465 + }, + { + "epoch": 0.9497519399567486, + "ewc_loss": 0.007314817048609257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.314817048609257e-05, + "grad_norm": 3.6101932525634766, + "learning_rate": 1e-06, + "loss": 0.4025, + 
"mean_token_accuracy": 0.8635658025741577, + "num_tokens": 284722903.0, + "step": 7466 + }, + { + "epoch": 0.949879150235339, + "ewc_loss": 0.007267741020768881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.267740875249729e-05, + "grad_norm": 3.6565237045288086, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.874055802822113, + "num_tokens": 284759166.0, + "step": 7467 + }, + { + "epoch": 0.9500063605139295, + "ewc_loss": 0.007305079139769077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.305078906938434e-05, + "grad_norm": 3.5803937911987305, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8635249137878418, + "num_tokens": 284803192.0, + "step": 7468 + }, + { + "epoch": 0.95013357079252, + "ewc_loss": 0.007247245870530605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.247245957842097e-05, + "grad_norm": 3.7258312702178955, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8704938888549805, + "num_tokens": 284837088.0, + "step": 7469 + }, + { + "epoch": 0.9502607810711106, + "ewc_loss": 0.007363790646195412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.363790791714564e-05, + "grad_norm": 3.6176838874816895, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.849830150604248, + "num_tokens": 284879641.0, + "step": 7470 + }, + { + "epoch": 0.9503879913497011, + "ewc_loss": 0.007242527324706316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.242527499329299e-05, + "grad_norm": 3.635948657989502, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8562880754470825, + "num_tokens": 284917519.0, + "step": 7471 + }, + { + "epoch": 0.9505152016282916, + "ewc_loss": 0.00729680759832263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.29680759832263e-05, + "grad_norm": 3.743295669555664, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8590608835220337, + "num_tokens": 284950422.0, + "step": 7472 + }, + { + "epoch": 0.950642411906882, 
+ "ewc_loss": 0.007356968708336353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.356968853855506e-05, + "grad_norm": 3.618462324142456, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8754582405090332, + "num_tokens": 284986392.0, + "step": 7473 + }, + { + "epoch": 0.9507696221854726, + "ewc_loss": 0.00725396815687418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.253968215081841e-05, + "grad_norm": 3.622856616973877, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8697205185890198, + "num_tokens": 285027684.0, + "step": 7474 + }, + { + "epoch": 0.9508968324640631, + "ewc_loss": 0.007307941094040871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.307941268663853e-05, + "grad_norm": 3.711591958999634, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8603211641311646, + "num_tokens": 285059551.0, + "step": 7475 + }, + { + "epoch": 0.9510240427426536, + "ewc_loss": 0.007353486027568579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.353485852945596e-05, + "grad_norm": 3.6146199703216553, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8691287636756897, + "num_tokens": 285100197.0, + "step": 7476 + }, + { + "epoch": 0.9511512530212441, + "ewc_loss": 0.0072600035928189754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.260003621922806e-05, + "grad_norm": 3.733006477355957, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8710356950759888, + "num_tokens": 285130115.0, + "step": 7477 + }, + { + "epoch": 0.9512784632998347, + "ewc_loss": 0.007381100207567215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.381100294878706e-05, + "grad_norm": 3.6009902954101562, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8679072260856628, + "num_tokens": 285169069.0, + "step": 7478 + }, + { + "epoch": 0.9514056735784251, + "ewc_loss": 0.007266285829246044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.266285683726892e-05, + "grad_norm": 
3.661324977874756, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8470857739448547, + "num_tokens": 285209971.0, + "step": 7479 + }, + { + "epoch": 0.9515328838570156, + "ewc_loss": 0.007344200275838375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.344200275838375e-05, + "grad_norm": 3.615666627883911, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8630040884017944, + "num_tokens": 285250024.0, + "step": 7480 + }, + { + "epoch": 0.9516600941356061, + "ewc_loss": 0.007298455573618412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.298455602722242e-05, + "grad_norm": 3.673861026763916, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8653867244720459, + "num_tokens": 285284900.0, + "step": 7481 + }, + { + "epoch": 0.9517873044141967, + "ewc_loss": 0.007353861816227436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.353862019954249e-05, + "grad_norm": 3.6414315700531006, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8731594085693359, + "num_tokens": 285325657.0, + "step": 7482 + }, + { + "epoch": 0.9519145146927872, + "ewc_loss": 0.007320890203118324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.320890290429816e-05, + "grad_norm": 3.711782217025757, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8686399459838867, + "num_tokens": 285356362.0, + "step": 7483 + }, + { + "epoch": 0.9520417249713777, + "ewc_loss": 0.007355200592428446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.355200796155259e-05, + "grad_norm": 3.6971638202667236, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8675189018249512, + "num_tokens": 285394321.0, + "step": 7484 + }, + { + "epoch": 0.9521689352499682, + "ewc_loss": 0.007312071975320578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.312071829801425e-05, + "grad_norm": 3.5814197063446045, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8871818780899048, + 
"num_tokens": 285435260.0, + "step": 7485 + }, + { + "epoch": 0.9522961455285587, + "ewc_loss": 0.007263615261763334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.263615407282487e-05, + "grad_norm": 3.605010986328125, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8744250535964966, + "num_tokens": 285479449.0, + "step": 7486 + }, + { + "epoch": 0.9524233558071492, + "ewc_loss": 0.007300151512026787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.300151628442109e-05, + "grad_norm": 3.6387600898742676, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8491121530532837, + "num_tokens": 285520492.0, + "step": 7487 + }, + { + "epoch": 0.9525505660857397, + "ewc_loss": 0.007300518453121185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.300518336705863e-05, + "grad_norm": 3.6549036502838135, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.864439070224762, + "num_tokens": 285558410.0, + "step": 7488 + }, + { + "epoch": 0.9526777763643303, + "ewc_loss": 0.007309178356081247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.309178181458265e-05, + "grad_norm": 3.6473608016967773, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8490572571754456, + "num_tokens": 285598871.0, + "step": 7489 + }, + { + "epoch": 0.9528049866429208, + "ewc_loss": 0.007284734398126602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.284734601853415e-05, + "grad_norm": 3.6243181228637695, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8611587285995483, + "num_tokens": 285641374.0, + "step": 7490 + }, + { + "epoch": 0.9529321969215112, + "ewc_loss": 0.007292901165783405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.292901136679575e-05, + "grad_norm": 3.634397268295288, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8816318511962891, + "num_tokens": 285675416.0, + "step": 7491 + }, + { + "epoch": 0.9530594072001017, + "ewc_loss": 0.0072919209487736225, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.291921065188944e-05, + "grad_norm": 3.6962897777557373, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8738874197006226, + "num_tokens": 285707491.0, + "step": 7492 + }, + { + "epoch": 0.9531866174786923, + "ewc_loss": 0.007326281629502773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.326281775021926e-05, + "grad_norm": 3.671510934829712, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8565880060195923, + "num_tokens": 285747571.0, + "step": 7493 + }, + { + "epoch": 0.9533138277572828, + "ewc_loss": 0.007287631276994944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.287631160579622e-05, + "grad_norm": 3.593526840209961, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8904009461402893, + "num_tokens": 285786362.0, + "step": 7494 + }, + { + "epoch": 0.9534410380358733, + "ewc_loss": 0.00725469458848238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.254694355651736e-05, + "grad_norm": 3.6385891437530518, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8753728866577148, + "num_tokens": 285829401.0, + "step": 7495 + }, + { + "epoch": 0.9535682483144639, + "ewc_loss": 0.007307190448045731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.30719038983807e-05, + "grad_norm": 3.6092422008514404, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8640778064727783, + "num_tokens": 285867853.0, + "step": 7496 + }, + { + "epoch": 0.9536954585930543, + "ewc_loss": 0.007263957988470793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.263958104886115e-05, + "grad_norm": 3.621518611907959, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8781161904335022, + "num_tokens": 285908497.0, + "step": 7497 + }, + { + "epoch": 0.9538226688716448, + "ewc_loss": 0.007286753039807081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.28675295249559e-05, + "grad_norm": 3.7454755306243896, + 
"learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8555184602737427, + "num_tokens": 285940528.0, + "step": 7498 + }, + { + "epoch": 0.9539498791502353, + "ewc_loss": 0.007335918955504894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.335918780881912e-05, + "grad_norm": 3.637789487838745, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8691474199295044, + "num_tokens": 285980206.0, + "step": 7499 + }, + { + "epoch": 0.9540770894288259, + "ewc_loss": 0.007232338190078735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.232338248286396e-05, + "grad_norm": 3.630794048309326, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8624589443206787, + "num_tokens": 286019409.0, + "step": 7500 + }, + { + "epoch": 0.9542042997074164, + "ewc_loss": 0.007268495392054319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.26849539205432e-05, + "grad_norm": 3.6625919342041016, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8783955574035645, + "num_tokens": 286050955.0, + "step": 7501 + }, + { + "epoch": 0.9543315099860069, + "ewc_loss": 0.007302574347704649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.302574522327632e-05, + "grad_norm": 3.6859476566314697, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8723664879798889, + "num_tokens": 286087567.0, + "step": 7502 + }, + { + "epoch": 0.9544587202645973, + "ewc_loss": 0.00730412220582366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.304122118512169e-05, + "grad_norm": 3.7308127880096436, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8475390672683716, + "num_tokens": 286126893.0, + "step": 7503 + }, + { + "epoch": 0.9545859305431879, + "ewc_loss": 0.007309664972126484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.309664943022653e-05, + "grad_norm": 3.628527879714966, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8622009754180908, + "num_tokens": 286165818.0, + 
"step": 7504 + }, + { + "epoch": 0.9547131408217784, + "ewc_loss": 0.007229553535580635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.229553739307448e-05, + "grad_norm": 3.6048800945281982, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8712321519851685, + "num_tokens": 286208026.0, + "step": 7505 + }, + { + "epoch": 0.9548403511003689, + "ewc_loss": 0.007246419321745634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.246419409057125e-05, + "grad_norm": 3.647165298461914, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8678706884384155, + "num_tokens": 286245468.0, + "step": 7506 + }, + { + "epoch": 0.9549675613789594, + "ewc_loss": 0.007283164653927088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.283164450200275e-05, + "grad_norm": 3.6623103618621826, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8714420795440674, + "num_tokens": 286283955.0, + "step": 7507 + }, + { + "epoch": 0.95509477165755, + "ewc_loss": 0.007277394644916058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.277394615812227e-05, + "grad_norm": 3.615842580795288, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8733142018318176, + "num_tokens": 286326560.0, + "step": 7508 + }, + { + "epoch": 0.9552219819361404, + "ewc_loss": 0.007237522397190332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.237522368086502e-05, + "grad_norm": 3.6158769130706787, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8684182167053223, + "num_tokens": 286365791.0, + "step": 7509 + }, + { + "epoch": 0.9553491922147309, + "ewc_loss": 0.0072455089539289474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.245509186759591e-05, + "grad_norm": 3.6186177730560303, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8784782290458679, + "num_tokens": 286404132.0, + "step": 7510 + }, + { + "epoch": 0.9554764024933214, + "ewc_loss": 0.007244931999593973, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.244932203320786e-05, + "grad_norm": 3.6072778701782227, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8570314645767212, + "num_tokens": 286444828.0, + "step": 7511 + }, + { + "epoch": 0.955603612771912, + "ewc_loss": 0.007235146593302488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.235146767925471e-05, + "grad_norm": 3.6621010303497314, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8735097646713257, + "num_tokens": 286481725.0, + "step": 7512 + }, + { + "epoch": 0.9557308230505025, + "ewc_loss": 0.007283281534910202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.283281593117863e-05, + "grad_norm": 3.616262435913086, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8678926825523376, + "num_tokens": 286520354.0, + "step": 7513 + }, + { + "epoch": 0.955858033329093, + "ewc_loss": 0.007213654462248087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.213654316728935e-05, + "grad_norm": 3.6058638095855713, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.866152822971344, + "num_tokens": 286564314.0, + "step": 7514 + }, + { + "epoch": 0.9559852436076836, + "ewc_loss": 0.0072279637679457664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.227963942568749e-05, + "grad_norm": 3.602933645248413, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8678290247917175, + "num_tokens": 286606955.0, + "step": 7515 + }, + { + "epoch": 0.956112453886274, + "ewc_loss": 0.007218134123831987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.218134123831987e-05, + "grad_norm": 3.6662473678588867, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8747307062149048, + "num_tokens": 286640091.0, + "step": 7516 + }, + { + "epoch": 0.9562396641648645, + "ewc_loss": 0.007259122561663389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.259122503455728e-05, + "grad_norm": 3.6416120529174805, + "learning_rate": 1e-06, + "loss": 
0.3912, + "mean_token_accuracy": 0.8671566843986511, + "num_tokens": 286679090.0, + "step": 7517 + }, + { + "epoch": 0.956366874443455, + "ewc_loss": 0.00722576305270195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.225762965390459e-05, + "grad_norm": 3.6052498817443848, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8730998635292053, + "num_tokens": 286720260.0, + "step": 7518 + }, + { + "epoch": 0.9564940847220456, + "ewc_loss": 0.007204716559499502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.204716530395672e-05, + "grad_norm": 3.5856921672821045, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8734196424484253, + "num_tokens": 286763992.0, + "step": 7519 + }, + { + "epoch": 0.9566212950006361, + "ewc_loss": 0.007214098237454891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.2140981501434e-05, + "grad_norm": 3.634722948074341, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8504819273948669, + "num_tokens": 286807068.0, + "step": 7520 + }, + { + "epoch": 0.9567485052792266, + "ewc_loss": 0.007224642671644688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.224642467917874e-05, + "grad_norm": 3.5912256240844727, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8744625449180603, + "num_tokens": 286853020.0, + "step": 7521 + }, + { + "epoch": 0.956875715557817, + "ewc_loss": 0.007162940222769976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.162940164562315e-05, + "grad_norm": 3.6126623153686523, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8584285974502563, + "num_tokens": 286896098.0, + "step": 7522 + }, + { + "epoch": 0.9570029258364076, + "ewc_loss": 0.007208582945168018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.208582974271849e-05, + "grad_norm": 3.6253912448883057, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8522372245788574, + "num_tokens": 286938873.0, + "step": 7523 + }, + { + "epoch": 
0.9571301361149981, + "ewc_loss": 0.007189922966063023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.189923053374514e-05, + "grad_norm": 3.638212203979492, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8724379539489746, + "num_tokens": 286972941.0, + "step": 7524 + }, + { + "epoch": 0.9572573463935886, + "ewc_loss": 0.007186222355812788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.18622250133194e-05, + "grad_norm": 3.657139778137207, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8573231101036072, + "num_tokens": 287010818.0, + "step": 7525 + }, + { + "epoch": 0.9573845566721791, + "ewc_loss": 0.007220960687845945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.220960833365098e-05, + "grad_norm": 3.6285765171051025, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.871690034866333, + "num_tokens": 287053841.0, + "step": 7526 + }, + { + "epoch": 0.9575117669507697, + "ewc_loss": 0.007176769431680441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.176769577199593e-05, + "grad_norm": 3.644747734069824, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8731756210327148, + "num_tokens": 287090900.0, + "step": 7527 + }, + { + "epoch": 0.9576389772293601, + "ewc_loss": 0.00720457686111331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.20457683200948e-05, + "grad_norm": 3.5835378170013428, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8676111102104187, + "num_tokens": 287131203.0, + "step": 7528 + }, + { + "epoch": 0.9577661875079506, + "ewc_loss": 0.007176794111728668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.176794315455481e-05, + "grad_norm": 3.6608195304870605, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8791792392730713, + "num_tokens": 287165949.0, + "step": 7529 + }, + { + "epoch": 0.9578933977865411, + "ewc_loss": 0.007230428978800774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.230429037008435e-05, + "grad_norm": 3.682131052017212, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8665270209312439, + "num_tokens": 287203376.0, + "step": 7530 + }, + { + "epoch": 0.9580206080651317, + "ewc_loss": 0.007214847020804882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.214846846181899e-05, + "grad_norm": 3.644674062728882, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8591909408569336, + "num_tokens": 287240391.0, + "step": 7531 + }, + { + "epoch": 0.9581478183437222, + "ewc_loss": 0.007207357790321112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.20735770300962e-05, + "grad_norm": 3.630561590194702, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8660137057304382, + "num_tokens": 287278485.0, + "step": 7532 + }, + { + "epoch": 0.9582750286223127, + "ewc_loss": 0.007212077733129263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.212077616713941e-05, + "grad_norm": 3.6837613582611084, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8620792031288147, + "num_tokens": 287313626.0, + "step": 7533 + }, + { + "epoch": 0.9584022389009031, + "ewc_loss": 0.007259043399244547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.259043195517734e-05, + "grad_norm": 3.6450142860412598, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.883277416229248, + "num_tokens": 287344532.0, + "step": 7534 + }, + { + "epoch": 0.9585294491794937, + "ewc_loss": 0.007214220240712166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.214220386231318e-05, + "grad_norm": 3.6585023403167725, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8494020700454712, + "num_tokens": 287383069.0, + "step": 7535 + }, + { + "epoch": 0.9586566594580842, + "ewc_loss": 0.007254011929035187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.254011870827526e-05, + "grad_norm": 3.6291089057922363, + "learning_rate": 1e-06, + "loss": 0.3871, + 
"mean_token_accuracy": 0.869174599647522, + "num_tokens": 287418554.0, + "step": 7536 + }, + { + "epoch": 0.9587838697366747, + "ewc_loss": 0.007230756338685751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.230756455101073e-05, + "grad_norm": 3.6587231159210205, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8665869235992432, + "num_tokens": 287452449.0, + "step": 7537 + }, + { + "epoch": 0.9589110800152653, + "ewc_loss": 0.007277269382029772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.277269469341263e-05, + "grad_norm": 3.6098344326019287, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8542219996452332, + "num_tokens": 287492247.0, + "step": 7538 + }, + { + "epoch": 0.9590382902938558, + "ewc_loss": 0.007248484995216131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.248485053423792e-05, + "grad_norm": 3.5956172943115234, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8743406534194946, + "num_tokens": 287528651.0, + "step": 7539 + }, + { + "epoch": 0.9591655005724462, + "ewc_loss": 0.007277865894138813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.277866097865626e-05, + "grad_norm": 3.657365560531616, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8580455780029297, + "num_tokens": 287565837.0, + "step": 7540 + }, + { + "epoch": 0.9592927108510367, + "ewc_loss": 0.007320019882172346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.320020085899159e-05, + "grad_norm": 3.640723943710327, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8672442436218262, + "num_tokens": 287599469.0, + "step": 7541 + }, + { + "epoch": 0.9594199211296273, + "ewc_loss": 0.007298658601939678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.298658601939678e-05, + "grad_norm": 3.6730849742889404, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8665119409561157, + "num_tokens": 287637104.0, + "step": 7542 + }, + { + "epoch": 
0.9595471314082178, + "ewc_loss": 0.007331825327128172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.331825327128172e-05, + "grad_norm": 3.613619327545166, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8677026629447937, + "num_tokens": 287679134.0, + "step": 7543 + }, + { + "epoch": 0.9596743416868083, + "ewc_loss": 0.007298974320292473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.298974378500134e-05, + "grad_norm": 3.5827927589416504, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8748445510864258, + "num_tokens": 287720340.0, + "step": 7544 + }, + { + "epoch": 0.9598015519653988, + "ewc_loss": 0.007313587237149477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.31358741177246e-05, + "grad_norm": 3.58972430229187, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8715496063232422, + "num_tokens": 287760931.0, + "step": 7545 + }, + { + "epoch": 0.9599287622439893, + "ewc_loss": 0.007319830358028412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.319830183405429e-05, + "grad_norm": 3.6387102603912354, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8552781343460083, + "num_tokens": 287801723.0, + "step": 7546 + }, + { + "epoch": 0.9600559725225798, + "ewc_loss": 0.00732317054644227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.323170575546101e-05, + "grad_norm": 3.710970163345337, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8651562333106995, + "num_tokens": 287837010.0, + "step": 7547 + }, + { + "epoch": 0.9601831828011703, + "ewc_loss": 0.007353662047535181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.353661931119859e-05, + "grad_norm": 3.6275994777679443, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8699031472206116, + "num_tokens": 287874381.0, + "step": 7548 + }, + { + "epoch": 0.9603103930797608, + "ewc_loss": 0.007288270629942417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.288270717253909e-05, + "grad_norm": 3.6271214485168457, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8606626987457275, + "num_tokens": 287916673.0, + "step": 7549 + }, + { + "epoch": 0.9604376033583514, + "ewc_loss": 0.007319245021790266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.319245196413249e-05, + "grad_norm": 3.6448280811309814, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8633394241333008, + "num_tokens": 287956764.0, + "step": 7550 + }, + { + "epoch": 0.9605648136369419, + "ewc_loss": 0.007308679632842541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.308679778361693e-05, + "grad_norm": 3.6084952354431152, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8755561113357544, + "num_tokens": 287996373.0, + "step": 7551 + }, + { + "epoch": 0.9606920239155323, + "ewc_loss": 0.007299015298485756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.299015123862773e-05, + "grad_norm": 3.697176218032837, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8624439239501953, + "num_tokens": 288033671.0, + "step": 7552 + }, + { + "epoch": 0.9608192341941229, + "ewc_loss": 0.007359644863754511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.359644951066002e-05, + "grad_norm": 3.6550376415252686, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8769135475158691, + "num_tokens": 288068254.0, + "step": 7553 + }, + { + "epoch": 0.9609464444727134, + "ewc_loss": 0.00729990191757679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.299902063095942e-05, + "grad_norm": 3.6651833057403564, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8536141514778137, + "num_tokens": 288102833.0, + "step": 7554 + }, + { + "epoch": 0.9610736547513039, + "ewc_loss": 0.007316453382372856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.316453411476687e-05, + "grad_norm": 3.6238996982574463, + "learning_rate": 1e-06, + "loss": 0.3499, + 
"mean_token_accuracy": 0.8807860612869263, + "num_tokens": 288135927.0, + "step": 7555 + }, + { + "epoch": 0.9612008650298944, + "ewc_loss": 0.007303227670490742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.303227903321385e-05, + "grad_norm": 3.605807065963745, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8880581259727478, + "num_tokens": 288173842.0, + "step": 7556 + }, + { + "epoch": 0.961328075308485, + "ewc_loss": 0.007303494960069656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.303494930965826e-05, + "grad_norm": 3.6178054809570312, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8728440403938293, + "num_tokens": 288215007.0, + "step": 7557 + }, + { + "epoch": 0.9614552855870754, + "ewc_loss": 0.0073151905089616776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.315190305234864e-05, + "grad_norm": 3.6730663776397705, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8573703765869141, + "num_tokens": 288251598.0, + "step": 7558 + }, + { + "epoch": 0.9615824958656659, + "ewc_loss": 0.0073392800986766815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.339280273299664e-05, + "grad_norm": 3.617546319961548, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8686673641204834, + "num_tokens": 288292811.0, + "step": 7559 + }, + { + "epoch": 0.9617097061442564, + "ewc_loss": 0.007277334108948708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.27733422536403e-05, + "grad_norm": 3.6247031688690186, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8727000951766968, + "num_tokens": 288337790.0, + "step": 7560 + }, + { + "epoch": 0.961836916422847, + "ewc_loss": 0.007304726634174585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.304726750589907e-05, + "grad_norm": 3.6427364349365234, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8653774261474609, + "num_tokens": 288373347.0, + "step": 7561 + }, + { + "epoch": 
0.9619641267014375, + "ewc_loss": 0.007308815605938435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.308815838769078e-05, + "grad_norm": 3.639130115509033, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8891201019287109, + "num_tokens": 288405650.0, + "step": 7562 + }, + { + "epoch": 0.962091336980028, + "ewc_loss": 0.007288745138794184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.288745109690353e-05, + "grad_norm": 3.617938280105591, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8751863837242126, + "num_tokens": 288443849.0, + "step": 7563 + }, + { + "epoch": 0.9622185472586186, + "ewc_loss": 0.0072823939844965935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.282393926288933e-05, + "grad_norm": 3.638256549835205, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8480179309844971, + "num_tokens": 288487557.0, + "step": 7564 + }, + { + "epoch": 0.962345757537209, + "ewc_loss": 0.007292300928384066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.292300870176405e-05, + "grad_norm": 3.6150035858154297, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8598802089691162, + "num_tokens": 288529932.0, + "step": 7565 + }, + { + "epoch": 0.9624729678157995, + "ewc_loss": 0.007270838133990765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.270838250406086e-05, + "grad_norm": 3.6570513248443604, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8632071018218994, + "num_tokens": 288568276.0, + "step": 7566 + }, + { + "epoch": 0.96260017809439, + "ewc_loss": 0.007297382224351168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.29738239897415e-05, + "grad_norm": 3.6482505798339844, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8710250854492188, + "num_tokens": 288602387.0, + "step": 7567 + }, + { + "epoch": 0.9627273883729806, + "ewc_loss": 0.007276680786162615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.276680844370276e-05, + "grad_norm": 3.6745412349700928, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8741461634635925, + "num_tokens": 288638157.0, + "step": 7568 + }, + { + "epoch": 0.9628545986515711, + "ewc_loss": 0.007299658376723528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.299658318515867e-05, + "grad_norm": 3.6478707790374756, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8643426299095154, + "num_tokens": 288675054.0, + "step": 7569 + }, + { + "epoch": 0.9629818089301616, + "ewc_loss": 0.007257726043462753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.257726247189566e-05, + "grad_norm": 3.6450047492980957, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.874934732913971, + "num_tokens": 288714222.0, + "step": 7570 + }, + { + "epoch": 0.963109019208752, + "ewc_loss": 0.007266661152243614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.266661123139784e-05, + "grad_norm": 3.6642894744873047, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.858849823474884, + "num_tokens": 288752772.0, + "step": 7571 + }, + { + "epoch": 0.9632362294873426, + "ewc_loss": 0.0072894529439508915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.289453060366213e-05, + "grad_norm": 3.6522510051727295, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8513576984405518, + "num_tokens": 288792067.0, + "step": 7572 + }, + { + "epoch": 0.9633634397659331, + "ewc_loss": 0.007272981107234955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.272981019923463e-05, + "grad_norm": 3.691849946975708, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8808251619338989, + "num_tokens": 288826172.0, + "step": 7573 + }, + { + "epoch": 0.9634906500445236, + "ewc_loss": 0.0073045711033046246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.304571045096964e-05, + "grad_norm": 3.6370434761047363, + "learning_rate": 1e-06, + "loss": 0.3523, + 
"mean_token_accuracy": 0.880276620388031, + "num_tokens": 288863774.0, + "step": 7574 + }, + { + "epoch": 0.9636178603231141, + "ewc_loss": 0.007267038803547621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.26703874533996e-05, + "grad_norm": 3.6393847465515137, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8619664907455444, + "num_tokens": 288901892.0, + "step": 7575 + }, + { + "epoch": 0.9637450706017047, + "ewc_loss": 0.007274406496435404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.274406380020082e-05, + "grad_norm": 3.632232904434204, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8580873012542725, + "num_tokens": 288941470.0, + "step": 7576 + }, + { + "epoch": 0.9638722808802951, + "ewc_loss": 0.007285910192877054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.285910396603867e-05, + "grad_norm": 3.589966297149658, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8544036746025085, + "num_tokens": 288988616.0, + "step": 7577 + }, + { + "epoch": 0.9639994911588856, + "ewc_loss": 0.007263627834618092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.26362777641043e-05, + "grad_norm": 3.6293466091156006, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8831170201301575, + "num_tokens": 289024589.0, + "step": 7578 + }, + { + "epoch": 0.9641267014374761, + "ewc_loss": 0.007294987328350544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.294987153727561e-05, + "grad_norm": 3.5976057052612305, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8709948658943176, + "num_tokens": 289064334.0, + "step": 7579 + }, + { + "epoch": 0.9642539117160667, + "ewc_loss": 0.007264150306582451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.264150190167129e-05, + "grad_norm": 3.6613962650299072, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8589941263198853, + "num_tokens": 289099385.0, + "step": 7580 + }, + { + "epoch": 
0.9643811219946572, + "ewc_loss": 0.007311702705919743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.311702938750386e-05, + "grad_norm": 3.6134626865386963, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8715493083000183, + "num_tokens": 289139143.0, + "step": 7581 + }, + { + "epoch": 0.9645083322732477, + "ewc_loss": 0.0072535565122962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.253556395880878e-05, + "grad_norm": 3.6577255725860596, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8619285225868225, + "num_tokens": 289179303.0, + "step": 7582 + }, + { + "epoch": 0.9646355425518381, + "ewc_loss": 0.007296862080693245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.296862168004736e-05, + "grad_norm": 3.6976113319396973, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8598321676254272, + "num_tokens": 289214218.0, + "step": 7583 + }, + { + "epoch": 0.9647627528304287, + "ewc_loss": 0.007297608070075512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.29760795366019e-05, + "grad_norm": 3.769197940826416, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8656843900680542, + "num_tokens": 289240991.0, + "step": 7584 + }, + { + "epoch": 0.9648899631090192, + "ewc_loss": 0.007334869354963303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.334869587793946e-05, + "grad_norm": 3.6908037662506104, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8620032072067261, + "num_tokens": 289273374.0, + "step": 7585 + }, + { + "epoch": 0.9650171733876097, + "ewc_loss": 0.007287910673767328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.287910557352006e-05, + "grad_norm": 3.5843448638916016, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8541227579116821, + "num_tokens": 289314904.0, + "step": 7586 + }, + { + "epoch": 0.9651443836662003, + "ewc_loss": 0.007271225564181805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.271225331351161e-05, + "grad_norm": 3.6112172603607178, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8618732690811157, + "num_tokens": 289358110.0, + "step": 7587 + }, + { + "epoch": 0.9652715939447908, + "ewc_loss": 0.007333029992878437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.333030225709081e-05, + "grad_norm": 3.6851112842559814, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8573416471481323, + "num_tokens": 289391581.0, + "step": 7588 + }, + { + "epoch": 0.9653988042233812, + "ewc_loss": 0.007369617931544781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.369618106167763e-05, + "grad_norm": 3.646254777908325, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.842366635799408, + "num_tokens": 289432280.0, + "step": 7589 + }, + { + "epoch": 0.9655260145019717, + "ewc_loss": 0.00733110262081027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.331102824537084e-05, + "grad_norm": 3.671579122543335, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8517069220542908, + "num_tokens": 289473222.0, + "step": 7590 + }, + { + "epoch": 0.9656532247805623, + "ewc_loss": 0.007379815448075533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.379815360764042e-05, + "grad_norm": 3.6247811317443848, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8636751174926758, + "num_tokens": 289512898.0, + "step": 7591 + }, + { + "epoch": 0.9657804350591528, + "ewc_loss": 0.007356547750532627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.356547575909644e-05, + "grad_norm": 3.5910558700561523, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8731615543365479, + "num_tokens": 289554394.0, + "step": 7592 + }, + { + "epoch": 0.9659076453377433, + "ewc_loss": 0.007366540376096964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.366540376096964e-05, + "grad_norm": 3.6269474029541016, + "learning_rate": 1e-06, + "loss": 0.4063, + 
"mean_token_accuracy": 0.8632674217224121, + "num_tokens": 289594871.0, + "step": 7593 + }, + { + "epoch": 0.9660348556163338, + "ewc_loss": 0.007373728323727846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.373728294624016e-05, + "grad_norm": 3.6954145431518555, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.857764482498169, + "num_tokens": 289629165.0, + "step": 7594 + }, + { + "epoch": 0.9661620658949243, + "ewc_loss": 0.007419981528073549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.41998155717738e-05, + "grad_norm": 3.5848653316497803, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8668094873428345, + "num_tokens": 289670131.0, + "step": 7595 + }, + { + "epoch": 0.9662892761735148, + "ewc_loss": 0.007331474684178829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.331474625971168e-05, + "grad_norm": 3.6795880794525146, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8716144561767578, + "num_tokens": 289705238.0, + "step": 7596 + }, + { + "epoch": 0.9664164864521053, + "ewc_loss": 0.007433051243424416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.433051359839737e-05, + "grad_norm": 3.674132823944092, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8710442185401917, + "num_tokens": 289740146.0, + "step": 7597 + }, + { + "epoch": 0.9665436967306958, + "ewc_loss": 0.0073773511685431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.377350993920118e-05, + "grad_norm": 3.639246940612793, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8730519413948059, + "num_tokens": 289781145.0, + "step": 7598 + }, + { + "epoch": 0.9666709070092864, + "ewc_loss": 0.007365007419139147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.365007331827655e-05, + "grad_norm": 3.7022054195404053, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8510614633560181, + "num_tokens": 289816347.0, + "step": 7599 + }, + { + "epoch": 
0.9667981172878769, + "ewc_loss": 0.007414048537611961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.414048741338775e-05, + "grad_norm": 3.6637041568756104, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8532084226608276, + "num_tokens": 289852974.0, + "step": 7600 + }, + { + "epoch": 0.9669253275664673, + "ewc_loss": 0.007347136735916138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.34713685233146e-05, + "grad_norm": 3.643885612487793, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8558803796768188, + "num_tokens": 289894553.0, + "step": 7601 + }, + { + "epoch": 0.9670525378450578, + "ewc_loss": 0.007355600129812956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.355600246228278e-05, + "grad_norm": 3.672405481338501, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8489022254943848, + "num_tokens": 289930641.0, + "step": 7602 + }, + { + "epoch": 0.9671797481236484, + "ewc_loss": 0.007387765217572451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.387765072053298e-05, + "grad_norm": 3.6231775283813477, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8625280261039734, + "num_tokens": 289972690.0, + "step": 7603 + }, + { + "epoch": 0.9673069584022389, + "ewc_loss": 0.00732812425121665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.328124047489837e-05, + "grad_norm": 3.6766726970672607, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8615977168083191, + "num_tokens": 290006518.0, + "step": 7604 + }, + { + "epoch": 0.9674341686808294, + "ewc_loss": 0.007392775267362595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.392775296466425e-05, + "grad_norm": 3.6693804264068604, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8530431985855103, + "num_tokens": 290045206.0, + "step": 7605 + }, + { + "epoch": 0.96756137895942, + "ewc_loss": 0.007360898423939943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.360898598562926e-05, + "grad_norm": 3.564868450164795, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8725498914718628, + "num_tokens": 290086976.0, + "step": 7606 + }, + { + "epoch": 0.9676885892380104, + "ewc_loss": 0.00731311459094286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.313114474527538e-05, + "grad_norm": 3.6382479667663574, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.879528284072876, + "num_tokens": 290126894.0, + "step": 7607 + }, + { + "epoch": 0.9678157995166009, + "ewc_loss": 0.007394116371870041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.394116255454719e-05, + "grad_norm": 3.6359593868255615, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8806182146072388, + "num_tokens": 290161684.0, + "step": 7608 + }, + { + "epoch": 0.9679430097951914, + "ewc_loss": 0.0073423441499471664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.342344179050997e-05, + "grad_norm": 3.622617721557617, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8689477443695068, + "num_tokens": 290203407.0, + "step": 7609 + }, + { + "epoch": 0.968070220073782, + "ewc_loss": 0.007339613512158394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.339613512158394e-05, + "grad_norm": 3.617648124694824, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8751556873321533, + "num_tokens": 290238645.0, + "step": 7610 + }, + { + "epoch": 0.9681974303523725, + "ewc_loss": 0.007323574274778366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.323574391193688e-05, + "grad_norm": 3.6006081104278564, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8665285110473633, + "num_tokens": 290280279.0, + "step": 7611 + }, + { + "epoch": 0.968324640630963, + "ewc_loss": 0.007311100605875254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.311100489459932e-05, + "grad_norm": 3.6697890758514404, + "learning_rate": 1e-06, + "loss": 0.4025, + 
"mean_token_accuracy": 0.8651682138442993, + "num_tokens": 290317739.0, + "step": 7612 + }, + { + "epoch": 0.9684518509095535, + "ewc_loss": 0.0073464238084852695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.34642380848527e-05, + "grad_norm": 3.6088972091674805, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8660627007484436, + "num_tokens": 290360384.0, + "step": 7613 + }, + { + "epoch": 0.968579061188144, + "ewc_loss": 0.007297516334801912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.297516276594251e-05, + "grad_norm": 3.666724443435669, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8755339980125427, + "num_tokens": 290398706.0, + "step": 7614 + }, + { + "epoch": 0.9687062714667345, + "ewc_loss": 0.007328399922698736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.328399806283414e-05, + "grad_norm": 3.6118316650390625, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.878960907459259, + "num_tokens": 290439524.0, + "step": 7615 + }, + { + "epoch": 0.968833481745325, + "ewc_loss": 0.007259062025696039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.25906211300753e-05, + "grad_norm": 3.7343881130218506, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8638800978660583, + "num_tokens": 290472125.0, + "step": 7616 + }, + { + "epoch": 0.9689606920239155, + "ewc_loss": 0.007356834132224321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.356834248639643e-05, + "grad_norm": 3.627788543701172, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8515230417251587, + "num_tokens": 290514348.0, + "step": 7617 + }, + { + "epoch": 0.9690879023025061, + "ewc_loss": 0.007239897269755602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.239897240651771e-05, + "grad_norm": 3.654176712036133, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8629124760627747, + "num_tokens": 290551699.0, + "step": 7618 + }, + { + "epoch": 
0.9692151125810966, + "ewc_loss": 0.0072963726706802845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.296372496057302e-05, + "grad_norm": 3.6712958812713623, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8698272109031677, + "num_tokens": 290588644.0, + "step": 7619 + }, + { + "epoch": 0.969342322859687, + "ewc_loss": 0.007296917960047722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.296918192878366e-05, + "grad_norm": 3.6322619915008545, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8708080053329468, + "num_tokens": 290629546.0, + "step": 7620 + }, + { + "epoch": 0.9694695331382776, + "ewc_loss": 0.007248616777360439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.248616748256609e-05, + "grad_norm": 3.620130777359009, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.873918890953064, + "num_tokens": 290665527.0, + "step": 7621 + }, + { + "epoch": 0.9695967434168681, + "ewc_loss": 0.0072827585972845554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.282758451765403e-05, + "grad_norm": 3.6441640853881836, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8594570159912109, + "num_tokens": 290706667.0, + "step": 7622 + }, + { + "epoch": 0.9697239536954586, + "ewc_loss": 0.007280492223799229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.280491990968585e-05, + "grad_norm": 3.6557321548461914, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8613314628601074, + "num_tokens": 290745698.0, + "step": 7623 + }, + { + "epoch": 0.9698511639740491, + "ewc_loss": 0.007280164398252964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.280164572875947e-05, + "grad_norm": 3.6801576614379883, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8740779161453247, + "num_tokens": 290777983.0, + "step": 7624 + }, + { + "epoch": 0.9699783742526397, + "ewc_loss": 0.007303999736905098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.30399988242425e-05, + "grad_norm": 3.6452207565307617, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8603225946426392, + "num_tokens": 290814184.0, + "step": 7625 + }, + { + "epoch": 0.9701055845312301, + "ewc_loss": 0.00728666502982378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.286664913408458e-05, + "grad_norm": 3.6158738136291504, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8601289987564087, + "num_tokens": 290854054.0, + "step": 7626 + }, + { + "epoch": 0.9702327948098206, + "ewc_loss": 0.007274839095771313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.274839299498126e-05, + "grad_norm": 3.642444133758545, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8663569092750549, + "num_tokens": 290891325.0, + "step": 7627 + }, + { + "epoch": 0.9703600050884111, + "ewc_loss": 0.007313952781260014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.313952664844692e-05, + "grad_norm": 3.6455435752868652, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8657993078231812, + "num_tokens": 290930101.0, + "step": 7628 + }, + { + "epoch": 0.9704872153670017, + "ewc_loss": 0.00729021430015564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.290214125532657e-05, + "grad_norm": 3.652834415435791, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8668146729469299, + "num_tokens": 290964691.0, + "step": 7629 + }, + { + "epoch": 0.9706144256455922, + "ewc_loss": 0.007290405686944723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.29040548321791e-05, + "grad_norm": 3.609281539916992, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8638625144958496, + "num_tokens": 291008140.0, + "step": 7630 + }, + { + "epoch": 0.9707416359241827, + "ewc_loss": 0.007260972633957863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.260972779477015e-05, + "grad_norm": 3.666454553604126, + "learning_rate": 1e-06, + "loss": 0.4046, + 
"mean_token_accuracy": 0.8595254421234131, + "num_tokens": 291041484.0, + "step": 7631 + }, + { + "epoch": 0.9708688462027731, + "ewc_loss": 0.007336577866226435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.336577982641757e-05, + "grad_norm": 3.698955535888672, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.847621500492096, + "num_tokens": 291082337.0, + "step": 7632 + }, + { + "epoch": 0.9709960564813637, + "ewc_loss": 0.007315942086279392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.31594191165641e-05, + "grad_norm": 3.6557085514068604, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8663261532783508, + "num_tokens": 291115921.0, + "step": 7633 + }, + { + "epoch": 0.9711232667599542, + "ewc_loss": 0.007304522208869457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.304522296180949e-05, + "grad_norm": 3.6418826580047607, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8753111362457275, + "num_tokens": 291154367.0, + "step": 7634 + }, + { + "epoch": 0.9712504770385447, + "ewc_loss": 0.007312086410820484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.312086381716654e-05, + "grad_norm": 3.5733563899993896, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8668107986450195, + "num_tokens": 291198919.0, + "step": 7635 + }, + { + "epoch": 0.9713776873171353, + "ewc_loss": 0.007278840988874435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.278841076185927e-05, + "grad_norm": 3.6889090538024902, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8566129207611084, + "num_tokens": 291237062.0, + "step": 7636 + }, + { + "epoch": 0.9715048975957258, + "ewc_loss": 0.007358740549534559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.358740549534559e-05, + "grad_norm": 3.665740489959717, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8785576820373535, + "num_tokens": 291275676.0, + "step": 7637 + }, + { + "epoch": 
0.9716321078743162, + "ewc_loss": 0.0073017096146941185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.301709410967305e-05, + "grad_norm": 3.6436867713928223, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8595585823059082, + "num_tokens": 291316785.0, + "step": 7638 + }, + { + "epoch": 0.9717593181529067, + "ewc_loss": 0.0072946627624332905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.294662646017969e-05, + "grad_norm": 3.623697280883789, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8645235300064087, + "num_tokens": 291360448.0, + "step": 7639 + }, + { + "epoch": 0.9718865284314973, + "ewc_loss": 0.007292889524251223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.292889495147392e-05, + "grad_norm": 3.6228485107421875, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8653106689453125, + "num_tokens": 291401650.0, + "step": 7640 + }, + { + "epoch": 0.9720137387100878, + "ewc_loss": 0.007279028184711933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.279028068296611e-05, + "grad_norm": 3.6264758110046387, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8677902221679688, + "num_tokens": 291439736.0, + "step": 7641 + }, + { + "epoch": 0.9721409489886783, + "ewc_loss": 0.0072937458753585815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.293745875358582e-05, + "grad_norm": 3.6222288608551025, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8720593452453613, + "num_tokens": 291479077.0, + "step": 7642 + }, + { + "epoch": 0.9722681592672688, + "ewc_loss": 0.007289028726518154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.289028872037306e-05, + "grad_norm": 3.6854236125946045, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8774411082267761, + "num_tokens": 291512470.0, + "step": 7643 + }, + { + "epoch": 0.9723953695458593, + "ewc_loss": 0.007298345677554607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.298345735762268e-05, + "grad_norm": 3.6045539379119873, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8766491413116455, + "num_tokens": 291552797.0, + "step": 7644 + }, + { + "epoch": 0.9725225798244498, + "ewc_loss": 0.007257523015141487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.257523247972131e-05, + "grad_norm": 3.6604151725769043, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8709194660186768, + "num_tokens": 291589885.0, + "step": 7645 + }, + { + "epoch": 0.9726497901030403, + "ewc_loss": 0.007302246987819672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.302247104234993e-05, + "grad_norm": 3.5900332927703857, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8664489984512329, + "num_tokens": 291633005.0, + "step": 7646 + }, + { + "epoch": 0.9727770003816308, + "ewc_loss": 0.007243644911795855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.243645086418837e-05, + "grad_norm": 3.7148752212524414, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.862774133682251, + "num_tokens": 291666083.0, + "step": 7647 + }, + { + "epoch": 0.9729042106602214, + "ewc_loss": 0.007348021492362022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.348021608777344e-05, + "grad_norm": 3.6286232471466064, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8747250437736511, + "num_tokens": 291706421.0, + "step": 7648 + }, + { + "epoch": 0.9730314209388119, + "ewc_loss": 0.007232159376144409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.232159259729087e-05, + "grad_norm": 3.637188196182251, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8722682595252991, + "num_tokens": 291744641.0, + "step": 7649 + }, + { + "epoch": 0.9731586312174023, + "ewc_loss": 0.007290272507816553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.29027233319357e-05, + "grad_norm": 3.6208348274230957, + "learning_rate": 1e-06, + "loss": 0.3133, + 
"mean_token_accuracy": 0.8920154571533203, + "num_tokens": 291780586.0, + "step": 7650 + }, + { + "epoch": 0.9732858414959928, + "ewc_loss": 0.007268892601132393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.268892659340054e-05, + "grad_norm": 3.6228981018066406, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8629988431930542, + "num_tokens": 291820482.0, + "step": 7651 + }, + { + "epoch": 0.9734130517745834, + "ewc_loss": 0.007270882837474346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.270882633747533e-05, + "grad_norm": 3.618572950363159, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8763219118118286, + "num_tokens": 291858568.0, + "step": 7652 + }, + { + "epoch": 0.9735402620531739, + "ewc_loss": 0.007275703828781843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.275703683262691e-05, + "grad_norm": 3.766839027404785, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8581146001815796, + "num_tokens": 291889383.0, + "step": 7653 + }, + { + "epoch": 0.9736674723317644, + "ewc_loss": 0.0073537579737603664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.353757973760366e-05, + "grad_norm": 3.6185665130615234, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8759560585021973, + "num_tokens": 291927541.0, + "step": 7654 + }, + { + "epoch": 0.973794682610355, + "ewc_loss": 0.007222098298370838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.222098065540195e-05, + "grad_norm": 3.667131185531616, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8518632650375366, + "num_tokens": 291969074.0, + "step": 7655 + }, + { + "epoch": 0.9739218928889454, + "ewc_loss": 0.007308481726795435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.308481872314587e-05, + "grad_norm": 3.6385860443115234, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8668408393859863, + "num_tokens": 292008311.0, + "step": 7656 + }, + { + "epoch": 
0.9740491031675359, + "ewc_loss": 0.007263536099344492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.263536099344492e-05, + "grad_norm": 3.6592953205108643, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8584523797035217, + "num_tokens": 292051810.0, + "step": 7657 + }, + { + "epoch": 0.9741763134461264, + "ewc_loss": 0.0072966571897268295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.296656986000016e-05, + "grad_norm": 3.6309406757354736, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.87480628490448, + "num_tokens": 292090411.0, + "step": 7658 + }, + { + "epoch": 0.974303523724717, + "ewc_loss": 0.007253621704876423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.253621879499406e-05, + "grad_norm": 3.6782257556915283, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8601300716400146, + "num_tokens": 292124411.0, + "step": 7659 + }, + { + "epoch": 0.9744307340033075, + "ewc_loss": 0.0073158470913767815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.315847324207425e-05, + "grad_norm": 3.6120188236236572, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.866956353187561, + "num_tokens": 292166428.0, + "step": 7660 + }, + { + "epoch": 0.974557944281898, + "ewc_loss": 0.007254895754158497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.254895899677649e-05, + "grad_norm": 3.641383647918701, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8698492050170898, + "num_tokens": 292202687.0, + "step": 7661 + }, + { + "epoch": 0.9746851545604885, + "ewc_loss": 0.007310785353183746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.310785440495238e-05, + "grad_norm": 3.584120988845825, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8768865466117859, + "num_tokens": 292246364.0, + "step": 7662 + }, + { + "epoch": 0.974812364839079, + "ewc_loss": 0.00726504810154438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.265048043336719e-05, + "grad_norm": 3.6469736099243164, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8684000968933105, + "num_tokens": 292285789.0, + "step": 7663 + }, + { + "epoch": 0.9749395751176695, + "ewc_loss": 0.007330793421715498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.330793596338481e-05, + "grad_norm": 3.6892619132995605, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8616306781768799, + "num_tokens": 292320755.0, + "step": 7664 + }, + { + "epoch": 0.97506678539626, + "ewc_loss": 0.007340519223362207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.34051936888136e-05, + "grad_norm": 3.621962547302246, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8763493299484253, + "num_tokens": 292362818.0, + "step": 7665 + }, + { + "epoch": 0.9751939956748505, + "ewc_loss": 0.007269422989338636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.269423076650128e-05, + "grad_norm": 3.646890878677368, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8604824542999268, + "num_tokens": 292404016.0, + "step": 7666 + }, + { + "epoch": 0.9753212059534411, + "ewc_loss": 0.0073182471096515656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.318246935028583e-05, + "grad_norm": 3.6811399459838867, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8545143604278564, + "num_tokens": 292442117.0, + "step": 7667 + }, + { + "epoch": 0.9754484162320316, + "ewc_loss": 0.007313779555261135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.313779497053474e-05, + "grad_norm": 3.6939845085144043, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8754090070724487, + "num_tokens": 292472523.0, + "step": 7668 + }, + { + "epoch": 0.975575626510622, + "ewc_loss": 0.007323247846215963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.323247700696811e-05, + "grad_norm": 3.6977105140686035, + "learning_rate": 1e-06, + "loss": 0.3811, + 
"mean_token_accuracy": 0.8731430768966675, + "num_tokens": 292508571.0, + "step": 7669 + }, + { + "epoch": 0.9757028367892125, + "ewc_loss": 0.007317077834159136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.317077688639984e-05, + "grad_norm": 3.6781513690948486, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8635464906692505, + "num_tokens": 292545358.0, + "step": 7670 + }, + { + "epoch": 0.9758300470678031, + "ewc_loss": 0.00730614410713315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.30614410713315e-05, + "grad_norm": 3.6676700115203857, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8517837524414062, + "num_tokens": 292583962.0, + "step": 7671 + }, + { + "epoch": 0.9759572573463936, + "ewc_loss": 0.007323230151087046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.323230238398537e-05, + "grad_norm": 3.677190065383911, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8672030568122864, + "num_tokens": 292616186.0, + "step": 7672 + }, + { + "epoch": 0.9760844676249841, + "ewc_loss": 0.007331407628953457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.331407687161118e-05, + "grad_norm": 3.624138355255127, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8631386756896973, + "num_tokens": 292657378.0, + "step": 7673 + }, + { + "epoch": 0.9762116779035747, + "ewc_loss": 0.007302685175091028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.302685116883367e-05, + "grad_norm": 3.6294209957122803, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8809468746185303, + "num_tokens": 292692041.0, + "step": 7674 + }, + { + "epoch": 0.9763388881821651, + "ewc_loss": 0.007341455668210983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.341455784626305e-05, + "grad_norm": 3.7250845432281494, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8649559020996094, + "num_tokens": 292725062.0, + "step": 7675 + }, + { + "epoch": 
0.9764660984607556, + "ewc_loss": 0.007381754461675882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.381754403468221e-05, + "grad_norm": 3.62168288230896, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8751095533370972, + "num_tokens": 292766522.0, + "step": 7676 + }, + { + "epoch": 0.9765933087393461, + "ewc_loss": 0.007292845286428928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.292845111805946e-05, + "grad_norm": 3.5752344131469727, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8674472570419312, + "num_tokens": 292811371.0, + "step": 7677 + }, + { + "epoch": 0.9767205190179367, + "ewc_loss": 0.0073228925466537476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.322892633965239e-05, + "grad_norm": 3.6478147506713867, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8687434196472168, + "num_tokens": 292850237.0, + "step": 7678 + }, + { + "epoch": 0.9768477292965272, + "ewc_loss": 0.007363002281636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.363002077909186e-05, + "grad_norm": 3.6786117553710938, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8709580302238464, + "num_tokens": 292887291.0, + "step": 7679 + }, + { + "epoch": 0.9769749395751177, + "ewc_loss": 0.0073266057297587395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.326605555135757e-05, + "grad_norm": 3.6055238246917725, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.880628764629364, + "num_tokens": 292928396.0, + "step": 7680 + }, + { + "epoch": 0.9771021498537081, + "ewc_loss": 0.007286618929356337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.286619074875489e-05, + "grad_norm": 3.6804392337799072, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8722249865531921, + "num_tokens": 292962748.0, + "step": 7681 + }, + { + "epoch": 0.9772293601322987, + "ewc_loss": 0.007340839598327875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.340839511016384e-05, + "grad_norm": 3.6307661533355713, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.878490149974823, + "num_tokens": 293002330.0, + "step": 7682 + }, + { + "epoch": 0.9773565704108892, + "ewc_loss": 0.0072714705020189285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.271470531122759e-05, + "grad_norm": 3.6829984188079834, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8670457601547241, + "num_tokens": 293038288.0, + "step": 7683 + }, + { + "epoch": 0.9774837806894797, + "ewc_loss": 0.007329345680773258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.329345680773258e-05, + "grad_norm": 3.618150472640991, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8704791069030762, + "num_tokens": 293078121.0, + "step": 7684 + }, + { + "epoch": 0.9776109909680702, + "ewc_loss": 0.0072658178396523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.2658178396523e-05, + "grad_norm": 3.6560726165771484, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8642491698265076, + "num_tokens": 293116846.0, + "step": 7685 + }, + { + "epoch": 0.9777382012466608, + "ewc_loss": 0.00729980506002903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.299805292859674e-05, + "grad_norm": 3.6292662620544434, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8633511066436768, + "num_tokens": 293162237.0, + "step": 7686 + }, + { + "epoch": 0.9778654115252512, + "ewc_loss": 0.0072620022110641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.262002327479422e-05, + "grad_norm": 3.706237554550171, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.864517867565155, + "num_tokens": 293196631.0, + "step": 7687 + }, + { + "epoch": 0.9779926218038417, + "ewc_loss": 0.007307908497750759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.30790852685459e-05, + "grad_norm": 3.669515609741211, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 
0.876223087310791, + "num_tokens": 293238883.0, + "step": 7688 + }, + { + "epoch": 0.9781198320824323, + "ewc_loss": 0.0072621931321918964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.262192957568914e-05, + "grad_norm": 3.7229344844818115, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.859129786491394, + "num_tokens": 293270802.0, + "step": 7689 + }, + { + "epoch": 0.9782470423610228, + "ewc_loss": 0.007302752695977688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.302752783289179e-05, + "grad_norm": 3.6544079780578613, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8530244827270508, + "num_tokens": 293314910.0, + "step": 7690 + }, + { + "epoch": 0.9783742526396133, + "ewc_loss": 0.007263012230396271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.263012230396271e-05, + "grad_norm": 3.6301512718200684, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8804546594619751, + "num_tokens": 293353083.0, + "step": 7691 + }, + { + "epoch": 0.9785014629182038, + "ewc_loss": 0.007262249011546373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.262248982442543e-05, + "grad_norm": 3.649334669113159, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8697370290756226, + "num_tokens": 293389473.0, + "step": 7692 + }, + { + "epoch": 0.9786286731967943, + "ewc_loss": 0.007289165630936623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.289165660040453e-05, + "grad_norm": 3.6414363384246826, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8802070021629333, + "num_tokens": 293426893.0, + "step": 7693 + }, + { + "epoch": 0.9787558834753848, + "ewc_loss": 0.0072675952687859535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.267595356097445e-05, + "grad_norm": 3.597313642501831, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8722833395004272, + "num_tokens": 293468116.0, + "step": 7694 + }, + { + "epoch": 0.9788830937539753, + "ewc_loss": 
0.007263814564794302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.263814768521115e-05, + "grad_norm": 3.6981396675109863, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8718920946121216, + "num_tokens": 293503050.0, + "step": 7695 + }, + { + "epoch": 0.9790103040325658, + "ewc_loss": 0.007318350486457348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.318350253626704e-05, + "grad_norm": 3.6486823558807373, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8579275012016296, + "num_tokens": 293541673.0, + "step": 7696 + }, + { + "epoch": 0.9791375143111564, + "ewc_loss": 0.007265136577188969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.265136810019612e-05, + "grad_norm": 3.6926374435424805, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8648270964622498, + "num_tokens": 293578472.0, + "step": 7697 + }, + { + "epoch": 0.9792647245897469, + "ewc_loss": 0.007320026867091656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.320026634261012e-05, + "grad_norm": 3.641321897506714, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8711997270584106, + "num_tokens": 293611204.0, + "step": 7698 + }, + { + "epoch": 0.9793919348683373, + "ewc_loss": 0.007271653041243553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.271653157658875e-05, + "grad_norm": 3.6422817707061768, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8570282459259033, + "num_tokens": 293651352.0, + "step": 7699 + }, + { + "epoch": 0.9795191451469278, + "ewc_loss": 0.007307624910026789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.307624764507636e-05, + "grad_norm": 3.7020082473754883, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8791837692260742, + "num_tokens": 293687396.0, + "step": 7700 + }, + { + "epoch": 0.9796463554255184, + "ewc_loss": 0.007341601420193911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.341601303778589e-05, + "grad_norm": 
3.665200710296631, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.863735556602478, + "num_tokens": 293725083.0, + "step": 7701 + }, + { + "epoch": 0.9797735657041089, + "ewc_loss": 0.007292882073670626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.292882219189778e-05, + "grad_norm": 3.645766258239746, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.867660641670227, + "num_tokens": 293766136.0, + "step": 7702 + }, + { + "epoch": 0.9799007759826994, + "ewc_loss": 0.007291907910257578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.291907968465239e-05, + "grad_norm": 3.6444735527038574, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8509619832038879, + "num_tokens": 293805810.0, + "step": 7703 + }, + { + "epoch": 0.98002798626129, + "ewc_loss": 0.0073022558353841305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.30225583538413e-05, + "grad_norm": 3.690293073654175, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8605161905288696, + "num_tokens": 293839041.0, + "step": 7704 + }, + { + "epoch": 0.9801551965398804, + "ewc_loss": 0.007325423415750265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.325423212023452e-05, + "grad_norm": 3.5889484882354736, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.881193220615387, + "num_tokens": 293876541.0, + "step": 7705 + }, + { + "epoch": 0.9802824068184709, + "ewc_loss": 0.007257421500980854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.257421384565532e-05, + "grad_norm": 3.583343505859375, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8808758854866028, + "num_tokens": 293918141.0, + "step": 7706 + }, + { + "epoch": 0.9804096170970614, + "ewc_loss": 0.007292823400348425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.292823283933103e-05, + "grad_norm": 3.6729187965393066, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8564322590827942, + "num_tokens": 
293958078.0, + "step": 7707 + }, + { + "epoch": 0.980536827375652, + "ewc_loss": 0.007348273880779743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.348274084506556e-05, + "grad_norm": 3.6912992000579834, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8783149123191833, + "num_tokens": 293990347.0, + "step": 7708 + }, + { + "epoch": 0.9806640376542425, + "ewc_loss": 0.00732690654695034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.326906779780984e-05, + "grad_norm": 3.626913547515869, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8726475238800049, + "num_tokens": 294032136.0, + "step": 7709 + }, + { + "epoch": 0.980791247932833, + "ewc_loss": 0.007291228976100683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.291229121619835e-05, + "grad_norm": 3.769512414932251, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8558754920959473, + "num_tokens": 294065136.0, + "step": 7710 + }, + { + "epoch": 0.9809184582114235, + "ewc_loss": 0.007399699185043573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.399699097732082e-05, + "grad_norm": 3.6019461154937744, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8807837963104248, + "num_tokens": 294104539.0, + "step": 7711 + }, + { + "epoch": 0.981045668490014, + "ewc_loss": 0.007247846107929945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.247846224345267e-05, + "grad_norm": 3.607271909713745, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8659383058547974, + "num_tokens": 294148962.0, + "step": 7712 + }, + { + "epoch": 0.9811728787686045, + "ewc_loss": 0.007314971182495356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.314971298910677e-05, + "grad_norm": 3.67868971824646, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8721612691879272, + "num_tokens": 294181866.0, + "step": 7713 + }, + { + "epoch": 0.981300089047195, + "ewc_loss": 0.007333444897085428, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 7.333444955293089e-05, + "grad_norm": 3.5933287143707275, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8771290183067322, + "num_tokens": 294223558.0, + "step": 7714 + }, + { + "epoch": 0.9814272993257855, + "ewc_loss": 0.007269249763339758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.26924990885891e-05, + "grad_norm": 3.6412155628204346, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8731734752655029, + "num_tokens": 294260487.0, + "step": 7715 + }, + { + "epoch": 0.9815545096043761, + "ewc_loss": 0.007341643329709768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.341643504332751e-05, + "grad_norm": 3.6230318546295166, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8690592050552368, + "num_tokens": 294302642.0, + "step": 7716 + }, + { + "epoch": 0.9816817198829666, + "ewc_loss": 0.007284566760063171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.284566527232528e-05, + "grad_norm": 3.690683126449585, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8598677515983582, + "num_tokens": 294338376.0, + "step": 7717 + }, + { + "epoch": 0.981808930161557, + "ewc_loss": 0.007360168267041445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.360168092418462e-05, + "grad_norm": 3.737152338027954, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8615881204605103, + "num_tokens": 294372089.0, + "step": 7718 + }, + { + "epoch": 0.9819361404401475, + "ewc_loss": 0.007345982827246189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.34598288545385e-05, + "grad_norm": 3.665452480316162, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8739727735519409, + "num_tokens": 294403801.0, + "step": 7719 + }, + { + "epoch": 0.9820633507187381, + "ewc_loss": 0.00730875413864851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.308753993129358e-05, + "grad_norm": 3.685469388961792, + "learning_rate": 1e-06, + "loss": 
0.3837, + "mean_token_accuracy": 0.8700801730155945, + "num_tokens": 294440301.0, + "step": 7720 + }, + { + "epoch": 0.9821905609973286, + "ewc_loss": 0.0073590693064033985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.35906942281872e-05, + "grad_norm": 3.6879642009735107, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8671771287918091, + "num_tokens": 294472967.0, + "step": 7721 + }, + { + "epoch": 0.9823177712759191, + "ewc_loss": 0.007362779229879379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.362779433606192e-05, + "grad_norm": 3.6463191509246826, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8582630157470703, + "num_tokens": 294518248.0, + "step": 7722 + }, + { + "epoch": 0.9824449815545097, + "ewc_loss": 0.007331349421292543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.331349479500204e-05, + "grad_norm": 3.686063051223755, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8665306568145752, + "num_tokens": 294553899.0, + "step": 7723 + }, + { + "epoch": 0.9825721918331001, + "ewc_loss": 0.0073807197622954845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.380719762295485e-05, + "grad_norm": 3.649101734161377, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.84649258852005, + "num_tokens": 294591382.0, + "step": 7724 + }, + { + "epoch": 0.9826994021116906, + "ewc_loss": 0.007352665066719055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.352665124926716e-05, + "grad_norm": 3.647299289703369, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8694129586219788, + "num_tokens": 294629606.0, + "step": 7725 + }, + { + "epoch": 0.9828266123902811, + "ewc_loss": 0.0073790536262094975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.379053568001837e-05, + "grad_norm": 3.6483726501464844, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8696129322052002, + "num_tokens": 294669417.0, + "step": 7726 + }, + { + "epoch": 
0.9829538226688717, + "ewc_loss": 0.007373695727437735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.373695552814752e-05, + "grad_norm": 3.6436359882354736, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8767474889755249, + "num_tokens": 294709876.0, + "step": 7727 + }, + { + "epoch": 0.9830810329474622, + "ewc_loss": 0.007371256127953529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.371255924226716e-05, + "grad_norm": 3.655003547668457, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8589752912521362, + "num_tokens": 294750565.0, + "step": 7728 + }, + { + "epoch": 0.9832082432260527, + "ewc_loss": 0.007385090459138155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.385090430034325e-05, + "grad_norm": 3.6248161792755127, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8695482015609741, + "num_tokens": 294790586.0, + "step": 7729 + }, + { + "epoch": 0.9833354535046431, + "ewc_loss": 0.007339216768741608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.339216972468421e-05, + "grad_norm": 3.6304593086242676, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.865219235420227, + "num_tokens": 294830176.0, + "step": 7730 + }, + { + "epoch": 0.9834626637832337, + "ewc_loss": 0.007392522878944874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.392522820737213e-05, + "grad_norm": 3.6735219955444336, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8558799624443054, + "num_tokens": 294874199.0, + "step": 7731 + }, + { + "epoch": 0.9835898740618242, + "ewc_loss": 0.007368462160229683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.36846195650287e-05, + "grad_norm": 3.6089303493499756, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8745048642158508, + "num_tokens": 294914219.0, + "step": 7732 + }, + { + "epoch": 0.9837170843404147, + "ewc_loss": 0.007326070684939623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.326070772251114e-05, + "grad_norm": 3.698967695236206, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8447576761245728, + "num_tokens": 294950726.0, + "step": 7733 + }, + { + "epoch": 0.9838442946190052, + "ewc_loss": 0.007393836975097656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.393836858682334e-05, + "grad_norm": 3.6282827854156494, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8664426207542419, + "num_tokens": 294993815.0, + "step": 7734 + }, + { + "epoch": 0.9839715048975958, + "ewc_loss": 0.007312328089028597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.312327943509445e-05, + "grad_norm": 3.646406412124634, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8770787715911865, + "num_tokens": 295029275.0, + "step": 7735 + }, + { + "epoch": 0.9840987151761862, + "ewc_loss": 0.007358554285019636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.358554285019636e-05, + "grad_norm": 3.6600708961486816, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8668792247772217, + "num_tokens": 295066166.0, + "step": 7736 + }, + { + "epoch": 0.9842259254547767, + "ewc_loss": 0.007351888809353113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.351888780249283e-05, + "grad_norm": 3.7147679328918457, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8447785377502441, + "num_tokens": 295100436.0, + "step": 7737 + }, + { + "epoch": 0.9843531357333672, + "ewc_loss": 0.007373005151748657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.373005064437166e-05, + "grad_norm": 3.62320613861084, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8694853782653809, + "num_tokens": 295141756.0, + "step": 7738 + }, + { + "epoch": 0.9844803460119578, + "ewc_loss": 0.007306071929633617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.30607207515277e-05, + "grad_norm": 3.637712001800537, + "learning_rate": 1e-06, + "loss": 0.3385, + 
"mean_token_accuracy": 0.8849974870681763, + "num_tokens": 295178894.0, + "step": 7739 + }, + { + "epoch": 0.9846075562905483, + "ewc_loss": 0.007338935509324074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.338935392908752e-05, + "grad_norm": 3.6599156856536865, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8569505214691162, + "num_tokens": 295219133.0, + "step": 7740 + }, + { + "epoch": 0.9847347665691388, + "ewc_loss": 0.007356176618486643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.356176502071321e-05, + "grad_norm": 3.6783041954040527, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8674330115318298, + "num_tokens": 295260322.0, + "step": 7741 + }, + { + "epoch": 0.9848619768477292, + "ewc_loss": 0.007352649699896574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.352649845415726e-05, + "grad_norm": 3.6438703536987305, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8626469373703003, + "num_tokens": 295299675.0, + "step": 7742 + }, + { + "epoch": 0.9849891871263198, + "ewc_loss": 0.007314297370612621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.314297545235604e-05, + "grad_norm": 3.635056257247925, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8730881214141846, + "num_tokens": 295336281.0, + "step": 7743 + }, + { + "epoch": 0.9851163974049103, + "ewc_loss": 0.007315110415220261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.315110269701108e-05, + "grad_norm": 3.6237142086029053, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8656904697418213, + "num_tokens": 295376932.0, + "step": 7744 + }, + { + "epoch": 0.9852436076835008, + "ewc_loss": 0.007322370074689388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.32237022020854e-05, + "grad_norm": 3.6249239444732666, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8645209670066833, + "num_tokens": 295421904.0, + "step": 7745 + }, + { + "epoch": 
0.9853708179620914, + "ewc_loss": 0.00730541069060564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.30541069060564e-05, + "grad_norm": 3.665527820587158, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8701115846633911, + "num_tokens": 295456688.0, + "step": 7746 + }, + { + "epoch": 0.9854980282406819, + "ewc_loss": 0.007337135262787342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.337135320995003e-05, + "grad_norm": 3.6439766883850098, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8839550018310547, + "num_tokens": 295500022.0, + "step": 7747 + }, + { + "epoch": 0.9856252385192723, + "ewc_loss": 0.0072946189902722836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.294618990272284e-05, + "grad_norm": 3.5930256843566895, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8884885311126709, + "num_tokens": 295538106.0, + "step": 7748 + }, + { + "epoch": 0.9857524487978628, + "ewc_loss": 0.007277993950992823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.277994154719636e-05, + "grad_norm": 3.72121262550354, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8637929558753967, + "num_tokens": 295572465.0, + "step": 7749 + }, + { + "epoch": 0.9858796590764534, + "ewc_loss": 0.007351788226515055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.351788372034207e-05, + "grad_norm": 3.6071174144744873, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8720403909683228, + "num_tokens": 295614577.0, + "step": 7750 + }, + { + "epoch": 0.9860068693550439, + "ewc_loss": 0.007253610994666815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.253610965562984e-05, + "grad_norm": 3.6121718883514404, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8724535703659058, + "num_tokens": 295654780.0, + "step": 7751 + }, + { + "epoch": 0.9861340796336344, + "ewc_loss": 0.00727957533672452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.279575220309198e-05, + "grad_norm": 3.6052322387695312, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8662266731262207, + "num_tokens": 295695880.0, + "step": 7752 + }, + { + "epoch": 0.986261289912225, + "ewc_loss": 0.007275384850800037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.27538499631919e-05, + "grad_norm": 3.6325161457061768, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.868488609790802, + "num_tokens": 295738176.0, + "step": 7753 + }, + { + "epoch": 0.9863885001908154, + "ewc_loss": 0.007277844939380884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.277844997588545e-05, + "grad_norm": 3.6671218872070312, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8574208617210388, + "num_tokens": 295777562.0, + "step": 7754 + }, + { + "epoch": 0.9865157104694059, + "ewc_loss": 0.007289677858352661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.289677887456492e-05, + "grad_norm": 3.6111040115356445, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8844596147537231, + "num_tokens": 295815013.0, + "step": 7755 + }, + { + "epoch": 0.9866429207479964, + "ewc_loss": 0.007238195277750492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.238195394165814e-05, + "grad_norm": 3.6267476081848145, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8572350740432739, + "num_tokens": 295857751.0, + "step": 7756 + }, + { + "epoch": 0.986770131026587, + "ewc_loss": 0.007277782540768385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.277782424353063e-05, + "grad_norm": 3.6385269165039062, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8746098279953003, + "num_tokens": 295896033.0, + "step": 7757 + }, + { + "epoch": 0.9868973413051775, + "ewc_loss": 0.007280232384800911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.280232239281759e-05, + "grad_norm": 3.624101400375366, + "learning_rate": 1e-06, + "loss": 0.3825, + 
"mean_token_accuracy": 0.871099591255188, + "num_tokens": 295936838.0, + "step": 7758 + }, + { + "epoch": 0.987024551583768, + "ewc_loss": 0.00726704578846693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.267046021297574e-05, + "grad_norm": 3.663756847381592, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8745085000991821, + "num_tokens": 295972307.0, + "step": 7759 + }, + { + "epoch": 0.9871517618623584, + "ewc_loss": 0.007279351819306612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.279351848410442e-05, + "grad_norm": 3.7064173221588135, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8659303188323975, + "num_tokens": 296009966.0, + "step": 7760 + }, + { + "epoch": 0.987278972140949, + "ewc_loss": 0.007296588737517595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.296588591998443e-05, + "grad_norm": 3.6431007385253906, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.864014208316803, + "num_tokens": 296047874.0, + "step": 7761 + }, + { + "epoch": 0.9874061824195395, + "ewc_loss": 0.00723704369738698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.237043610075489e-05, + "grad_norm": 3.585275173187256, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8746253848075867, + "num_tokens": 296090591.0, + "step": 7762 + }, + { + "epoch": 0.98753339269813, + "ewc_loss": 0.0072443499229848385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.244350126711652e-05, + "grad_norm": 3.6627206802368164, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8632903099060059, + "num_tokens": 296133024.0, + "step": 7763 + }, + { + "epoch": 0.9876606029767205, + "ewc_loss": 0.0072969114407896996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.296911644516513e-05, + "grad_norm": 3.618849992752075, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8558169007301331, + "num_tokens": 296177347.0, + "step": 7764 + }, + { + "epoch": 
0.9877878132553111, + "ewc_loss": 0.007252552546560764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.25255231373012e-05, + "grad_norm": 3.6780078411102295, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8597741723060608, + "num_tokens": 296215532.0, + "step": 7765 + }, + { + "epoch": 0.9879150235339016, + "ewc_loss": 0.007309712469577789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.309712236747146e-05, + "grad_norm": 3.7180323600769043, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8571447134017944, + "num_tokens": 296249494.0, + "step": 7766 + }, + { + "epoch": 0.988042233812492, + "ewc_loss": 0.007304750848561525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.304750761250034e-05, + "grad_norm": 3.6545910835266113, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8580907583236694, + "num_tokens": 296289370.0, + "step": 7767 + }, + { + "epoch": 0.9881694440910825, + "ewc_loss": 0.007266244385391474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.266244210768491e-05, + "grad_norm": 3.621659278869629, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8691897392272949, + "num_tokens": 296329376.0, + "step": 7768 + }, + { + "epoch": 0.9882966543696731, + "ewc_loss": 0.007275772280991077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.275772077264264e-05, + "grad_norm": 3.6440532207489014, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8755893111228943, + "num_tokens": 296368875.0, + "step": 7769 + }, + { + "epoch": 0.9884238646482636, + "ewc_loss": 0.00730254827067256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.30254832888022e-05, + "grad_norm": 3.699949026107788, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8657441735267639, + "num_tokens": 296404766.0, + "step": 7770 + }, + { + "epoch": 0.9885510749268541, + "ewc_loss": 0.007329562678933144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.329562504310161e-05, + "grad_norm": 3.6652610301971436, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8672698736190796, + "num_tokens": 296441705.0, + "step": 7771 + }, + { + "epoch": 0.9886782852054447, + "ewc_loss": 0.007303078193217516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.303078018594533e-05, + "grad_norm": 3.6366233825683594, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8757377862930298, + "num_tokens": 296481075.0, + "step": 7772 + }, + { + "epoch": 0.9888054954840351, + "ewc_loss": 0.0072877476923167706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.287747575901449e-05, + "grad_norm": 3.731806755065918, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8637673854827881, + "num_tokens": 296511404.0, + "step": 7773 + }, + { + "epoch": 0.9889327057626256, + "ewc_loss": 0.007372227497398853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.37222726456821e-05, + "grad_norm": 3.703455924987793, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8586562871932983, + "num_tokens": 296546080.0, + "step": 7774 + }, + { + "epoch": 0.9890599160412161, + "ewc_loss": 0.00732394028455019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.323940371861681e-05, + "grad_norm": 3.638495922088623, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8708515763282776, + "num_tokens": 296586119.0, + "step": 7775 + }, + { + "epoch": 0.9891871263198067, + "ewc_loss": 0.007324309088289738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.32430926291272e-05, + "grad_norm": 3.614537239074707, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8663806915283203, + "num_tokens": 296629602.0, + "step": 7776 + }, + { + "epoch": 0.9893143365983972, + "ewc_loss": 0.007333120331168175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.333120447583497e-05, + "grad_norm": 3.661898136138916, + "learning_rate": 1e-06, + "loss": 0.4209, + 
"mean_token_accuracy": 0.860985279083252, + "num_tokens": 296666709.0, + "step": 7777 + }, + { + "epoch": 0.9894415468769877, + "ewc_loss": 0.0073646726086735725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.364672637777403e-05, + "grad_norm": 3.670182943344116, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8689819574356079, + "num_tokens": 296702611.0, + "step": 7778 + }, + { + "epoch": 0.9895687571555781, + "ewc_loss": 0.00735857617110014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.358576112892479e-05, + "grad_norm": 3.6750333309173584, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8745735883712769, + "num_tokens": 296737649.0, + "step": 7779 + }, + { + "epoch": 0.9896959674341687, + "ewc_loss": 0.007353649474680424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.353649561991915e-05, + "grad_norm": 3.6486988067626953, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8693324327468872, + "num_tokens": 296776915.0, + "step": 7780 + }, + { + "epoch": 0.9898231777127592, + "ewc_loss": 0.007337899878621101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.337900024140254e-05, + "grad_norm": 3.6484451293945312, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8734464645385742, + "num_tokens": 296811491.0, + "step": 7781 + }, + { + "epoch": 0.9899503879913497, + "ewc_loss": 0.007355102337896824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.355102570727468e-05, + "grad_norm": 3.7049169540405273, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.871496319770813, + "num_tokens": 296847602.0, + "step": 7782 + }, + { + "epoch": 0.9900775982699402, + "ewc_loss": 0.007379040587693453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.379040471278131e-05, + "grad_norm": 3.68266224861145, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8509690761566162, + "num_tokens": 296884737.0, + "step": 7783 + }, + { + "epoch": 
0.9902048085485308, + "ewc_loss": 0.007348968181759119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.34896821086295e-05, + "grad_norm": 3.6315762996673584, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.855944037437439, + "num_tokens": 296927780.0, + "step": 7784 + }, + { + "epoch": 0.9903320188271212, + "ewc_loss": 0.007335839327424765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.335839472943917e-05, + "grad_norm": 3.6652469635009766, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8821252584457397, + "num_tokens": 296964200.0, + "step": 7785 + }, + { + "epoch": 0.9904592291057117, + "ewc_loss": 0.007365969941020012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.365969941020012e-05, + "grad_norm": 3.6022398471832275, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.859603762626648, + "num_tokens": 297009927.0, + "step": 7786 + }, + { + "epoch": 0.9905864393843022, + "ewc_loss": 0.007309155073016882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.309154898393899e-05, + "grad_norm": 3.7048709392547607, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8559434413909912, + "num_tokens": 297050788.0, + "step": 7787 + }, + { + "epoch": 0.9907136496628928, + "ewc_loss": 0.007409576326608658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.409576210193336e-05, + "grad_norm": 3.6483194828033447, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8728727102279663, + "num_tokens": 297092398.0, + "step": 7788 + }, + { + "epoch": 0.9908408599414833, + "ewc_loss": 0.007316572591662407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.316572737181559e-05, + "grad_norm": 3.679979085922241, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.866338849067688, + "num_tokens": 297130728.0, + "step": 7789 + }, + { + "epoch": 0.9909680702200738, + "ewc_loss": 0.007364021614193916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.364021439570934e-05, + "grad_norm": 3.616001844406128, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8901858329772949, + "num_tokens": 297169124.0, + "step": 7790 + }, + { + "epoch": 0.9910952804986642, + "ewc_loss": 0.007312502712011337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.312502566492185e-05, + "grad_norm": 3.6827759742736816, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8786438703536987, + "num_tokens": 297204914.0, + "step": 7791 + }, + { + "epoch": 0.9912224907772548, + "ewc_loss": 0.007374404929578304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.374404958682135e-05, + "grad_norm": 3.7033278942108154, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8640086650848389, + "num_tokens": 297240551.0, + "step": 7792 + }, + { + "epoch": 0.9913497010558453, + "ewc_loss": 0.0073372311890125275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.33723136363551e-05, + "grad_norm": 3.6045689582824707, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8581783175468445, + "num_tokens": 297287160.0, + "step": 7793 + }, + { + "epoch": 0.9914769113344358, + "ewc_loss": 0.0072837769985198975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.283777085831389e-05, + "grad_norm": 3.6900503635406494, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8702900409698486, + "num_tokens": 297323394.0, + "step": 7794 + }, + { + "epoch": 0.9916041216130264, + "ewc_loss": 0.007378292270004749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.378292502835393e-05, + "grad_norm": 3.746023178100586, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8454175591468811, + "num_tokens": 297357808.0, + "step": 7795 + }, + { + "epoch": 0.9917313318916169, + "ewc_loss": 0.007365851197391748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.365851342910901e-05, + "grad_norm": 3.6442766189575195, + "learning_rate": 1e-06, + "loss": 0.4032, + 
"mean_token_accuracy": 0.8634302616119385, + "num_tokens": 297397300.0, + "step": 7796 + }, + { + "epoch": 0.9918585421702073, + "ewc_loss": 0.007285056170076132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.285056199179962e-05, + "grad_norm": 3.6210532188415527, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8772636651992798, + "num_tokens": 297434699.0, + "step": 7797 + }, + { + "epoch": 0.9919857524487978, + "ewc_loss": 0.00731623824685812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.316238043131307e-05, + "grad_norm": 3.65231990814209, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.866980791091919, + "num_tokens": 297473939.0, + "step": 7798 + }, + { + "epoch": 0.9921129627273884, + "ewc_loss": 0.007335759233683348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.335759437410161e-05, + "grad_norm": 3.709785223007202, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8603771924972534, + "num_tokens": 297511154.0, + "step": 7799 + }, + { + "epoch": 0.9922401730059789, + "ewc_loss": 0.0073552122339606285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.355212437687442e-05, + "grad_norm": 3.6219217777252197, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8590913414955139, + "num_tokens": 297556092.0, + "step": 7800 + }, + { + "epoch": 0.9923673832845694, + "ewc_loss": 0.00729779340326786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.297793490579352e-05, + "grad_norm": 3.627248764038086, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8716033697128296, + "num_tokens": 297598160.0, + "step": 7801 + }, + { + "epoch": 0.9924945935631599, + "ewc_loss": 0.007312071975320578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.312071829801425e-05, + "grad_norm": 3.5992271900177, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8837891817092896, + "num_tokens": 297638706.0, + "step": 7802 + }, + { + "epoch": 
0.9926218038417504, + "ewc_loss": 0.007286252453923225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.286252366611734e-05, + "grad_norm": 3.6642208099365234, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8734279870986938, + "num_tokens": 297675955.0, + "step": 7803 + }, + { + "epoch": 0.9927490141203409, + "ewc_loss": 0.007326387334614992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.326387276407331e-05, + "grad_norm": 3.6398229598999023, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8633078336715698, + "num_tokens": 297715362.0, + "step": 7804 + }, + { + "epoch": 0.9928762243989314, + "ewc_loss": 0.007311469875276089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.311470108106732e-05, + "grad_norm": 3.710484027862549, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8522707223892212, + "num_tokens": 297750931.0, + "step": 7805 + }, + { + "epoch": 0.993003434677522, + "ewc_loss": 0.007335612550377846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.335612463066354e-05, + "grad_norm": 3.709798574447632, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8591978549957275, + "num_tokens": 297786107.0, + "step": 7806 + }, + { + "epoch": 0.9931306449561125, + "ewc_loss": 0.00730956019833684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.309560169233009e-05, + "grad_norm": 3.643620491027832, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8676818013191223, + "num_tokens": 297824782.0, + "step": 7807 + }, + { + "epoch": 0.993257855234703, + "ewc_loss": 0.00728073064237833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.28073064237833e-05, + "grad_norm": 3.689654588699341, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8631020784378052, + "num_tokens": 297861748.0, + "step": 7808 + }, + { + "epoch": 0.9933850655132934, + "ewc_loss": 0.007324176840484142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.324176840484142e-05, + "grad_norm": 3.6278247833251953, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.868420422077179, + "num_tokens": 297902722.0, + "step": 7809 + }, + { + "epoch": 0.993512275791884, + "ewc_loss": 0.007284050807356834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.284050661837682e-05, + "grad_norm": 3.599130392074585, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8683639764785767, + "num_tokens": 297948709.0, + "step": 7810 + }, + { + "epoch": 0.9936394860704745, + "ewc_loss": 0.007286537438631058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.28653758415021e-05, + "grad_norm": 3.665273427963257, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8709312081336975, + "num_tokens": 297986446.0, + "step": 7811 + }, + { + "epoch": 0.993766696349065, + "ewc_loss": 0.007331647910177708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.331647793762386e-05, + "grad_norm": 3.699533462524414, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8596569299697876, + "num_tokens": 298024004.0, + "step": 7812 + }, + { + "epoch": 0.9938939066276555, + "ewc_loss": 0.007300693076103926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.300692959688604e-05, + "grad_norm": 3.6452596187591553, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8579848408699036, + "num_tokens": 298065518.0, + "step": 7813 + }, + { + "epoch": 0.9940211169062461, + "ewc_loss": 0.007255770266056061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.255770469782874e-05, + "grad_norm": 3.6786513328552246, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8593025207519531, + "num_tokens": 298108138.0, + "step": 7814 + }, + { + "epoch": 0.9941483271848366, + "ewc_loss": 0.007301083765923977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.301083678612486e-05, + "grad_norm": 3.640514373779297, + "learning_rate": 1e-06, + "loss": 0.3886, + 
"mean_token_accuracy": 0.868196427822113, + "num_tokens": 298149348.0, + "step": 7815 + }, + { + "epoch": 0.994275537463427, + "ewc_loss": 0.007255529053509235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.255528907990083e-05, + "grad_norm": 3.6599957942962646, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8656765222549438, + "num_tokens": 298186347.0, + "step": 7816 + }, + { + "epoch": 0.9944027477420175, + "ewc_loss": 0.007275138981640339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.27513906895183e-05, + "grad_norm": 3.680971384048462, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8474199771881104, + "num_tokens": 298223322.0, + "step": 7817 + }, + { + "epoch": 0.9945299580206081, + "ewc_loss": 0.007291328627616167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.29132880223915e-05, + "grad_norm": 3.674534559249878, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8634735941886902, + "num_tokens": 298258794.0, + "step": 7818 + }, + { + "epoch": 0.9946571682991986, + "ewc_loss": 0.007284414488822222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.284414459718391e-05, + "grad_norm": 3.6806135177612305, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8614311218261719, + "num_tokens": 298297510.0, + "step": 7819 + }, + { + "epoch": 0.9947843785777891, + "ewc_loss": 0.007300092838704586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.300092693185434e-05, + "grad_norm": 3.6691060066223145, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8737115859985352, + "num_tokens": 298333784.0, + "step": 7820 + }, + { + "epoch": 0.9949115888563796, + "ewc_loss": 0.007302542682737112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.302542508114129e-05, + "grad_norm": 3.644080400466919, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8743627071380615, + "num_tokens": 298371241.0, + "step": 7821 + }, + { + "epoch": 
0.9950387991349701, + "ewc_loss": 0.007302890531718731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.302890298888087e-05, + "grad_norm": 3.676969051361084, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8535209894180298, + "num_tokens": 298407902.0, + "step": 7822 + }, + { + "epoch": 0.9951660094135606, + "ewc_loss": 0.007321028504520655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.321028533624485e-05, + "grad_norm": 3.617959499359131, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8443192839622498, + "num_tokens": 298452339.0, + "step": 7823 + }, + { + "epoch": 0.9952932196921511, + "ewc_loss": 0.007286294363439083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.286294567165896e-05, + "grad_norm": 3.6113908290863037, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8583612442016602, + "num_tokens": 298495854.0, + "step": 7824 + }, + { + "epoch": 0.9954204299707416, + "ewc_loss": 0.007326510269194841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.326510240091011e-05, + "grad_norm": 3.7195982933044434, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8605909943580627, + "num_tokens": 298529642.0, + "step": 7825 + }, + { + "epoch": 0.9955476402493322, + "ewc_loss": 0.007385551929473877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.385551725747064e-05, + "grad_norm": 3.629538059234619, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8673652410507202, + "num_tokens": 298574396.0, + "step": 7826 + }, + { + "epoch": 0.9956748505279227, + "ewc_loss": 0.007292920257896185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.292920054169372e-05, + "grad_norm": 3.7016994953155518, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8534051775932312, + "num_tokens": 298612457.0, + "step": 7827 + }, + { + "epoch": 0.9958020608065131, + "ewc_loss": 0.00738296564668417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.382965850410983e-05, + "grad_norm": 3.713923215866089, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8681475520133972, + "num_tokens": 298645061.0, + "step": 7828 + }, + { + "epoch": 0.9959292710851037, + "ewc_loss": 0.007353910710662603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.353910768870264e-05, + "grad_norm": 3.617215871810913, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8648188710212708, + "num_tokens": 298688014.0, + "step": 7829 + }, + { + "epoch": 0.9960564813636942, + "ewc_loss": 0.007306393701583147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.306393672479317e-05, + "grad_norm": 3.6507394313812256, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8693078756332397, + "num_tokens": 298727815.0, + "step": 7830 + }, + { + "epoch": 0.9961836916422847, + "ewc_loss": 0.007376885507255793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.37688533263281e-05, + "grad_norm": 3.6102583408355713, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8880826234817505, + "num_tokens": 298765663.0, + "step": 7831 + }, + { + "epoch": 0.9963109019208752, + "ewc_loss": 0.007322733290493488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.322733290493488e-05, + "grad_norm": 3.7088537216186523, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8704531788825989, + "num_tokens": 298799360.0, + "step": 7832 + }, + { + "epoch": 0.9964381121994658, + "ewc_loss": 0.007391277235001326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.391277176793665e-05, + "grad_norm": 3.6459178924560547, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8661759495735168, + "num_tokens": 298835669.0, + "step": 7833 + }, + { + "epoch": 0.9965653224780562, + "ewc_loss": 0.007319174706935883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.319174619624391e-05, + "grad_norm": 3.7641029357910156, + "learning_rate": 1e-06, + "loss": 0.3953, + 
"mean_token_accuracy": 0.8645411133766174, + "num_tokens": 298866871.0, + "step": 7834 + }, + { + "epoch": 0.9966925327566467, + "ewc_loss": 0.0074289701879024506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.428970275213942e-05, + "grad_norm": 3.6713428497314453, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8580116629600525, + "num_tokens": 298904136.0, + "step": 7835 + }, + { + "epoch": 0.9968197430352372, + "ewc_loss": 0.007326256949454546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.326257036766037e-05, + "grad_norm": 3.6461737155914307, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8798436522483826, + "num_tokens": 298937069.0, + "step": 7836 + }, + { + "epoch": 0.9969469533138278, + "ewc_loss": 0.0073464238084852695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.34642380848527e-05, + "grad_norm": 3.6628549098968506, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8584795594215393, + "num_tokens": 298976778.0, + "step": 7837 + }, + { + "epoch": 0.9970741635924183, + "ewc_loss": 0.007371018640697002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.371018728008494e-05, + "grad_norm": 3.6721270084381104, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8769785165786743, + "num_tokens": 299010808.0, + "step": 7838 + }, + { + "epoch": 0.9972013738710088, + "ewc_loss": 0.00738633843138814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.386338256765157e-05, + "grad_norm": 3.6783831119537354, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8566954135894775, + "num_tokens": 299047611.0, + "step": 7839 + }, + { + "epoch": 0.9973285841495992, + "ewc_loss": 0.007391586899757385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.391587132588029e-05, + "grad_norm": 3.6369242668151855, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8721668124198914, + "num_tokens": 299088668.0, + "step": 7840 + }, + { + "epoch": 
0.9974557944281898, + "ewc_loss": 0.0073553635738790035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.355363777605817e-05, + "grad_norm": 3.667928695678711, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8471845984458923, + "num_tokens": 299126572.0, + "step": 7841 + }, + { + "epoch": 0.9975830047067803, + "ewc_loss": 0.007406498305499554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.406498480122536e-05, + "grad_norm": 3.6699891090393066, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8783042430877686, + "num_tokens": 299164806.0, + "step": 7842 + }, + { + "epoch": 0.9977102149853708, + "ewc_loss": 0.0073746428824961185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.374642882496119e-05, + "grad_norm": 3.6756231784820557, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8623083233833313, + "num_tokens": 299207039.0, + "step": 7843 + }, + { + "epoch": 0.9978374252639614, + "ewc_loss": 0.00738453958183527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.384539640042931e-05, + "grad_norm": 3.6393749713897705, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8757960796356201, + "num_tokens": 299243242.0, + "step": 7844 + }, + { + "epoch": 0.9979646355425519, + "ewc_loss": 0.007347848266363144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.347848440986127e-05, + "grad_norm": 3.6461198329925537, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8738377094268799, + "num_tokens": 299282326.0, + "step": 7845 + }, + { + "epoch": 0.9980918458211423, + "ewc_loss": 0.0073622651398181915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.36226502340287e-05, + "grad_norm": 3.638204336166382, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8942986726760864, + "num_tokens": 299316645.0, + "step": 7846 + }, + { + "epoch": 0.9982190560997328, + "ewc_loss": 0.0073427907191216946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.342790922848508e-05, + "grad_norm": 3.7243704795837402, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8685654401779175, + "num_tokens": 299349932.0, + "step": 7847 + }, + { + "epoch": 0.9983462663783234, + "ewc_loss": 0.007397118490189314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.397118315566331e-05, + "grad_norm": 3.6835479736328125, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8463248014450073, + "num_tokens": 299392452.0, + "step": 7848 + }, + { + "epoch": 0.9984734766569139, + "ewc_loss": 0.007360008545219898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.360008748946711e-05, + "grad_norm": 3.661360740661621, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8731647729873657, + "num_tokens": 299431479.0, + "step": 7849 + }, + { + "epoch": 0.9986006869355044, + "ewc_loss": 0.007348192390054464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.348192593781278e-05, + "grad_norm": 3.7534992694854736, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8640725612640381, + "num_tokens": 299475585.0, + "step": 7850 + }, + { + "epoch": 0.9987278972140949, + "ewc_loss": 0.007390379905700684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.390380051219836e-05, + "grad_norm": 3.602865695953369, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.883446991443634, + "num_tokens": 299514026.0, + "step": 7851 + }, + { + "epoch": 0.9988551074926854, + "ewc_loss": 0.0072845169343054295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.284517050720751e-05, + "grad_norm": 3.7095558643341064, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8561215400695801, + "num_tokens": 299550858.0, + "step": 7852 + }, + { + "epoch": 0.9989823177712759, + "ewc_loss": 0.007404169533401728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.404169446090236e-05, + "grad_norm": 3.6489460468292236, + "learning_rate": 1e-06, + "loss": 0.4197, + 
"mean_token_accuracy": 0.8590409755706787, + "num_tokens": 299586831.0, + "step": 7853 + }, + { + "epoch": 0.9991095280498664, + "ewc_loss": 0.007320899050682783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.320899021578953e-05, + "grad_norm": 3.698956251144409, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8785945177078247, + "num_tokens": 299623116.0, + "step": 7854 + }, + { + "epoch": 0.9992367383284569, + "ewc_loss": 0.007372855208814144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.372855179710314e-05, + "grad_norm": 3.6418943405151367, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.869320273399353, + "num_tokens": 299662165.0, + "step": 7855 + }, + { + "epoch": 0.9993639486070475, + "ewc_loss": 0.007329961284995079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.329961226787418e-05, + "grad_norm": 3.6334667205810547, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8642492294311523, + "num_tokens": 299702581.0, + "step": 7856 + }, + { + "epoch": 0.999491158885638, + "ewc_loss": 0.007352008018642664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.352008105954155e-05, + "grad_norm": 3.6704065799713135, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8517228364944458, + "num_tokens": 299742559.0, + "step": 7857 + }, + { + "epoch": 0.9996183691642284, + "ewc_loss": 0.007372075691819191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.372075924649835e-05, + "grad_norm": 3.6329610347747803, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8821732997894287, + "num_tokens": 299778637.0, + "step": 7858 + }, + { + "epoch": 0.9997455794428189, + "ewc_loss": 0.007340381853282452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.340381853282452e-05, + "grad_norm": 3.697056770324707, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8758264183998108, + "num_tokens": 299812808.0, + "step": 7859 + }, + { + "epoch": 
0.9998727897214095, + "ewc_loss": 0.007379854563623667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.379854650935158e-05, + "grad_norm": 3.6488707065582275, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.864901065826416, + "num_tokens": 299848987.0, + "step": 7860 + }, + { + "epoch": 1.0, + "ewc_loss": 0.0073465933091938496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.34659333829768e-05, + "grad_norm": 3.689779043197632, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8496739268302917, + "num_tokens": 299886286.0, + "step": 7861 + }, + { + "epoch": 1.0001272102785905, + "ewc_loss": 0.007391513790935278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.391513645416126e-05, + "grad_norm": 3.6364219188690186, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8650999665260315, + "num_tokens": 299925456.0, + "step": 7862 + }, + { + "epoch": 1.000254420557181, + "ewc_loss": 0.007335873786360025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.335873669944704e-05, + "grad_norm": 3.622792959213257, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8677774667739868, + "num_tokens": 299965936.0, + "step": 7863 + }, + { + "epoch": 1.0003816308357716, + "ewc_loss": 0.007359902374446392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.359902519965544e-05, + "grad_norm": 3.644676923751831, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8838056921958923, + "num_tokens": 300003181.0, + "step": 7864 + }, + { + "epoch": 1.0005088411143621, + "ewc_loss": 0.007382895331829786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.382895273622125e-05, + "grad_norm": 3.69425630569458, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8676179647445679, + "num_tokens": 300040502.0, + "step": 7865 + }, + { + "epoch": 1.0006360513929526, + "ewc_loss": 0.007376132998615503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.376132998615503e-05, + 
"grad_norm": 3.6085503101348877, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.885372519493103, + "num_tokens": 300078795.0, + "step": 7866 + }, + { + "epoch": 1.0007632616715432, + "ewc_loss": 0.007316024042665958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.31602412997745e-05, + "grad_norm": 3.6504931449890137, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8804619908332825, + "num_tokens": 300115188.0, + "step": 7867 + }, + { + "epoch": 1.0008904719501335, + "ewc_loss": 0.007369872648268938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.36987276468426e-05, + "grad_norm": 3.6374058723449707, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8727589249610901, + "num_tokens": 300158583.0, + "step": 7868 + }, + { + "epoch": 1.001017682228724, + "ewc_loss": 0.007335633039474487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.335632835747674e-05, + "grad_norm": 3.695875644683838, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8758238554000854, + "num_tokens": 300194415.0, + "step": 7869 + }, + { + "epoch": 1.0011448925073145, + "ewc_loss": 0.0073608518578112125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.360852032434195e-05, + "grad_norm": 3.619231939315796, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8658652305603027, + "num_tokens": 300235066.0, + "step": 7870 + }, + { + "epoch": 1.001272102785905, + "ewc_loss": 0.0072905137203633785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.290513894986361e-05, + "grad_norm": 3.682426929473877, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8735795617103577, + "num_tokens": 300272167.0, + "step": 7871 + }, + { + "epoch": 1.0013993130644956, + "ewc_loss": 0.007365458644926548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.365458441199735e-05, + "grad_norm": 3.732023239135742, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.862606406211853, + 
"num_tokens": 300306032.0, + "step": 7872 + }, + { + "epoch": 1.0015265233430861, + "ewc_loss": 0.007352746557444334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.352746615651995e-05, + "grad_norm": 3.71456241607666, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8698866963386536, + "num_tokens": 300338433.0, + "step": 7873 + }, + { + "epoch": 1.0016537336216766, + "ewc_loss": 0.007346432656049728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.346432539634407e-05, + "grad_norm": 3.7220656871795654, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8616654872894287, + "num_tokens": 300376527.0, + "step": 7874 + }, + { + "epoch": 1.0017809439002672, + "ewc_loss": 0.007366145960986614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.366146019194275e-05, + "grad_norm": 3.5956497192382812, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8886218070983887, + "num_tokens": 300417309.0, + "step": 7875 + }, + { + "epoch": 1.0019081541788577, + "ewc_loss": 0.0072815921157598495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.28159211575985e-05, + "grad_norm": 3.6226117610931396, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8728625774383545, + "num_tokens": 300459438.0, + "step": 7876 + }, + { + "epoch": 1.0020353644574482, + "ewc_loss": 0.007342817261815071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.342817116295919e-05, + "grad_norm": 3.6727521419525146, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8484121561050415, + "num_tokens": 300501443.0, + "step": 7877 + }, + { + "epoch": 1.0021625747360388, + "ewc_loss": 0.007367557380348444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.367557554971427e-05, + "grad_norm": 3.747753858566284, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8809322118759155, + "num_tokens": 300531918.0, + "step": 7878 + }, + { + "epoch": 1.0022897850146293, + "ewc_loss": 0.007380284834653139, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.380284660030156e-05, + "grad_norm": 3.611575126647949, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8649149537086487, + "num_tokens": 300575084.0, + "step": 7879 + }, + { + "epoch": 1.0024169952932196, + "ewc_loss": 0.007300385739654303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.300385914277285e-05, + "grad_norm": 3.7110400199890137, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8712559938430786, + "num_tokens": 300608737.0, + "step": 7880 + }, + { + "epoch": 1.0025442055718101, + "ewc_loss": 0.007415858097374439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.415858271997422e-05, + "grad_norm": 3.6592116355895996, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8614514470100403, + "num_tokens": 300647855.0, + "step": 7881 + }, + { + "epoch": 1.0026714158504006, + "ewc_loss": 0.007342077791690826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.342077879002318e-05, + "grad_norm": 3.635371208190918, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8718969821929932, + "num_tokens": 300689763.0, + "step": 7882 + }, + { + "epoch": 1.0027986261289912, + "ewc_loss": 0.007357236463576555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.357236609095708e-05, + "grad_norm": 3.679553747177124, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8815451860427856, + "num_tokens": 300725838.0, + "step": 7883 + }, + { + "epoch": 1.0029258364075817, + "ewc_loss": 0.007405523676425219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.405523501802236e-05, + "grad_norm": 3.71097469329834, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8767505288124084, + "num_tokens": 300760423.0, + "step": 7884 + }, + { + "epoch": 1.0030530466861722, + "ewc_loss": 0.007397188805043697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.397188892355189e-05, + "grad_norm": 3.655787467956543, + "learning_rate": 
1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8771318793296814, + "num_tokens": 300796236.0, + "step": 7885 + }, + { + "epoch": 1.0031802569647628, + "ewc_loss": 0.00735162477940321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.351624662987888e-05, + "grad_norm": 3.6568827629089355, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8762286901473999, + "num_tokens": 300835213.0, + "step": 7886 + }, + { + "epoch": 1.0033074672433533, + "ewc_loss": 0.007376125548034906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.376125722657889e-05, + "grad_norm": 3.6509933471679688, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8783641457557678, + "num_tokens": 300873363.0, + "step": 7887 + }, + { + "epoch": 1.0034346775219438, + "ewc_loss": 0.007385903969407082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.38590388209559e-05, + "grad_norm": 3.761059045791626, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8664308786392212, + "num_tokens": 300910233.0, + "step": 7888 + }, + { + "epoch": 1.0035618878005343, + "ewc_loss": 0.007443389855325222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.44338976801373e-05, + "grad_norm": 3.654024124145508, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.878108024597168, + "num_tokens": 300946986.0, + "step": 7889 + }, + { + "epoch": 1.0036890980791249, + "ewc_loss": 0.007345461752265692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.345461926888674e-05, + "grad_norm": 3.6479032039642334, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8741079568862915, + "num_tokens": 300985750.0, + "step": 7890 + }, + { + "epoch": 1.0038163083577154, + "ewc_loss": 0.007393723353743553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.393723353743553e-05, + "grad_norm": 3.6212258338928223, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8698697686195374, + "num_tokens": 301032820.0, + "step": 7891 + }, + 
{ + "epoch": 1.0039435186363057, + "ewc_loss": 0.007366362027823925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.366362115135416e-05, + "grad_norm": 3.6489555835723877, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8627545833587646, + "num_tokens": 301070825.0, + "step": 7892 + }, + { + "epoch": 1.0040707289148962, + "ewc_loss": 0.0073930490761995316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.393048872472718e-05, + "grad_norm": 3.676616668701172, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8684583306312561, + "num_tokens": 301110764.0, + "step": 7893 + }, + { + "epoch": 1.0041979391934868, + "ewc_loss": 0.007382530719041824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.382530748145655e-05, + "grad_norm": 3.6016945838928223, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8751028776168823, + "num_tokens": 301152421.0, + "step": 7894 + }, + { + "epoch": 1.0043251494720773, + "ewc_loss": 0.007330328691750765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.330328662646934e-05, + "grad_norm": 3.647580623626709, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8715654611587524, + "num_tokens": 301192780.0, + "step": 7895 + }, + { + "epoch": 1.0044523597506678, + "ewc_loss": 0.007383684162050486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.383683987427503e-05, + "grad_norm": 3.7256810665130615, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8680312037467957, + "num_tokens": 301228147.0, + "step": 7896 + }, + { + "epoch": 1.0045795700292584, + "ewc_loss": 0.0074218083173036575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.421808550134301e-05, + "grad_norm": 3.7334976196289062, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8817278146743774, + "num_tokens": 301259568.0, + "step": 7897 + }, + { + "epoch": 1.0047067803078489, + "ewc_loss": 0.007389367092400789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.389367237919942e-05, + "grad_norm": 3.6723568439483643, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8722116351127625, + "num_tokens": 301296483.0, + "step": 7898 + }, + { + "epoch": 1.0048339905864394, + "ewc_loss": 0.007359080947935581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.359081064350903e-05, + "grad_norm": 3.611816167831421, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8741171360015869, + "num_tokens": 301337437.0, + "step": 7899 + }, + { + "epoch": 1.00496120086503, + "ewc_loss": 0.007369604893028736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.369605009444058e-05, + "grad_norm": 3.7421200275421143, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8706364631652832, + "num_tokens": 301368777.0, + "step": 7900 + }, + { + "epoch": 1.0050884111436205, + "ewc_loss": 0.007450680248439312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.450680277543142e-05, + "grad_norm": 3.6923232078552246, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8679252862930298, + "num_tokens": 301404885.0, + "step": 7901 + }, + { + "epoch": 1.005215621422211, + "ewc_loss": 0.007391203194856644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.391202962026e-05, + "grad_norm": 3.650635004043579, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8624014258384705, + "num_tokens": 301445308.0, + "step": 7902 + }, + { + "epoch": 1.0053428317008015, + "ewc_loss": 0.007397341076284647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.397340959869325e-05, + "grad_norm": 3.656348466873169, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8678119778633118, + "num_tokens": 301484845.0, + "step": 7903 + }, + { + "epoch": 1.0054700419793918, + "ewc_loss": 0.007415145169943571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.415145228151232e-05, + "grad_norm": 3.662001848220825, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 
0.8684297800064087, + "num_tokens": 301524498.0, + "step": 7904 + }, + { + "epoch": 1.0055972522579824, + "ewc_loss": 0.007417734712362289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.41773474146612e-05, + "grad_norm": 3.62121319770813, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8763296604156494, + "num_tokens": 301565522.0, + "step": 7905 + }, + { + "epoch": 1.0057244625365729, + "ewc_loss": 0.007387041114270687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.387041114270687e-05, + "grad_norm": 3.6630020141601562, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8672360181808472, + "num_tokens": 301603724.0, + "step": 7906 + }, + { + "epoch": 1.0058516728151634, + "ewc_loss": 0.007432218175381422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.432218262692913e-05, + "grad_norm": 3.754519462585449, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8686410188674927, + "num_tokens": 301636650.0, + "step": 7907 + }, + { + "epoch": 1.005978883093754, + "ewc_loss": 0.007483034394681454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.483034278266132e-05, + "grad_norm": 3.660038709640503, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8627667427062988, + "num_tokens": 301675328.0, + "step": 7908 + }, + { + "epoch": 1.0061060933723445, + "ewc_loss": 0.007403329946100712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.40332980058156e-05, + "grad_norm": 3.7159969806671143, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8645853996276855, + "num_tokens": 301710058.0, + "step": 7909 + }, + { + "epoch": 1.006233303650935, + "ewc_loss": 0.007477353326976299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.477353210560977e-05, + "grad_norm": 3.6100592613220215, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8658435344696045, + "num_tokens": 301753567.0, + "step": 7910 + }, + { + "epoch": 1.0063605139295255, + "ewc_loss": 
0.0074038319289684296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.403831841656938e-05, + "grad_norm": 3.6035265922546387, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8776633143424988, + "num_tokens": 301792895.0, + "step": 7911 + }, + { + "epoch": 1.006487724208116, + "ewc_loss": 0.007442274130880833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.442274363711476e-05, + "grad_norm": 3.666104316711426, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8816764950752258, + "num_tokens": 301833001.0, + "step": 7912 + }, + { + "epoch": 1.0066149344867066, + "ewc_loss": 0.0074815223924815655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.481522334273905e-05, + "grad_norm": 3.683032989501953, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8754851818084717, + "num_tokens": 301871623.0, + "step": 7913 + }, + { + "epoch": 1.006742144765297, + "ewc_loss": 0.007462641224265099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.462641224265099e-05, + "grad_norm": 3.6719114780426025, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8826795816421509, + "num_tokens": 301903750.0, + "step": 7914 + }, + { + "epoch": 1.0068693550438876, + "ewc_loss": 0.007462130393832922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.462130452040583e-05, + "grad_norm": 3.644244909286499, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8710095882415771, + "num_tokens": 301943079.0, + "step": 7915 + }, + { + "epoch": 1.0069965653224782, + "ewc_loss": 0.007442138157784939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.442138303304091e-05, + "grad_norm": 3.6480231285095215, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.877794086933136, + "num_tokens": 301980031.0, + "step": 7916 + }, + { + "epoch": 1.0071237756010685, + "ewc_loss": 0.0074551766738295555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.455176819348708e-05, + "grad_norm": 
3.6458370685577393, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8716319799423218, + "num_tokens": 302020454.0, + "step": 7917 + }, + { + "epoch": 1.007250985879659, + "ewc_loss": 0.007473446894437075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.473446748917922e-05, + "grad_norm": 3.6679723262786865, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8656066060066223, + "num_tokens": 302064391.0, + "step": 7918 + }, + { + "epoch": 1.0073781961582495, + "ewc_loss": 0.00746881403028965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.468814146704972e-05, + "grad_norm": 3.6830077171325684, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8700295686721802, + "num_tokens": 302101598.0, + "step": 7919 + }, + { + "epoch": 1.00750540643684, + "ewc_loss": 0.007472052704542875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.472052675439045e-05, + "grad_norm": 3.6549763679504395, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8753151297569275, + "num_tokens": 302138803.0, + "step": 7920 + }, + { + "epoch": 1.0076326167154306, + "ewc_loss": 0.007456254214048386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.456254388671368e-05, + "grad_norm": 3.718916654586792, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8673998713493347, + "num_tokens": 302173682.0, + "step": 7921 + }, + { + "epoch": 1.0077598269940211, + "ewc_loss": 0.0074880896136164665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.488089613616467e-05, + "grad_norm": 3.671234607696533, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8674942851066589, + "num_tokens": 302207390.0, + "step": 7922 + }, + { + "epoch": 1.0078870372726116, + "ewc_loss": 0.00745568098500371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.455681043211371e-05, + "grad_norm": 3.68039870262146, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8769514560699463, + 
"num_tokens": 302244440.0, + "step": 7923 + }, + { + "epoch": 1.0080142475512022, + "ewc_loss": 0.007472841534763575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.472841389244422e-05, + "grad_norm": 3.654528856277466, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8665878772735596, + "num_tokens": 302281058.0, + "step": 7924 + }, + { + "epoch": 1.0081414578297927, + "ewc_loss": 0.007451316341757774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.451316196238622e-05, + "grad_norm": 3.6244966983795166, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.886290431022644, + "num_tokens": 302319586.0, + "step": 7925 + }, + { + "epoch": 1.0082686681083832, + "ewc_loss": 0.007456147577613592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.45614743209444e-05, + "grad_norm": 3.651510238647461, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8645352125167847, + "num_tokens": 302361356.0, + "step": 7926 + }, + { + "epoch": 1.0083958783869738, + "ewc_loss": 0.007474077399820089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.474077574443072e-05, + "grad_norm": 3.667999505996704, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8796101808547974, + "num_tokens": 302399974.0, + "step": 7927 + }, + { + "epoch": 1.0085230886655643, + "ewc_loss": 0.00747706787660718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.477067993022501e-05, + "grad_norm": 3.6966285705566406, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8759071826934814, + "num_tokens": 302435709.0, + "step": 7928 + }, + { + "epoch": 1.0086502989441546, + "ewc_loss": 0.007472252007573843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.472252036677673e-05, + "grad_norm": 3.6558444499969482, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8678240776062012, + "num_tokens": 302474151.0, + "step": 7929 + }, + { + "epoch": 1.0087775092227451, + "ewc_loss": 0.007447566371411085, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.447566167684272e-05, + "grad_norm": 3.7265050411224365, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8742834329605103, + "num_tokens": 302509877.0, + "step": 7930 + }, + { + "epoch": 1.0089047195013356, + "ewc_loss": 0.007489430718123913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.48943057260476e-05, + "grad_norm": 3.6526377201080322, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8846611380577087, + "num_tokens": 302547689.0, + "step": 7931 + }, + { + "epoch": 1.0090319297799262, + "ewc_loss": 0.007431878708302975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.431878475472331e-05, + "grad_norm": 3.6547043323516846, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8702567219734192, + "num_tokens": 302590728.0, + "step": 7932 + }, + { + "epoch": 1.0091591400585167, + "ewc_loss": 0.007449378725141287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.449378608725965e-05, + "grad_norm": 3.7519209384918213, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8490439653396606, + "num_tokens": 302625709.0, + "step": 7933 + }, + { + "epoch": 1.0092863503371072, + "ewc_loss": 0.007510497234761715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.510497380280867e-05, + "grad_norm": 3.6857919692993164, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8792189955711365, + "num_tokens": 302662086.0, + "step": 7934 + }, + { + "epoch": 1.0094135606156978, + "ewc_loss": 0.0074268486350774765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.426848605973646e-05, + "grad_norm": 3.583688974380493, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8875033259391785, + "num_tokens": 302703680.0, + "step": 7935 + }, + { + "epoch": 1.0095407708942883, + "ewc_loss": 0.007404780481010675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.404780626529828e-05, + "grad_norm": 3.697972297668457, + 
"learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8576610088348389, + "num_tokens": 302740426.0, + "step": 7936 + }, + { + "epoch": 1.0096679811728788, + "ewc_loss": 0.007512186653912067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.51218685763888e-05, + "grad_norm": 3.615550994873047, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8823021650314331, + "num_tokens": 302780778.0, + "step": 7937 + }, + { + "epoch": 1.0097951914514693, + "ewc_loss": 0.007413939572870731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.413939601974562e-05, + "grad_norm": 3.673489570617676, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8740519881248474, + "num_tokens": 302816012.0, + "step": 7938 + }, + { + "epoch": 1.0099224017300599, + "ewc_loss": 0.007470420561730862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.470420678146183e-05, + "grad_norm": 3.6742711067199707, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8504678010940552, + "num_tokens": 302858325.0, + "step": 7939 + }, + { + "epoch": 1.0100496120086504, + "ewc_loss": 0.007466932758688927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.466932584065944e-05, + "grad_norm": 3.6869564056396484, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8723764419555664, + "num_tokens": 302898210.0, + "step": 7940 + }, + { + "epoch": 1.0101768222872407, + "ewc_loss": 0.007462685462087393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.462685607606545e-05, + "grad_norm": 3.661552906036377, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8711071610450745, + "num_tokens": 302936072.0, + "step": 7941 + }, + { + "epoch": 1.0103040325658312, + "ewc_loss": 0.007437459658831358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.437459862558171e-05, + "grad_norm": 3.635944128036499, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8642531633377075, + "num_tokens": 302978941.0, + 
"step": 7942 + }, + { + "epoch": 1.0104312428444218, + "ewc_loss": 0.007442896254360676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.442896458087489e-05, + "grad_norm": 3.711935043334961, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8637030720710754, + "num_tokens": 303016664.0, + "step": 7943 + }, + { + "epoch": 1.0105584531230123, + "ewc_loss": 0.007482401095330715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.482401269953698e-05, + "grad_norm": 3.588283061981201, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8694449067115784, + "num_tokens": 303063313.0, + "step": 7944 + }, + { + "epoch": 1.0106856634016028, + "ewc_loss": 0.007392446510493755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.392446423182264e-05, + "grad_norm": 3.70786714553833, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8619804978370667, + "num_tokens": 303103786.0, + "step": 7945 + }, + { + "epoch": 1.0108128736801933, + "ewc_loss": 0.0075163161382079124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.51631596358493e-05, + "grad_norm": 3.6971187591552734, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8775618672370911, + "num_tokens": 303136928.0, + "step": 7946 + }, + { + "epoch": 1.0109400839587839, + "ewc_loss": 0.007443699520081282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.443699723808095e-05, + "grad_norm": 3.6602654457092285, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8791699409484863, + "num_tokens": 303174854.0, + "step": 7947 + }, + { + "epoch": 1.0110672942373744, + "ewc_loss": 0.007415127009153366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.415127038257197e-05, + "grad_norm": 3.6359987258911133, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8700228929519653, + "num_tokens": 303214234.0, + "step": 7948 + }, + { + "epoch": 1.011194504515965, + "ewc_loss": 0.007426985539495945, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.426985393976793e-05, + "grad_norm": 3.672849178314209, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8842570185661316, + "num_tokens": 303251830.0, + "step": 7949 + }, + { + "epoch": 1.0113217147945555, + "ewc_loss": 0.007439163979142904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.439163891831413e-05, + "grad_norm": 3.6535584926605225, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.861491322517395, + "num_tokens": 303291564.0, + "step": 7950 + }, + { + "epoch": 1.011448925073146, + "ewc_loss": 0.007423722185194492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.423722126986831e-05, + "grad_norm": 3.6981818675994873, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8696891069412231, + "num_tokens": 303328642.0, + "step": 7951 + }, + { + "epoch": 1.0115761353517365, + "ewc_loss": 0.007466705050319433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.46670484659262e-05, + "grad_norm": 3.6097488403320312, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.869236409664154, + "num_tokens": 303373693.0, + "step": 7952 + }, + { + "epoch": 1.0117033456303268, + "ewc_loss": 0.00738503597676754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.385035860352218e-05, + "grad_norm": 3.7247283458709717, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.87529456615448, + "num_tokens": 303407124.0, + "step": 7953 + }, + { + "epoch": 1.0118305559089174, + "ewc_loss": 0.007487944792956114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.487944822059944e-05, + "grad_norm": 3.686749219894409, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8808063268661499, + "num_tokens": 303441586.0, + "step": 7954 + }, + { + "epoch": 1.0119577661875079, + "ewc_loss": 0.007436864543706179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.436864689225331e-05, + "grad_norm": 3.77698016166687, + "learning_rate": 1e-06, + "loss": 0.3789, + 
"mean_token_accuracy": 0.8723469972610474, + "num_tokens": 303473593.0, + "step": 7955 + }, + { + "epoch": 1.0120849764660984, + "ewc_loss": 0.007499207276850939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.499207276850939e-05, + "grad_norm": 3.678595542907715, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8693598508834839, + "num_tokens": 303510239.0, + "step": 7956 + }, + { + "epoch": 1.012212186744689, + "ewc_loss": 0.007422180380672216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.422180351568386e-05, + "grad_norm": 3.7106010913848877, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8687418103218079, + "num_tokens": 303544709.0, + "step": 7957 + }, + { + "epoch": 1.0123393970232795, + "ewc_loss": 0.007482759188860655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.482759247068316e-05, + "grad_norm": 3.6256210803985596, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8818585872650146, + "num_tokens": 303581990.0, + "step": 7958 + }, + { + "epoch": 1.01246660730187, + "ewc_loss": 0.007428294979035854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.428295066347346e-05, + "grad_norm": 3.6379854679107666, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8760141730308533, + "num_tokens": 303625167.0, + "step": 7959 + }, + { + "epoch": 1.0125938175804605, + "ewc_loss": 0.007469247095286846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.469247066183016e-05, + "grad_norm": 3.669980049133301, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8722110986709595, + "num_tokens": 303664751.0, + "step": 7960 + }, + { + "epoch": 1.012721027859051, + "ewc_loss": 0.007478839717805386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.478839688701555e-05, + "grad_norm": 3.6815147399902344, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8689615726470947, + "num_tokens": 303702190.0, + "step": 7961 + }, + { + "epoch": 
1.0128482381376416, + "ewc_loss": 0.007479097228497267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.479097257601097e-05, + "grad_norm": 3.6304898262023926, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8857652544975281, + "num_tokens": 303742872.0, + "step": 7962 + }, + { + "epoch": 1.012975448416232, + "ewc_loss": 0.0074359034188091755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.435903535224497e-05, + "grad_norm": 3.6754493713378906, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8636105060577393, + "num_tokens": 303781265.0, + "step": 7963 + }, + { + "epoch": 1.0131026586948226, + "ewc_loss": 0.007499586325138807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.499586354242638e-05, + "grad_norm": 3.685838460922241, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8724654912948608, + "num_tokens": 303820686.0, + "step": 7964 + }, + { + "epoch": 1.0132298689734132, + "ewc_loss": 0.00748140225186944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.48140228097327e-05, + "grad_norm": 3.619036912918091, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8743243217468262, + "num_tokens": 303863428.0, + "step": 7965 + }, + { + "epoch": 1.0133570792520035, + "ewc_loss": 0.007435470819473267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.435470615746453e-05, + "grad_norm": 3.666370153427124, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8676818013191223, + "num_tokens": 303903847.0, + "step": 7966 + }, + { + "epoch": 1.013484289530594, + "ewc_loss": 0.007466913666576147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.466913666576147e-05, + "grad_norm": 3.671518564224243, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.863624632358551, + "num_tokens": 303943083.0, + "step": 7967 + }, + { + "epoch": 1.0136114998091845, + "ewc_loss": 0.007462979760020971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.462979556294158e-05, + "grad_norm": 3.6784474849700928, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.882624626159668, + "num_tokens": 303978773.0, + "step": 7968 + }, + { + "epoch": 1.013738710087775, + "ewc_loss": 0.007457548752427101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.457548781530932e-05, + "grad_norm": 3.631305694580078, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8767156600952148, + "num_tokens": 304017058.0, + "step": 7969 + }, + { + "epoch": 1.0138659203663656, + "ewc_loss": 0.007425461895763874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.425461808452383e-05, + "grad_norm": 3.6865732669830322, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8637558817863464, + "num_tokens": 304054104.0, + "step": 7970 + }, + { + "epoch": 1.013993130644956, + "ewc_loss": 0.0074804434552788734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.480443309759721e-05, + "grad_norm": 3.650568962097168, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8860552906990051, + "num_tokens": 304091100.0, + "step": 7971 + }, + { + "epoch": 1.0141203409235466, + "ewc_loss": 0.007432729005813599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.432729034917429e-05, + "grad_norm": 3.6815505027770996, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8720252513885498, + "num_tokens": 304127747.0, + "step": 7972 + }, + { + "epoch": 1.0142475512021372, + "ewc_loss": 0.007456120569258928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.456120511051267e-05, + "grad_norm": 3.5881309509277344, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8725221157073975, + "num_tokens": 304171089.0, + "step": 7973 + }, + { + "epoch": 1.0143747614807277, + "ewc_loss": 0.0073876166716217995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.387616642517969e-05, + "grad_norm": 3.686220407485962, + "learning_rate": 1e-06, + "loss": 0.4009, + 
"mean_token_accuracy": 0.8635783791542053, + "num_tokens": 304210207.0, + "step": 7974 + }, + { + "epoch": 1.0145019717593182, + "ewc_loss": 0.007483501452952623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.483501394744962e-05, + "grad_norm": 3.6686489582061768, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.887991726398468, + "num_tokens": 304249423.0, + "step": 7975 + }, + { + "epoch": 1.0146291820379088, + "ewc_loss": 0.0074112918227910995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.41129188099876e-05, + "grad_norm": 3.651484966278076, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8748536109924316, + "num_tokens": 304286872.0, + "step": 7976 + }, + { + "epoch": 1.0147563923164993, + "ewc_loss": 0.007427961099892855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.427961099892855e-05, + "grad_norm": 3.7166237831115723, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.875088095664978, + "num_tokens": 304323033.0, + "step": 7977 + }, + { + "epoch": 1.0148836025950896, + "ewc_loss": 0.007471853401511908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.471853314200416e-05, + "grad_norm": 3.7078750133514404, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8757885098457336, + "num_tokens": 304360096.0, + "step": 7978 + }, + { + "epoch": 1.0150108128736801, + "ewc_loss": 0.007432141341269016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.432141137542203e-05, + "grad_norm": 3.639167308807373, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8915492296218872, + "num_tokens": 304396364.0, + "step": 7979 + }, + { + "epoch": 1.0151380231522706, + "ewc_loss": 0.0074099646881222725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.409964746329933e-05, + "grad_norm": 3.720546245574951, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8716893196105957, + "num_tokens": 304431583.0, + "step": 7980 + }, + { + "epoch": 
1.0152652334308612, + "ewc_loss": 0.00747034652158618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.470346463378519e-05, + "grad_norm": 3.6659603118896484, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8661791086196899, + "num_tokens": 304470531.0, + "step": 7981 + }, + { + "epoch": 1.0153924437094517, + "ewc_loss": 0.007428640034049749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.428639946738258e-05, + "grad_norm": 3.7137584686279297, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8758699893951416, + "num_tokens": 304505187.0, + "step": 7982 + }, + { + "epoch": 1.0155196539880422, + "ewc_loss": 0.007474895566701889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.474895392078906e-05, + "grad_norm": 3.7226223945617676, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.849537193775177, + "num_tokens": 304542685.0, + "step": 7983 + }, + { + "epoch": 1.0156468642666328, + "ewc_loss": 0.007465185597538948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.465185626642779e-05, + "grad_norm": 3.6600146293640137, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8718845248222351, + "num_tokens": 304579204.0, + "step": 7984 + }, + { + "epoch": 1.0157740745452233, + "ewc_loss": 0.007427203468978405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.427203672705218e-05, + "grad_norm": 3.6753687858581543, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8802436590194702, + "num_tokens": 304615224.0, + "step": 7985 + }, + { + "epoch": 1.0159012848238138, + "ewc_loss": 0.00747216260060668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.472162542399019e-05, + "grad_norm": 3.681671380996704, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8682823181152344, + "num_tokens": 304653513.0, + "step": 7986 + }, + { + "epoch": 1.0160284951024043, + "ewc_loss": 0.007476402446627617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.476402242900804e-05, + "grad_norm": 3.684246778488159, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8771791458129883, + "num_tokens": 304686744.0, + "step": 7987 + }, + { + "epoch": 1.0161557053809949, + "ewc_loss": 0.00748059106990695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.480591011699289e-05, + "grad_norm": 3.6354496479034424, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8713057637214661, + "num_tokens": 304727262.0, + "step": 7988 + }, + { + "epoch": 1.0162829156595854, + "ewc_loss": 0.007442576345056295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.442576315952465e-05, + "grad_norm": 3.6727869510650635, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8695127367973328, + "num_tokens": 304763418.0, + "step": 7989 + }, + { + "epoch": 1.0164101259381757, + "ewc_loss": 0.007504408713430166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.504408858949319e-05, + "grad_norm": 3.6571919918060303, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8757164478302002, + "num_tokens": 304800763.0, + "step": 7990 + }, + { + "epoch": 1.0165373362167662, + "ewc_loss": 0.007451305165886879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.451305282302201e-05, + "grad_norm": 3.671647071838379, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.890175461769104, + "num_tokens": 304833136.0, + "step": 7991 + }, + { + "epoch": 1.0166645464953568, + "ewc_loss": 0.007475520949810743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.475521124433726e-05, + "grad_norm": 3.636864185333252, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8814521431922913, + "num_tokens": 304873208.0, + "step": 7992 + }, + { + "epoch": 1.0167917567739473, + "ewc_loss": 0.007446590345352888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.44659046176821e-05, + "grad_norm": 3.6466567516326904, + "learning_rate": 1e-06, + "loss": 0.3726, + 
"mean_token_accuracy": 0.8735532760620117, + "num_tokens": 304914237.0, + "step": 7993 + }, + { + "epoch": 1.0169189670525378, + "ewc_loss": 0.007465382106602192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.465382077498361e-05, + "grad_norm": 3.6144869327545166, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8870521187782288, + "num_tokens": 304955453.0, + "step": 7994 + }, + { + "epoch": 1.0170461773311283, + "ewc_loss": 0.007438968867063522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.438968896167353e-05, + "grad_norm": 3.731714963912964, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8717157244682312, + "num_tokens": 304990533.0, + "step": 7995 + }, + { + "epoch": 1.0171733876097189, + "ewc_loss": 0.0075295064598321915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.529506547143683e-05, + "grad_norm": 3.746514320373535, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8704147338867188, + "num_tokens": 305024849.0, + "step": 7996 + }, + { + "epoch": 1.0173005978883094, + "ewc_loss": 0.007487447001039982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.487447146559134e-05, + "grad_norm": 3.669215440750122, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8694744110107422, + "num_tokens": 305061615.0, + "step": 7997 + }, + { + "epoch": 1.0174278081669, + "ewc_loss": 0.007425302639603615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.425302464980632e-05, + "grad_norm": 3.666217565536499, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8531004190444946, + "num_tokens": 305102251.0, + "step": 7998 + }, + { + "epoch": 1.0175550184454905, + "ewc_loss": 0.0074602337554097176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.460233609890565e-05, + "grad_norm": 3.700597047805786, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8822818398475647, + "num_tokens": 305135405.0, + "step": 7999 + }, + { + "epoch": 
1.017682228724081, + "ewc_loss": 0.007489574141800404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.48957390896976e-05, + "grad_norm": 3.770416259765625, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8721369504928589, + "num_tokens": 305169005.0, + "step": 8000 + }, + { + "epoch": 1.0178094390026715, + "ewc_loss": 0.007521257735788822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.521257793996483e-05, + "grad_norm": 3.6395890712738037, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8679487109184265, + "num_tokens": 305209050.0, + "step": 8001 + }, + { + "epoch": 1.0179366492812618, + "ewc_loss": 0.007435500621795654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.435500447172672e-05, + "grad_norm": 3.7419135570526123, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8727508187294006, + "num_tokens": 305238250.0, + "step": 8002 + }, + { + "epoch": 1.0180638595598523, + "ewc_loss": 0.0075437151826918125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.54371503717266e-05, + "grad_norm": 3.6034064292907715, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8750492930412292, + "num_tokens": 305283544.0, + "step": 8003 + }, + { + "epoch": 1.0181910698384429, + "ewc_loss": 0.007434978615492582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.434978761011735e-05, + "grad_norm": 3.7063047885894775, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8560303449630737, + "num_tokens": 305320548.0, + "step": 8004 + }, + { + "epoch": 1.0183182801170334, + "ewc_loss": 0.0075322650372982025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.53226486267522e-05, + "grad_norm": 3.6511435508728027, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8782688975334167, + "num_tokens": 305359560.0, + "step": 8005 + }, + { + "epoch": 1.018445490395624, + "ewc_loss": 0.0074663120321929455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.466311944881454e-05, + "grad_norm": 3.692201852798462, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8702554702758789, + "num_tokens": 305396680.0, + "step": 8006 + }, + { + "epoch": 1.0185727006742145, + "ewc_loss": 0.007519515231251717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.519515202147886e-05, + "grad_norm": 3.6576125621795654, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8819273710250854, + "num_tokens": 305432856.0, + "step": 8007 + }, + { + "epoch": 1.018699910952805, + "ewc_loss": 0.007476984988898039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.4769850471057e-05, + "grad_norm": 3.6690473556518555, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8737986087799072, + "num_tokens": 305469973.0, + "step": 8008 + }, + { + "epoch": 1.0188271212313955, + "ewc_loss": 0.007498989813029766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.498989725718275e-05, + "grad_norm": 3.7264022827148438, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8784826993942261, + "num_tokens": 305505576.0, + "step": 8009 + }, + { + "epoch": 1.018954331509986, + "ewc_loss": 0.007516176905483007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.516176992794499e-05, + "grad_norm": 3.7648508548736572, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8541175127029419, + "num_tokens": 305542037.0, + "step": 8010 + }, + { + "epoch": 1.0190815417885766, + "ewc_loss": 0.007520802319049835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.520802319049835e-05, + "grad_norm": 3.672062397003174, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8794408440589905, + "num_tokens": 305587784.0, + "step": 8011 + }, + { + "epoch": 1.019208752067167, + "ewc_loss": 0.007429128047078848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.42912816349417e-05, + "grad_norm": 3.7150702476501465, + "learning_rate": 1e-06, + "loss": 0.4067, + 
"mean_token_accuracy": 0.8644832968711853, + "num_tokens": 305621888.0, + "step": 8012 + }, + { + "epoch": 1.0193359623457576, + "ewc_loss": 0.007491531316190958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.491531141567975e-05, + "grad_norm": 3.613027334213257, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8904045820236206, + "num_tokens": 305663844.0, + "step": 8013 + }, + { + "epoch": 1.0194631726243482, + "ewc_loss": 0.007422727532684803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.422727503580973e-05, + "grad_norm": 3.706852436065674, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8864701986312866, + "num_tokens": 305695913.0, + "step": 8014 + }, + { + "epoch": 1.0195903829029385, + "ewc_loss": 0.007514188997447491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.514189201174304e-05, + "grad_norm": 3.682755708694458, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8713109493255615, + "num_tokens": 305733632.0, + "step": 8015 + }, + { + "epoch": 1.019717593181529, + "ewc_loss": 0.007442903704941273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.442903734045103e-05, + "grad_norm": 3.7353761196136475, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8558582067489624, + "num_tokens": 305766917.0, + "step": 8016 + }, + { + "epoch": 1.0198448034601195, + "ewc_loss": 0.007489101029932499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.489100971724838e-05, + "grad_norm": 3.6564924716949463, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8673222064971924, + "num_tokens": 305805768.0, + "step": 8017 + }, + { + "epoch": 1.01997201373871, + "ewc_loss": 0.007418733090162277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.418733002850786e-05, + "grad_norm": 3.6369729042053223, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8737267255783081, + "num_tokens": 305843412.0, + "step": 8018 + }, + { + "epoch": 
1.0200992240173006, + "ewc_loss": 0.007448229007422924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.448229007422924e-05, + "grad_norm": 3.6216495037078857, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8891415596008301, + "num_tokens": 305880523.0, + "step": 8019 + }, + { + "epoch": 1.020226434295891, + "ewc_loss": 0.007461403496563435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.461403583874926e-05, + "grad_norm": 3.702287197113037, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8767598867416382, + "num_tokens": 305915379.0, + "step": 8020 + }, + { + "epoch": 1.0203536445744816, + "ewc_loss": 0.00751684932038188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.51684929127805e-05, + "grad_norm": 3.6594338417053223, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8859830498695374, + "num_tokens": 305952471.0, + "step": 8021 + }, + { + "epoch": 1.0204808548530722, + "ewc_loss": 0.007451925426721573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.451925193890929e-05, + "grad_norm": 3.6043877601623535, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8730112910270691, + "num_tokens": 305994023.0, + "step": 8022 + }, + { + "epoch": 1.0206080651316627, + "ewc_loss": 0.007454195059835911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.454195292666554e-05, + "grad_norm": 3.70540452003479, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.876410961151123, + "num_tokens": 306026563.0, + "step": 8023 + }, + { + "epoch": 1.0207352754102532, + "ewc_loss": 0.00752397021278739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52397027099505e-05, + "grad_norm": 3.6125504970550537, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8701238632202148, + "num_tokens": 306067913.0, + "step": 8024 + }, + { + "epoch": 1.0208624856888437, + "ewc_loss": 0.007454278413206339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.454278238583356e-05, 
+ "grad_norm": 3.6857621669769287, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8703583478927612, + "num_tokens": 306108692.0, + "step": 8025 + }, + { + "epoch": 1.0209896959674343, + "ewc_loss": 0.00751815689727664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.518156780861318e-05, + "grad_norm": 3.649150848388672, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8744581937789917, + "num_tokens": 306148082.0, + "step": 8026 + }, + { + "epoch": 1.0211169062460246, + "ewc_loss": 0.007461181841790676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.461181667167693e-05, + "grad_norm": 3.682528257369995, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8804808855056763, + "num_tokens": 306182800.0, + "step": 8027 + }, + { + "epoch": 1.021244116524615, + "ewc_loss": 0.007503734901547432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.503735105274245e-05, + "grad_norm": 3.6086795330047607, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8695669174194336, + "num_tokens": 306226741.0, + "step": 8028 + }, + { + "epoch": 1.0213713268032056, + "ewc_loss": 0.007420537527650595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.420537440339103e-05, + "grad_norm": 3.672389507293701, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8709594011306763, + "num_tokens": 306265993.0, + "step": 8029 + }, + { + "epoch": 1.0214985370817962, + "ewc_loss": 0.007483982481062412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.48398233554326e-05, + "grad_norm": 3.6673355102539062, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8764914274215698, + "num_tokens": 306306511.0, + "step": 8030 + }, + { + "epoch": 1.0216257473603867, + "ewc_loss": 0.007447970565408468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.44797071092762e-05, + "grad_norm": 3.7237772941589355, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 
0.8870981931686401, + "num_tokens": 306337687.0, + "step": 8031 + }, + { + "epoch": 1.0217529576389772, + "ewc_loss": 0.007482615765184164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.482615910703316e-05, + "grad_norm": 3.7053656578063965, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8388338088989258, + "num_tokens": 306379646.0, + "step": 8032 + }, + { + "epoch": 1.0218801679175677, + "ewc_loss": 0.007434063591063023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.43406344554387e-05, + "grad_norm": 3.690502643585205, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8730307817459106, + "num_tokens": 306415389.0, + "step": 8033 + }, + { + "epoch": 1.0220073781961583, + "ewc_loss": 0.007455097511410713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.455097511410713e-05, + "grad_norm": 3.6802289485931396, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.867773175239563, + "num_tokens": 306452035.0, + "step": 8034 + }, + { + "epoch": 1.0221345884747488, + "ewc_loss": 0.007434652652591467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.434652798110619e-05, + "grad_norm": 3.6389026641845703, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8719451427459717, + "num_tokens": 306496277.0, + "step": 8035 + }, + { + "epoch": 1.0222617987533393, + "ewc_loss": 0.007421969436109066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.421969348797575e-05, + "grad_norm": 3.711822748184204, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8770465850830078, + "num_tokens": 306528212.0, + "step": 8036 + }, + { + "epoch": 1.0223890090319299, + "ewc_loss": 0.007481110282242298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.481110515072942e-05, + "grad_norm": 3.7488367557525635, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8761241436004639, + "num_tokens": 306562047.0, + "step": 8037 + }, + { + "epoch": 1.0225162193105204, + "ewc_loss": 
0.007477221544831991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.477221515728161e-05, + "grad_norm": 3.7243525981903076, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.864186704158783, + "num_tokens": 306601298.0, + "step": 8038 + }, + { + "epoch": 1.0226434295891107, + "ewc_loss": 0.0074483477510511875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.448347605532035e-05, + "grad_norm": 3.6548831462860107, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8725701570510864, + "num_tokens": 306640961.0, + "step": 8039 + }, + { + "epoch": 1.0227706398677012, + "ewc_loss": 0.007405174430459738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.405174255836755e-05, + "grad_norm": 3.609179735183716, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8910554051399231, + "num_tokens": 306679507.0, + "step": 8040 + }, + { + "epoch": 1.0228978501462918, + "ewc_loss": 0.007415669970214367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.415669824695215e-05, + "grad_norm": 3.691579580307007, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8623418807983398, + "num_tokens": 306717337.0, + "step": 8041 + }, + { + "epoch": 1.0230250604248823, + "ewc_loss": 0.007485334295779467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.485334208467975e-05, + "grad_norm": 3.703695058822632, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8481672406196594, + "num_tokens": 306754152.0, + "step": 8042 + }, + { + "epoch": 1.0231522707034728, + "ewc_loss": 0.007464121095836163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.464121154043823e-05, + "grad_norm": 3.69347882270813, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8660421371459961, + "num_tokens": 306794887.0, + "step": 8043 + }, + { + "epoch": 1.0232794809820633, + "ewc_loss": 0.007460481021553278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.460480992449448e-05, + "grad_norm": 
3.6952767372131348, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8742949366569519, + "num_tokens": 306828374.0, + "step": 8044 + }, + { + "epoch": 1.0234066912606539, + "ewc_loss": 0.007468164898455143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.468165131285787e-05, + "grad_norm": 3.6591360569000244, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8753244876861572, + "num_tokens": 306865285.0, + "step": 8045 + }, + { + "epoch": 1.0235339015392444, + "ewc_loss": 0.007453482132405043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.453482248820364e-05, + "grad_norm": 3.701794385910034, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8791462182998657, + "num_tokens": 306897195.0, + "step": 8046 + }, + { + "epoch": 1.023661111817835, + "ewc_loss": 0.0074913715943694115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.491371798096225e-05, + "grad_norm": 3.6761629581451416, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8724720478057861, + "num_tokens": 306935352.0, + "step": 8047 + }, + { + "epoch": 1.0237883220964255, + "ewc_loss": 0.007461594883352518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.461594941560179e-05, + "grad_norm": 3.729945659637451, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8780452609062195, + "num_tokens": 306964621.0, + "step": 8048 + }, + { + "epoch": 1.023915532375016, + "ewc_loss": 0.00751345744356513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.513457239838317e-05, + "grad_norm": 3.6888904571533203, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8728890419006348, + "num_tokens": 307003451.0, + "step": 8049 + }, + { + "epoch": 1.0240427426536065, + "ewc_loss": 0.007487985771149397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.487985567422584e-05, + "grad_norm": 3.6678102016448975, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.871563196182251, + 
"num_tokens": 307044695.0, + "step": 8050 + }, + { + "epoch": 1.0241699529321968, + "ewc_loss": 0.007490469608455896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.490469579352066e-05, + "grad_norm": 3.665864944458008, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8798360824584961, + "num_tokens": 307080603.0, + "step": 8051 + }, + { + "epoch": 1.0242971632107873, + "ewc_loss": 0.007490352261811495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.490352436434478e-05, + "grad_norm": 3.676405906677246, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8666449785232544, + "num_tokens": 307122300.0, + "step": 8052 + }, + { + "epoch": 1.0244243734893779, + "ewc_loss": 0.007504455279558897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.504455425078049e-05, + "grad_norm": 3.6936590671539307, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8784127235412598, + "num_tokens": 307158685.0, + "step": 8053 + }, + { + "epoch": 1.0245515837679684, + "ewc_loss": 0.0075105419382452965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.510541763622314e-05, + "grad_norm": 3.649730682373047, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.877476692199707, + "num_tokens": 307199104.0, + "step": 8054 + }, + { + "epoch": 1.024678794046559, + "ewc_loss": 0.007478542625904083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.478542829630896e-05, + "grad_norm": 3.7297706604003906, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8651488423347473, + "num_tokens": 307235840.0, + "step": 8055 + }, + { + "epoch": 1.0248060043251495, + "ewc_loss": 0.007542662788182497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.542662933701649e-05, + "grad_norm": 3.710824728012085, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8620579242706299, + "num_tokens": 307273089.0, + "step": 8056 + }, + { + "epoch": 1.02493321460374, + "ewc_loss": 0.007506986614316702, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.506986730732024e-05, + "grad_norm": 3.6550984382629395, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8735584616661072, + "num_tokens": 307318825.0, + "step": 8057 + }, + { + "epoch": 1.0250604248823305, + "ewc_loss": 0.007489826995879412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.489827112294734e-05, + "grad_norm": 3.6996099948883057, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8780174255371094, + "num_tokens": 307357006.0, + "step": 8058 + }, + { + "epoch": 1.025187635160921, + "ewc_loss": 0.007521107327193022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52110718167387e-05, + "grad_norm": 3.732116937637329, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8701983094215393, + "num_tokens": 307390274.0, + "step": 8059 + }, + { + "epoch": 1.0253148454395116, + "ewc_loss": 0.007520284969359636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.520284998463467e-05, + "grad_norm": 3.668909788131714, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8803938627243042, + "num_tokens": 307427157.0, + "step": 8060 + }, + { + "epoch": 1.025442055718102, + "ewc_loss": 0.00747607322409749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.476073369616643e-05, + "grad_norm": 3.6759419441223145, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8710938096046448, + "num_tokens": 307466555.0, + "step": 8061 + }, + { + "epoch": 1.0255692659966926, + "ewc_loss": 0.0075037721544504166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.503772212658077e-05, + "grad_norm": 3.710564374923706, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8655107021331787, + "num_tokens": 307503375.0, + "step": 8062 + }, + { + "epoch": 1.0256964762752832, + "ewc_loss": 0.007521438412368298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.521438237745315e-05, + "grad_norm": 3.6682894229888916, + "learning_rate": 
1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8826366662979126, + "num_tokens": 307543170.0, + "step": 8063 + }, + { + "epoch": 1.0258236865538735, + "ewc_loss": 0.007473289500921965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.473289588233456e-05, + "grad_norm": 3.673056125640869, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8681122064590454, + "num_tokens": 307582890.0, + "step": 8064 + }, + { + "epoch": 1.025950896832464, + "ewc_loss": 0.0074963076040148735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.496307807741687e-05, + "grad_norm": 3.7323169708251953, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8651962280273438, + "num_tokens": 307617643.0, + "step": 8065 + }, + { + "epoch": 1.0260781071110545, + "ewc_loss": 0.0075332997366786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.533299503847957e-05, + "grad_norm": 3.7079572677612305, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8704333305358887, + "num_tokens": 307652258.0, + "step": 8066 + }, + { + "epoch": 1.026205317389645, + "ewc_loss": 0.0074976589530706406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.49765895307064e-05, + "grad_norm": 3.6624574661254883, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8693703413009644, + "num_tokens": 307693245.0, + "step": 8067 + }, + { + "epoch": 1.0263325276682356, + "ewc_loss": 0.0074800411239266396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.480040949303657e-05, + "grad_norm": 3.727191209793091, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8767473697662354, + "num_tokens": 307728751.0, + "step": 8068 + }, + { + "epoch": 1.026459737946826, + "ewc_loss": 0.007541012950241566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.541012746514753e-05, + "grad_norm": 3.674757480621338, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8610680103302002, + "num_tokens": 307767869.0, + "step": 8069 + }, + 
{ + "epoch": 1.0265869482254166, + "ewc_loss": 0.007461113389581442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.46111327316612e-05, + "grad_norm": 3.639230728149414, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8813717365264893, + "num_tokens": 307809438.0, + "step": 8070 + }, + { + "epoch": 1.0267141585040072, + "ewc_loss": 0.007468669209629297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.46866935514845e-05, + "grad_norm": 3.7211437225341797, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8541584014892578, + "num_tokens": 307848752.0, + "step": 8071 + }, + { + "epoch": 1.0268413687825977, + "ewc_loss": 0.007517422549426556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.517422636738047e-05, + "grad_norm": 3.7086477279663086, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8672739863395691, + "num_tokens": 307886080.0, + "step": 8072 + }, + { + "epoch": 1.0269685790611882, + "ewc_loss": 0.007463289424777031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.463289512088522e-05, + "grad_norm": 3.6792707443237305, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8847860097885132, + "num_tokens": 307922854.0, + "step": 8073 + }, + { + "epoch": 1.0270957893397787, + "ewc_loss": 0.007457666099071503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.45766592444852e-05, + "grad_norm": 3.681629180908203, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8703457117080688, + "num_tokens": 307960032.0, + "step": 8074 + }, + { + "epoch": 1.0272229996183693, + "ewc_loss": 0.007483608555048704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.483608351321891e-05, + "grad_norm": 3.734269142150879, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8718889951705933, + "num_tokens": 307996074.0, + "step": 8075 + }, + { + "epoch": 1.0273502098969596, + "ewc_loss": 0.00750869233161211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.508692215196788e-05, + "grad_norm": 3.7192869186401367, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8844336271286011, + "num_tokens": 308030833.0, + "step": 8076 + }, + { + "epoch": 1.02747742017555, + "ewc_loss": 0.007455166429281235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.455166633008048e-05, + "grad_norm": 3.6274189949035645, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.881877064704895, + "num_tokens": 308079620.0, + "step": 8077 + }, + { + "epoch": 1.0276046304541406, + "ewc_loss": 0.007426710333675146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.426710362778977e-05, + "grad_norm": 3.7145235538482666, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8667226433753967, + "num_tokens": 308118555.0, + "step": 8078 + }, + { + "epoch": 1.0277318407327312, + "ewc_loss": 0.007517000660300255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.517000631196424e-05, + "grad_norm": 3.7789993286132812, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8580324053764343, + "num_tokens": 308154410.0, + "step": 8079 + }, + { + "epoch": 1.0278590510113217, + "ewc_loss": 0.0075036827474832535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.503682718379423e-05, + "grad_norm": 3.6789982318878174, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8741639852523804, + "num_tokens": 308191990.0, + "step": 8080 + }, + { + "epoch": 1.0279862612899122, + "ewc_loss": 0.007435785606503487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.435785664711148e-05, + "grad_norm": 3.7076096534729004, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8533240556716919, + "num_tokens": 308231047.0, + "step": 8081 + }, + { + "epoch": 1.0281134715685027, + "ewc_loss": 0.007507675793021917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.507675763918087e-05, + "grad_norm": 3.716338872909546, + "learning_rate": 1e-06, + "loss": 0.3434, + 
"mean_token_accuracy": 0.8858903646469116, + "num_tokens": 308269487.0, + "step": 8082 + }, + { + "epoch": 1.0282406818470933, + "ewc_loss": 0.0075008380226790905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.500837818952277e-05, + "grad_norm": 3.718010425567627, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8724482655525208, + "num_tokens": 308304974.0, + "step": 8083 + }, + { + "epoch": 1.0283678921256838, + "ewc_loss": 0.007490896619856358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.490896678064018e-05, + "grad_norm": 3.672328472137451, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8939541578292847, + "num_tokens": 308342072.0, + "step": 8084 + }, + { + "epoch": 1.0284951024042743, + "ewc_loss": 0.007462950423359871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.462950452463701e-05, + "grad_norm": 3.732180595397949, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8694547414779663, + "num_tokens": 308374293.0, + "step": 8085 + }, + { + "epoch": 1.0286223126828649, + "ewc_loss": 0.007514048833400011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.51404877519235e-05, + "grad_norm": 3.6370010375976562, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8764677047729492, + "num_tokens": 308415872.0, + "step": 8086 + }, + { + "epoch": 1.0287495229614554, + "ewc_loss": 0.007455090060830116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.455090235453099e-05, + "grad_norm": 3.691737413406372, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8666144609451294, + "num_tokens": 308451198.0, + "step": 8087 + }, + { + "epoch": 1.0288767332400457, + "ewc_loss": 0.007507775444537401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.507775444537401e-05, + "grad_norm": 3.66506290435791, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8803215026855469, + "num_tokens": 308490554.0, + "step": 8088 + }, + { + "epoch": 
1.0290039435186362, + "ewc_loss": 0.007469093427062035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.469093543477356e-05, + "grad_norm": 3.6708338260650635, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8668323755264282, + "num_tokens": 308530221.0, + "step": 8089 + }, + { + "epoch": 1.0291311537972267, + "ewc_loss": 0.007487835828214884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.487835682695732e-05, + "grad_norm": 3.751471519470215, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8590131402015686, + "num_tokens": 308563634.0, + "step": 8090 + }, + { + "epoch": 1.0292583640758173, + "ewc_loss": 0.007525644265115261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.525644468842074e-05, + "grad_norm": 3.673841714859009, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8737275004386902, + "num_tokens": 308605033.0, + "step": 8091 + }, + { + "epoch": 1.0293855743544078, + "ewc_loss": 0.007454313803464174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.454313890775666e-05, + "grad_norm": 3.691988706588745, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8726698160171509, + "num_tokens": 308642603.0, + "step": 8092 + }, + { + "epoch": 1.0295127846329983, + "ewc_loss": 0.007484111003577709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.484111119993031e-05, + "grad_norm": 3.634056806564331, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8712469339370728, + "num_tokens": 308685189.0, + "step": 8093 + }, + { + "epoch": 1.0296399949115889, + "ewc_loss": 0.00744390906766057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.443909271387383e-05, + "grad_norm": 3.7065629959106445, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8723072409629822, + "num_tokens": 308726052.0, + "step": 8094 + }, + { + "epoch": 1.0297672051901794, + "ewc_loss": 0.007496927864849567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.496927719330415e-05, + "grad_norm": 3.695986747741699, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8610374331474304, + "num_tokens": 308764828.0, + "step": 8095 + }, + { + "epoch": 1.02989441546877, + "ewc_loss": 0.00746923265978694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.469232514267787e-05, + "grad_norm": 3.6772255897521973, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8820700645446777, + "num_tokens": 308799712.0, + "step": 8096 + }, + { + "epoch": 1.0300216257473604, + "ewc_loss": 0.0074577778577804565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.457777974195778e-05, + "grad_norm": 3.6742284297943115, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.872133731842041, + "num_tokens": 308839879.0, + "step": 8097 + }, + { + "epoch": 1.030148836025951, + "ewc_loss": 0.007477478124201298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.477478357031941e-05, + "grad_norm": 3.7106893062591553, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8647321462631226, + "num_tokens": 308880362.0, + "step": 8098 + }, + { + "epoch": 1.0302760463045415, + "ewc_loss": 0.007498961873352528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.498962077079341e-05, + "grad_norm": 3.644240617752075, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8824514150619507, + "num_tokens": 308921395.0, + "step": 8099 + }, + { + "epoch": 1.0304032565831318, + "ewc_loss": 0.007430632598698139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.430632831528783e-05, + "grad_norm": 3.640152931213379, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8813410997390747, + "num_tokens": 308959976.0, + "step": 8100 + }, + { + "epoch": 1.0305304668617223, + "ewc_loss": 0.00744390394538641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.443904178217053e-05, + "grad_norm": 3.673492193222046, + "learning_rate": 1e-06, + "loss": 0.3322, + 
"mean_token_accuracy": 0.8849889039993286, + "num_tokens": 309000702.0, + "step": 8101 + }, + { + "epoch": 1.0306576771403129, + "ewc_loss": 0.007463289424777031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.463289512088522e-05, + "grad_norm": 3.688673496246338, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8894807696342468, + "num_tokens": 309036540.0, + "step": 8102 + }, + { + "epoch": 1.0307848874189034, + "ewc_loss": 0.007445255294442177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.445255323546007e-05, + "grad_norm": 3.6832895278930664, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8721133470535278, + "num_tokens": 309072408.0, + "step": 8103 + }, + { + "epoch": 1.030912097697494, + "ewc_loss": 0.007453558500856161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.453558646375313e-05, + "grad_norm": 3.719069719314575, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8603843450546265, + "num_tokens": 309110366.0, + "step": 8104 + }, + { + "epoch": 1.0310393079760845, + "ewc_loss": 0.007467944175004959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.467943942174315e-05, + "grad_norm": 3.6180269718170166, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8742830753326416, + "num_tokens": 309155194.0, + "step": 8105 + }, + { + "epoch": 1.031166518254675, + "ewc_loss": 0.007393005769699812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.393005944322795e-05, + "grad_norm": 3.700359344482422, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8819254636764526, + "num_tokens": 309193747.0, + "step": 8106 + }, + { + "epoch": 1.0312937285332655, + "ewc_loss": 0.007480103522539139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.480103522539139e-05, + "grad_norm": 3.661891460418701, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8598031401634216, + "num_tokens": 309234161.0, + "step": 8107 + }, + { + "epoch": 
1.031420938811856, + "ewc_loss": 0.007418993394821882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.418993482133374e-05, + "grad_norm": 3.686100482940674, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.861718475818634, + "num_tokens": 309272574.0, + "step": 8108 + }, + { + "epoch": 1.0315481490904466, + "ewc_loss": 0.007448315620422363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.448315591318533e-05, + "grad_norm": 3.7479820251464844, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8738195896148682, + "num_tokens": 309306567.0, + "step": 8109 + }, + { + "epoch": 1.031675359369037, + "ewc_loss": 0.007478868123143911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.47886806493625e-05, + "grad_norm": 3.6167945861816406, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8787544965744019, + "num_tokens": 309347218.0, + "step": 8110 + }, + { + "epoch": 1.0318025696476276, + "ewc_loss": 0.007383689284324646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.383689080597833e-05, + "grad_norm": 3.6511340141296387, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8870722055435181, + "num_tokens": 309385701.0, + "step": 8111 + }, + { + "epoch": 1.0319297799262181, + "ewc_loss": 0.007450280711054802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.450280827470124e-05, + "grad_norm": 3.6722300052642822, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8699906468391418, + "num_tokens": 309424600.0, + "step": 8112 + }, + { + "epoch": 1.0320569902048085, + "ewc_loss": 0.007438420318067074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.438420288963243e-05, + "grad_norm": 3.7597649097442627, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8515750169754028, + "num_tokens": 309459929.0, + "step": 8113 + }, + { + "epoch": 1.032184200483399, + "ewc_loss": 0.007484118454158306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.484118395950645e-05, + "grad_norm": 3.6626734733581543, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8639571666717529, + "num_tokens": 309505085.0, + "step": 8114 + }, + { + "epoch": 1.0323114107619895, + "ewc_loss": 0.007402706425637007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.402706251014024e-05, + "grad_norm": 3.6683743000030518, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8740150332450867, + "num_tokens": 309544746.0, + "step": 8115 + }, + { + "epoch": 1.03243862104058, + "ewc_loss": 0.007451818790286779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.451818964909762e-05, + "grad_norm": 3.672234296798706, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8603579998016357, + "num_tokens": 309584301.0, + "step": 8116 + }, + { + "epoch": 1.0325658313191706, + "ewc_loss": 0.007447496522217989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.447496318491176e-05, + "grad_norm": 3.685288429260254, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8753926753997803, + "num_tokens": 309622570.0, + "step": 8117 + }, + { + "epoch": 1.032693041597761, + "ewc_loss": 0.007447015959769487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.44701610528864e-05, + "grad_norm": 3.6458382606506348, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8563119173049927, + "num_tokens": 309666417.0, + "step": 8118 + }, + { + "epoch": 1.0328202518763516, + "ewc_loss": 0.007427988573908806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.427988748531789e-05, + "grad_norm": 3.6878538131713867, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8637217283248901, + "num_tokens": 309705706.0, + "step": 8119 + }, + { + "epoch": 1.0329474621549422, + "ewc_loss": 0.007459399756044149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.45939978514798e-05, + "grad_norm": 3.645394802093506, + "learning_rate": 1e-06, + "loss": 0.3424, + 
"mean_token_accuracy": 0.8809852004051208, + "num_tokens": 309744288.0, + "step": 8120 + }, + { + "epoch": 1.0330746724335327, + "ewc_loss": 0.00741716381162405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.417163578793406e-05, + "grad_norm": 3.708402156829834, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8605265617370605, + "num_tokens": 309780406.0, + "step": 8121 + }, + { + "epoch": 1.0332018827121232, + "ewc_loss": 0.007469093427062035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.469093543477356e-05, + "grad_norm": 3.8136231899261475, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8430519104003906, + "num_tokens": 309811135.0, + "step": 8122 + }, + { + "epoch": 1.0333290929907137, + "ewc_loss": 0.007517732679843903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.517732592532411e-05, + "grad_norm": 3.6845691204071045, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8771094083786011, + "num_tokens": 309847037.0, + "step": 8123 + }, + { + "epoch": 1.0334563032693043, + "ewc_loss": 0.0074053313583135605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.405331416521221e-05, + "grad_norm": 3.687117099761963, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8731566667556763, + "num_tokens": 309884274.0, + "step": 8124 + }, + { + "epoch": 1.0335835135478946, + "ewc_loss": 0.007471331395208836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.471331628039479e-05, + "grad_norm": 3.759242057800293, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8837717771530151, + "num_tokens": 309915572.0, + "step": 8125 + }, + { + "epoch": 1.033710723826485, + "ewc_loss": 0.007509575691074133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.50957551645115e-05, + "grad_norm": 3.649153709411621, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8763691186904907, + "num_tokens": 309954502.0, + "step": 8126 + }, + { + "epoch": 
1.0338379341050756, + "ewc_loss": 0.007431765086948872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.43176497053355e-05, + "grad_norm": 3.6181907653808594, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8925030827522278, + "num_tokens": 309993015.0, + "step": 8127 + }, + { + "epoch": 1.0339651443836662, + "ewc_loss": 0.00745724979788065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.457249739672989e-05, + "grad_norm": 3.729966878890991, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8613077998161316, + "num_tokens": 310030840.0, + "step": 8128 + }, + { + "epoch": 1.0340923546622567, + "ewc_loss": 0.007530920207500458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.530920265708119e-05, + "grad_norm": 3.698758363723755, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8728768229484558, + "num_tokens": 310067586.0, + "step": 8129 + }, + { + "epoch": 1.0342195649408472, + "ewc_loss": 0.007473666686564684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.473666482837871e-05, + "grad_norm": 3.6925721168518066, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8813889026641846, + "num_tokens": 310104496.0, + "step": 8130 + }, + { + "epoch": 1.0343467752194377, + "ewc_loss": 0.007493752986192703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.493753219023347e-05, + "grad_norm": 3.7106473445892334, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8560080528259277, + "num_tokens": 310141162.0, + "step": 8131 + }, + { + "epoch": 1.0344739854980283, + "ewc_loss": 0.007517149671912193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.517149788327515e-05, + "grad_norm": 3.6535873413085938, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8703323602676392, + "num_tokens": 310182965.0, + "step": 8132 + }, + { + "epoch": 1.0346011957766188, + "ewc_loss": 0.007465449161827564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.465449016308412e-05, + "grad_norm": 3.7235665321350098, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8564633131027222, + "num_tokens": 310219572.0, + "step": 8133 + }, + { + "epoch": 1.0347284060552093, + "ewc_loss": 0.007531710434705019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.531710434705019e-05, + "grad_norm": 3.825697422027588, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8626573085784912, + "num_tokens": 310248711.0, + "step": 8134 + }, + { + "epoch": 1.0348556163337999, + "ewc_loss": 0.007565515115857124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.565515261376277e-05, + "grad_norm": 3.6777656078338623, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8565534949302673, + "num_tokens": 310287915.0, + "step": 8135 + }, + { + "epoch": 1.0349828266123904, + "ewc_loss": 0.007457941770553589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.457941683242097e-05, + "grad_norm": 3.773291826248169, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8714415431022644, + "num_tokens": 310325619.0, + "step": 8136 + }, + { + "epoch": 1.0351100368909807, + "ewc_loss": 0.007576669100672007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57666930439882e-05, + "grad_norm": 3.711549997329712, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8799614906311035, + "num_tokens": 310362456.0, + "step": 8137 + }, + { + "epoch": 1.0352372471695712, + "ewc_loss": 0.007509943563491106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.509943679906428e-05, + "grad_norm": 3.6937832832336426, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8672961592674255, + "num_tokens": 310399584.0, + "step": 8138 + }, + { + "epoch": 1.0353644574481617, + "ewc_loss": 0.007511403411626816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.511403237003833e-05, + "grad_norm": 3.684555768966675, + "learning_rate": 1e-06, + "loss": 0.3907, + 
"mean_token_accuracy": 0.8656002283096313, + "num_tokens": 310437435.0, + "step": 8139 + }, + { + "epoch": 1.0354916677267523, + "ewc_loss": 0.007526439614593983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.526439731009305e-05, + "grad_norm": 3.6682190895080566, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.876236617565155, + "num_tokens": 310476320.0, + "step": 8140 + }, + { + "epoch": 1.0356188780053428, + "ewc_loss": 0.007519447710365057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.519447535742074e-05, + "grad_norm": 3.682222604751587, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8866245746612549, + "num_tokens": 310512956.0, + "step": 8141 + }, + { + "epoch": 1.0357460882839333, + "ewc_loss": 0.007524936459958553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.524936518166214e-05, + "grad_norm": 3.7378737926483154, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8709971904754639, + "num_tokens": 310547605.0, + "step": 8142 + }, + { + "epoch": 1.0358732985625239, + "ewc_loss": 0.0075693270191550255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.569327135570347e-05, + "grad_norm": 3.706465482711792, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8705666661262512, + "num_tokens": 310586991.0, + "step": 8143 + }, + { + "epoch": 1.0360005088411144, + "ewc_loss": 0.007539323065429926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.539323269156739e-05, + "grad_norm": 3.7420101165771484, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8715461492538452, + "num_tokens": 310621271.0, + "step": 8144 + }, + { + "epoch": 1.036127719119705, + "ewc_loss": 0.007564445026218891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.56444496801123e-05, + "grad_norm": 3.6735761165618896, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8746216893196106, + "num_tokens": 310662154.0, + "step": 8145 + }, + { + "epoch": 
1.0362549293982954, + "ewc_loss": 0.007524941582232714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.524941611336544e-05, + "grad_norm": 3.6572048664093018, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8831429481506348, + "num_tokens": 310701924.0, + "step": 8146 + }, + { + "epoch": 1.036382139676886, + "ewc_loss": 0.0075407070107758045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.540707156294957e-05, + "grad_norm": 3.7103474140167236, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8770588636398315, + "num_tokens": 310739688.0, + "step": 8147 + }, + { + "epoch": 1.0365093499554765, + "ewc_loss": 0.007567196618765593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.567196735180914e-05, + "grad_norm": 3.743450164794922, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8644640445709229, + "num_tokens": 310773317.0, + "step": 8148 + }, + { + "epoch": 1.0366365602340668, + "ewc_loss": 0.007588677573949099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.588677544845268e-05, + "grad_norm": 3.6770756244659424, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8696091175079346, + "num_tokens": 310810854.0, + "step": 8149 + }, + { + "epoch": 1.0367637705126573, + "ewc_loss": 0.007541244849562645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.541244849562645e-05, + "grad_norm": 3.7600884437561035, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8657594323158264, + "num_tokens": 310846080.0, + "step": 8150 + }, + { + "epoch": 1.0368909807912479, + "ewc_loss": 0.007619901560246944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.619901589350775e-05, + "grad_norm": 3.700768232345581, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8497308492660522, + "num_tokens": 310885032.0, + "step": 8151 + }, + { + "epoch": 1.0370181910698384, + "ewc_loss": 0.0075562261044979095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.556226046290249e-05, + "grad_norm": 3.6537702083587646, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8773728609085083, + "num_tokens": 310924518.0, + "step": 8152 + }, + { + "epoch": 1.037145401348429, + "ewc_loss": 0.007549532689154148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.549532892880961e-05, + "grad_norm": 3.701262950897217, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.873937726020813, + "num_tokens": 310961734.0, + "step": 8153 + }, + { + "epoch": 1.0372726116270194, + "ewc_loss": 0.0075928750447928905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.59287504479289e-05, + "grad_norm": 3.6171205043792725, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.874997079372406, + "num_tokens": 311004541.0, + "step": 8154 + }, + { + "epoch": 1.03739982190561, + "ewc_loss": 0.0075402590446174145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.540258957305923e-05, + "grad_norm": 3.645817279815674, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8809759616851807, + "num_tokens": 311043947.0, + "step": 8155 + }, + { + "epoch": 1.0375270321842005, + "ewc_loss": 0.007577597163617611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.577596988994628e-05, + "grad_norm": 3.6663365364074707, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8680362701416016, + "num_tokens": 311083154.0, + "step": 8156 + }, + { + "epoch": 1.037654242462791, + "ewc_loss": 0.007584014907479286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.5840151112061e-05, + "grad_norm": 3.708944320678711, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8834365010261536, + "num_tokens": 311121429.0, + "step": 8157 + }, + { + "epoch": 1.0377814527413816, + "ewc_loss": 0.007586913648992777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58691385271959e-05, + "grad_norm": 3.6464176177978516, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 
0.8573033809661865, + "num_tokens": 311166578.0, + "step": 8158 + }, + { + "epoch": 1.037908663019972, + "ewc_loss": 0.007523898035287857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52389823901467e-05, + "grad_norm": 3.65929913520813, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8746326565742493, + "num_tokens": 311209645.0, + "step": 8159 + }, + { + "epoch": 1.0380358732985626, + "ewc_loss": 0.007553537376224995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.553537579951808e-05, + "grad_norm": 3.6877095699310303, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8763670921325684, + "num_tokens": 311248428.0, + "step": 8160 + }, + { + "epoch": 1.0381630835771531, + "ewc_loss": 0.007552743889391422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.5527437729761e-05, + "grad_norm": 3.7331831455230713, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8731342554092407, + "num_tokens": 311282040.0, + "step": 8161 + }, + { + "epoch": 1.0382902938557435, + "ewc_loss": 0.007539678364992142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.539678335888311e-05, + "grad_norm": 3.667409896850586, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8786667585372925, + "num_tokens": 311320233.0, + "step": 8162 + }, + { + "epoch": 1.038417504134334, + "ewc_loss": 0.007475448306649923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.475448364857584e-05, + "grad_norm": 3.716134548187256, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8567456603050232, + "num_tokens": 311357586.0, + "step": 8163 + }, + { + "epoch": 1.0385447144129245, + "ewc_loss": 0.00752155901864171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52155901864171e-05, + "grad_norm": 3.675785541534424, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8718981742858887, + "num_tokens": 311395545.0, + "step": 8164 + }, + { + "epoch": 1.038671924691515, + "ewc_loss": 
0.007480868138372898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.48086822568439e-05, + "grad_norm": 3.729992628097534, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.871453046798706, + "num_tokens": 311428785.0, + "step": 8165 + }, + { + "epoch": 1.0387991349701056, + "ewc_loss": 0.007541097700595856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.541097875218838e-05, + "grad_norm": 3.7178618907928467, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8605225682258606, + "num_tokens": 311469546.0, + "step": 8166 + }, + { + "epoch": 1.038926345248696, + "ewc_loss": 0.0074776578694581985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.477658073185012e-05, + "grad_norm": 3.7194020748138428, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8642954230308533, + "num_tokens": 311504064.0, + "step": 8167 + }, + { + "epoch": 1.0390535555272866, + "ewc_loss": 0.007509712129831314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.509712304454297e-05, + "grad_norm": 3.7816877365112305, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8547261953353882, + "num_tokens": 311536600.0, + "step": 8168 + }, + { + "epoch": 1.0391807658058771, + "ewc_loss": 0.00753428740426898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.534287578891963e-05, + "grad_norm": 3.6486928462982178, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8879445791244507, + "num_tokens": 311573638.0, + "step": 8169 + }, + { + "epoch": 1.0393079760844677, + "ewc_loss": 0.007442296016961336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.442296191584319e-05, + "grad_norm": 3.676995277404785, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8639557957649231, + "num_tokens": 311611085.0, + "step": 8170 + }, + { + "epoch": 1.0394351863630582, + "ewc_loss": 0.007516293320804834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.516293408116326e-05, + "grad_norm": 
3.6919803619384766, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8656697273254395, + "num_tokens": 311655497.0, + "step": 8171 + }, + { + "epoch": 1.0395623966416487, + "ewc_loss": 0.007507223170250654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.507223199354485e-05, + "grad_norm": 3.725414752960205, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8571146726608276, + "num_tokens": 311688167.0, + "step": 8172 + }, + { + "epoch": 1.0396896069202393, + "ewc_loss": 0.007516638841480017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.516639016103e-05, + "grad_norm": 3.712033987045288, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8714345097541809, + "num_tokens": 311724926.0, + "step": 8173 + }, + { + "epoch": 1.0398168171988296, + "ewc_loss": 0.007511976640671492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.511976582463831e-05, + "grad_norm": 3.711418390274048, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8534204959869385, + "num_tokens": 311760729.0, + "step": 8174 + }, + { + "epoch": 1.03994402747742, + "ewc_loss": 0.007528094109147787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52809428377077e-05, + "grad_norm": 3.6448981761932373, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8719683885574341, + "num_tokens": 311802138.0, + "step": 8175 + }, + { + "epoch": 1.0400712377560106, + "ewc_loss": 0.007499719504266977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.499719504266977e-05, + "grad_norm": 3.6354458332061768, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8799315690994263, + "num_tokens": 311838510.0, + "step": 8176 + }, + { + "epoch": 1.0401984480346012, + "ewc_loss": 0.007509984541684389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.509984425269067e-05, + "grad_norm": 3.7778515815734863, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8646743297576904, + "num_tokens": 
311870875.0, + "step": 8177 + }, + { + "epoch": 1.0403256583131917, + "ewc_loss": 0.007613263092935085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.613263005623594e-05, + "grad_norm": 3.6587777137756348, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8833571672439575, + "num_tokens": 311905854.0, + "step": 8178 + }, + { + "epoch": 1.0404528685917822, + "ewc_loss": 0.007484797388315201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.484797242796049e-05, + "grad_norm": 3.7652368545532227, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8692651987075806, + "num_tokens": 311936381.0, + "step": 8179 + }, + { + "epoch": 1.0405800788703727, + "ewc_loss": 0.007625507190823555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.625506987096742e-05, + "grad_norm": 3.705928325653076, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8649742603302002, + "num_tokens": 311972529.0, + "step": 8180 + }, + { + "epoch": 1.0407072891489633, + "ewc_loss": 0.007534821052104235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.534820906585082e-05, + "grad_norm": 3.6681694984436035, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8881396651268005, + "num_tokens": 312007883.0, + "step": 8181 + }, + { + "epoch": 1.0408344994275538, + "ewc_loss": 0.007530194241553545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.530194125138223e-05, + "grad_norm": 3.7552876472473145, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8660575747489929, + "num_tokens": 312041000.0, + "step": 8182 + }, + { + "epoch": 1.0409617097061443, + "ewc_loss": 0.007601158693432808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.601158722536638e-05, + "grad_norm": 3.683720588684082, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8742152452468872, + "num_tokens": 312076011.0, + "step": 8183 + }, + { + "epoch": 1.0410889199847349, + "ewc_loss": 0.0075424425303936005, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.54244247218594e-05, + "grad_norm": 3.6799886226654053, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8640971779823303, + "num_tokens": 312115614.0, + "step": 8184 + }, + { + "epoch": 1.0412161302633254, + "ewc_loss": 0.007578318938612938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.578318763989955e-05, + "grad_norm": 3.6806302070617676, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8657891154289246, + "num_tokens": 312154111.0, + "step": 8185 + }, + { + "epoch": 1.0413433405419157, + "ewc_loss": 0.0075757610611617565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57576126488857e-05, + "grad_norm": 3.6708850860595703, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8667446374893188, + "num_tokens": 312191571.0, + "step": 8186 + }, + { + "epoch": 1.0414705508205062, + "ewc_loss": 0.007572850678116083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.572850881842896e-05, + "grad_norm": 3.7345967292785645, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8614476323127747, + "num_tokens": 312229205.0, + "step": 8187 + }, + { + "epoch": 1.0415977610990967, + "ewc_loss": 0.007621502969413996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.621503027621657e-05, + "grad_norm": 3.69508957862854, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8856250643730164, + "num_tokens": 312266604.0, + "step": 8188 + }, + { + "epoch": 1.0417249713776873, + "ewc_loss": 0.0075727179646492004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.572717731818557e-05, + "grad_norm": 3.64508318901062, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8618404865264893, + "num_tokens": 312305222.0, + "step": 8189 + }, + { + "epoch": 1.0418521816562778, + "ewc_loss": 0.007547530345618725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.547530549345538e-05, + "grad_norm": 3.69756817817688, + "learning_rate": 
1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8506234884262085, + "num_tokens": 312345823.0, + "step": 8190 + }, + { + "epoch": 1.0419793919348683, + "ewc_loss": 0.007586451712995768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58645182941109e-05, + "grad_norm": 3.7129366397857666, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8641451597213745, + "num_tokens": 312383569.0, + "step": 8191 + }, + { + "epoch": 1.0421066022134589, + "ewc_loss": 0.0075743356719613075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.574335904791951e-05, + "grad_norm": 3.6640050411224365, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8743224143981934, + "num_tokens": 312423068.0, + "step": 8192 + }, + { + "epoch": 1.0422338124920494, + "ewc_loss": 0.007537115830928087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.537115743616596e-05, + "grad_norm": 3.7108216285705566, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.865406334400177, + "num_tokens": 312460569.0, + "step": 8193 + }, + { + "epoch": 1.04236102277064, + "ewc_loss": 0.007568982895463705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.568982982775196e-05, + "grad_norm": 3.713330030441284, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8498449325561523, + "num_tokens": 312501654.0, + "step": 8194 + }, + { + "epoch": 1.0424882330492304, + "ewc_loss": 0.007555755786597729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.555756019428372e-05, + "grad_norm": 3.65598726272583, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8626155853271484, + "num_tokens": 312543630.0, + "step": 8195 + }, + { + "epoch": 1.042615443327821, + "ewc_loss": 0.007503662258386612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.503662345698103e-05, + "grad_norm": 3.676431894302368, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8690600991249084, + "num_tokens": 312579428.0, + "step": 8196 + }, + { 
+ "epoch": 1.0427426536064115, + "ewc_loss": 0.007550275884568691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.55027576815337e-05, + "grad_norm": 3.660439968109131, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8852981925010681, + "num_tokens": 312621388.0, + "step": 8197 + }, + { + "epoch": 1.0428698638850018, + "ewc_loss": 0.007518787402659655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.518787606386468e-05, + "grad_norm": 3.6965818405151367, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8709304332733154, + "num_tokens": 312665303.0, + "step": 8198 + }, + { + "epoch": 1.0429970741635923, + "ewc_loss": 0.007531662471592426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.531662413384765e-05, + "grad_norm": 3.716113328933716, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8615824580192566, + "num_tokens": 312702354.0, + "step": 8199 + }, + { + "epoch": 1.0431242844421829, + "ewc_loss": 0.007522985339164734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52298510633409e-05, + "grad_norm": 3.7278637886047363, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8761366605758667, + "num_tokens": 312737223.0, + "step": 8200 + }, + { + "epoch": 1.0432514947207734, + "ewc_loss": 0.0075137438252568245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.513743912568316e-05, + "grad_norm": 3.688373327255249, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8712267279624939, + "num_tokens": 312773353.0, + "step": 8201 + }, + { + "epoch": 1.043378704999364, + "ewc_loss": 0.007485518231987953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.485518290195614e-05, + "grad_norm": 3.7162997722625732, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8537923097610474, + "num_tokens": 312809230.0, + "step": 8202 + }, + { + "epoch": 1.0435059152779544, + "ewc_loss": 0.00751379132270813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.513791206292808e-05, + "grad_norm": 3.6940348148345947, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8878122568130493, + "num_tokens": 312841278.0, + "step": 8203 + }, + { + "epoch": 1.043633125556545, + "ewc_loss": 0.007482611574232578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.482611545128748e-05, + "grad_norm": 3.6152403354644775, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8756376504898071, + "num_tokens": 312888469.0, + "step": 8204 + }, + { + "epoch": 1.0437603358351355, + "ewc_loss": 0.007452589459717274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.452589488821104e-05, + "grad_norm": 3.679621458053589, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8779171109199524, + "num_tokens": 312925446.0, + "step": 8205 + }, + { + "epoch": 1.043887546113726, + "ewc_loss": 0.007509035058319569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.509034912800416e-05, + "grad_norm": 3.659858226776123, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8690426349639893, + "num_tokens": 312965425.0, + "step": 8206 + }, + { + "epoch": 1.0440147563923166, + "ewc_loss": 0.007476643193513155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.476643077097833e-05, + "grad_norm": 3.73584246635437, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8659127950668335, + "num_tokens": 313000637.0, + "step": 8207 + }, + { + "epoch": 1.044141966670907, + "ewc_loss": 0.0075222342275083065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.522234227508307e-05, + "grad_norm": 3.711440324783325, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8724187612533569, + "num_tokens": 313035986.0, + "step": 8208 + }, + { + "epoch": 1.0442691769494976, + "ewc_loss": 0.007475823629647493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.475823804270476e-05, + "grad_norm": 3.6521918773651123, + "learning_rate": 1e-06, + "loss": 0.4228, + 
"mean_token_accuracy": 0.8603525161743164, + "num_tokens": 313077027.0, + "step": 8209 + }, + { + "epoch": 1.0443963872280881, + "ewc_loss": 0.007469560485333204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.469560659956187e-05, + "grad_norm": 3.6416685581207275, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8811602592468262, + "num_tokens": 313121733.0, + "step": 8210 + }, + { + "epoch": 1.0445235975066784, + "ewc_loss": 0.007466751616448164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.466751412721351e-05, + "grad_norm": 3.6551265716552734, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8664054870605469, + "num_tokens": 313165542.0, + "step": 8211 + }, + { + "epoch": 1.044650807785269, + "ewc_loss": 0.007486206479370594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.486206595785916e-05, + "grad_norm": 3.7829554080963135, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8598206043243408, + "num_tokens": 313201082.0, + "step": 8212 + }, + { + "epoch": 1.0447780180638595, + "ewc_loss": 0.0075520239770412445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.552024180768058e-05, + "grad_norm": 3.6349360942840576, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8830338716506958, + "num_tokens": 313241765.0, + "step": 8213 + }, + { + "epoch": 1.04490522834245, + "ewc_loss": 0.007415821310132742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.41582116461359e-05, + "grad_norm": 3.669799327850342, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8688708543777466, + "num_tokens": 313282471.0, + "step": 8214 + }, + { + "epoch": 1.0450324386210406, + "ewc_loss": 0.007494140416383743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.494140299968421e-05, + "grad_norm": 3.602893829345703, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8666219115257263, + "num_tokens": 313331505.0, + "step": 8215 + }, + { + "epoch": 
1.045159648899631, + "ewc_loss": 0.007426735013723373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.426735101034865e-05, + "grad_norm": 3.6979427337646484, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8707613945007324, + "num_tokens": 313369823.0, + "step": 8216 + }, + { + "epoch": 1.0452868591782216, + "ewc_loss": 0.007513649296015501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.513649325119331e-05, + "grad_norm": 3.7314674854278564, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8728567957878113, + "num_tokens": 313404255.0, + "step": 8217 + }, + { + "epoch": 1.0454140694568121, + "ewc_loss": 0.007503399159759283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.50339895603247e-05, + "grad_norm": 3.7027504444122314, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8812710046768188, + "num_tokens": 313439368.0, + "step": 8218 + }, + { + "epoch": 1.0455412797354027, + "ewc_loss": 0.007462329231202602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.46232908568345e-05, + "grad_norm": 3.6582932472229004, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8627954125404358, + "num_tokens": 313479475.0, + "step": 8219 + }, + { + "epoch": 1.0456684900139932, + "ewc_loss": 0.007458707317709923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.45870711398311e-05, + "grad_norm": 3.745513677597046, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8598567247390747, + "num_tokens": 313514604.0, + "step": 8220 + }, + { + "epoch": 1.0457957002925837, + "ewc_loss": 0.007525445893406868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.525445835199207e-05, + "grad_norm": 3.6968753337860107, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8679369688034058, + "num_tokens": 313548907.0, + "step": 8221 + }, + { + "epoch": 1.0459229105711743, + "ewc_loss": 0.007468050345778465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.468050171155483e-05, + "grad_norm": 3.677546262741089, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8710857629776001, + "num_tokens": 313586615.0, + "step": 8222 + }, + { + "epoch": 1.0460501208497646, + "ewc_loss": 0.007473655045032501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.473654841305688e-05, + "grad_norm": 3.6719601154327393, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8616199493408203, + "num_tokens": 313625202.0, + "step": 8223 + }, + { + "epoch": 1.046177331128355, + "ewc_loss": 0.007486449088901281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.486448885174468e-05, + "grad_norm": 3.6839375495910645, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8652673959732056, + "num_tokens": 313663257.0, + "step": 8224 + }, + { + "epoch": 1.0463045414069456, + "ewc_loss": 0.007498716935515404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.498716877307743e-05, + "grad_norm": 3.6779208183288574, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8764727711677551, + "num_tokens": 313701430.0, + "step": 8225 + }, + { + "epoch": 1.0464317516855361, + "ewc_loss": 0.007496741600334644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.496741454815492e-05, + "grad_norm": 3.6804070472717285, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.88233482837677, + "num_tokens": 313741286.0, + "step": 8226 + }, + { + "epoch": 1.0465589619641267, + "ewc_loss": 0.007490336429327726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.490336429327726e-05, + "grad_norm": 3.7786247730255127, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8566128015518188, + "num_tokens": 313779115.0, + "step": 8227 + }, + { + "epoch": 1.0466861722427172, + "ewc_loss": 0.007557169534265995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.557169737992808e-05, + "grad_norm": 3.6747584342956543, + "learning_rate": 1e-06, + "loss": 0.3594, + 
"mean_token_accuracy": 0.8760566115379333, + "num_tokens": 313822331.0, + "step": 8228 + }, + { + "epoch": 1.0468133825213077, + "ewc_loss": 0.007452647667378187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.452647696482018e-05, + "grad_norm": 3.620753765106201, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8652293682098389, + "num_tokens": 313863545.0, + "step": 8229 + }, + { + "epoch": 1.0469405927998983, + "ewc_loss": 0.007463722489774227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.463722431566566e-05, + "grad_norm": 3.722177743911743, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8911051750183105, + "num_tokens": 313894956.0, + "step": 8230 + }, + { + "epoch": 1.0470678030784888, + "ewc_loss": 0.007525501772761345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.525501860072836e-05, + "grad_norm": 3.7070252895355225, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8789527416229248, + "num_tokens": 313927920.0, + "step": 8231 + }, + { + "epoch": 1.0471950133570793, + "ewc_loss": 0.007492993492633104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.492993609048426e-05, + "grad_norm": 3.7134056091308594, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8793749213218689, + "num_tokens": 313963230.0, + "step": 8232 + }, + { + "epoch": 1.0473222236356698, + "ewc_loss": 0.0075103831477463245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.510383147746325e-05, + "grad_norm": 3.788034439086914, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8639086484909058, + "num_tokens": 313995951.0, + "step": 8233 + }, + { + "epoch": 1.0474494339142604, + "ewc_loss": 0.007553017232567072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.553017348982394e-05, + "grad_norm": 3.660499095916748, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8715623021125793, + "num_tokens": 314040928.0, + "step": 8234 + }, + { + "epoch": 
1.0475766441928507, + "ewc_loss": 0.0074555049650371075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.455504965037107e-05, + "grad_norm": 3.6762497425079346, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8623816967010498, + "num_tokens": 314080329.0, + "step": 8235 + }, + { + "epoch": 1.0477038544714412, + "ewc_loss": 0.0075182802975177765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.518280472140759e-05, + "grad_norm": 3.7057037353515625, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8717465400695801, + "num_tokens": 314117107.0, + "step": 8236 + }, + { + "epoch": 1.0478310647500317, + "ewc_loss": 0.007505827117711306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.505826943088323e-05, + "grad_norm": 3.7567880153656006, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8702155351638794, + "num_tokens": 314148077.0, + "step": 8237 + }, + { + "epoch": 1.0479582750286223, + "ewc_loss": 0.00754938367754221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.54938373574987e-05, + "grad_norm": 3.728764772415161, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8622964024543762, + "num_tokens": 314182064.0, + "step": 8238 + }, + { + "epoch": 1.0480854853072128, + "ewc_loss": 0.007512005511671305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.512005686294287e-05, + "grad_norm": 3.6946794986724854, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.862123429775238, + "num_tokens": 314221667.0, + "step": 8239 + }, + { + "epoch": 1.0482126955858033, + "ewc_loss": 0.007513315882533789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.513316086260602e-05, + "grad_norm": 3.7210867404937744, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8673602342605591, + "num_tokens": 314255225.0, + "step": 8240 + }, + { + "epoch": 1.0483399058643939, + "ewc_loss": 0.007551392540335655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.551392627647147e-05, + "grad_norm": 3.7420804500579834, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8712447881698608, + "num_tokens": 314288011.0, + "step": 8241 + }, + { + "epoch": 1.0484671161429844, + "ewc_loss": 0.007549358997493982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.549358997493982e-05, + "grad_norm": 3.6313061714172363, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8663178086280823, + "num_tokens": 314332461.0, + "step": 8242 + }, + { + "epoch": 1.048594326421575, + "ewc_loss": 0.007483731489628553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.48373131500557e-05, + "grad_norm": 3.692032814025879, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8661489486694336, + "num_tokens": 314367967.0, + "step": 8243 + }, + { + "epoch": 1.0487215367001654, + "ewc_loss": 0.007581744343042374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.581744284834713e-05, + "grad_norm": 3.693519115447998, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8805801868438721, + "num_tokens": 314404487.0, + "step": 8244 + }, + { + "epoch": 1.048848746978756, + "ewc_loss": 0.007548958994448185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.548958819825202e-05, + "grad_norm": 3.660890579223633, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8681049942970276, + "num_tokens": 314444731.0, + "step": 8245 + }, + { + "epoch": 1.0489759572573465, + "ewc_loss": 0.0075290268287062645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.529027061536908e-05, + "grad_norm": 3.727180004119873, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8728230595588684, + "num_tokens": 314479720.0, + "step": 8246 + }, + { + "epoch": 1.0491031675359368, + "ewc_loss": 0.007581586949527264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.581587124150246e-05, + "grad_norm": 3.6747231483459473, + "learning_rate": 1e-06, + "loss": 0.3333, + 
"mean_token_accuracy": 0.8868883848190308, + "num_tokens": 314515613.0, + "step": 8247 + }, + { + "epoch": 1.0492303778145273, + "ewc_loss": 0.007548002526164055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.548002758994699e-05, + "grad_norm": 3.6650023460388184, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.86916583776474, + "num_tokens": 314556519.0, + "step": 8248 + }, + { + "epoch": 1.0493575880931179, + "ewc_loss": 0.007534740958362818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.534740871051326e-05, + "grad_norm": 3.7239134311676025, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8790954351425171, + "num_tokens": 314590803.0, + "step": 8249 + }, + { + "epoch": 1.0494847983717084, + "ewc_loss": 0.007575720548629761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57572051952593e-05, + "grad_norm": 3.6563682556152344, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8723403811454773, + "num_tokens": 314631347.0, + "step": 8250 + }, + { + "epoch": 1.049612008650299, + "ewc_loss": 0.0075078606605529785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.507860573241487e-05, + "grad_norm": 3.676628351211548, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.870459258556366, + "num_tokens": 314673739.0, + "step": 8251 + }, + { + "epoch": 1.0497392189288894, + "ewc_loss": 0.007553856819868088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.55385699449107e-05, + "grad_norm": 3.63718843460083, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.870336651802063, + "num_tokens": 314718498.0, + "step": 8252 + }, + { + "epoch": 1.04986642920748, + "ewc_loss": 0.007516853045672178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.516852929256856e-05, + "grad_norm": 3.717627763748169, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8757076263427734, + "num_tokens": 314758755.0, + "step": 8253 + }, + { + "epoch": 1.0499936394860705, + 
"ewc_loss": 0.007570177782326937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.570177695015445e-05, + "grad_norm": 3.728074789047241, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8748966455459595, + "num_tokens": 314793644.0, + "step": 8254 + }, + { + "epoch": 1.050120849764661, + "ewc_loss": 0.007531203329563141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.53120330045931e-05, + "grad_norm": 3.715697765350342, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.867985725402832, + "num_tokens": 314828045.0, + "step": 8255 + }, + { + "epoch": 1.0502480600432516, + "ewc_loss": 0.007526421453803778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.526421541115269e-05, + "grad_norm": 3.6495275497436523, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8735124468803406, + "num_tokens": 314866088.0, + "step": 8256 + }, + { + "epoch": 1.050375270321842, + "ewc_loss": 0.007476466242223978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.476466271327808e-05, + "grad_norm": 3.6704330444335938, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8620691299438477, + "num_tokens": 314904757.0, + "step": 8257 + }, + { + "epoch": 1.0505024806004326, + "ewc_loss": 0.0075197406113147736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.519740756833926e-05, + "grad_norm": 3.6978368759155273, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8645948171615601, + "num_tokens": 314941410.0, + "step": 8258 + }, + { + "epoch": 1.0506296908790231, + "ewc_loss": 0.007517408113926649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.517408084822819e-05, + "grad_norm": 3.649477481842041, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8797228336334229, + "num_tokens": 314982060.0, + "step": 8259 + }, + { + "epoch": 1.0507569011576134, + "ewc_loss": 0.007484399247914553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.484399247914553e-05, + "grad_norm": 
3.6435790061950684, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8616927266120911, + "num_tokens": 315021812.0, + "step": 8260 + }, + { + "epoch": 1.050884111436204, + "ewc_loss": 0.007518359925597906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.518359780078754e-05, + "grad_norm": 3.7172436714172363, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8674363493919373, + "num_tokens": 315059520.0, + "step": 8261 + }, + { + "epoch": 1.0510113217147945, + "ewc_loss": 0.007540795020759106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.540795195382088e-05, + "grad_norm": 3.6796045303344727, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8652228116989136, + "num_tokens": 315098392.0, + "step": 8262 + }, + { + "epoch": 1.051138531993385, + "ewc_loss": 0.0074987635016441345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.498763443436474e-05, + "grad_norm": 3.7747368812561035, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8602414131164551, + "num_tokens": 315137375.0, + "step": 8263 + }, + { + "epoch": 1.0512657422719756, + "ewc_loss": 0.007565484847873449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.565484702354297e-05, + "grad_norm": 3.770322322845459, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8771960735321045, + "num_tokens": 315171308.0, + "step": 8264 + }, + { + "epoch": 1.051392952550566, + "ewc_loss": 0.0075375777669250965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.537577766925097e-05, + "grad_norm": 3.6434128284454346, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8713681101799011, + "num_tokens": 315212176.0, + "step": 8265 + }, + { + "epoch": 1.0515201628291566, + "ewc_loss": 0.007469431031495333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.469431147910655e-05, + "grad_norm": 3.7492456436157227, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8583827018737793, + 
"num_tokens": 315251321.0, + "step": 8266 + }, + { + "epoch": 1.0516473731077471, + "ewc_loss": 0.0075835539028048515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58355381549336e-05, + "grad_norm": 3.7159857749938965, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8574134111404419, + "num_tokens": 315288780.0, + "step": 8267 + }, + { + "epoch": 1.0517745833863377, + "ewc_loss": 0.0075168004259467125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.516800542362034e-05, + "grad_norm": 3.66693377494812, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.891093373298645, + "num_tokens": 315325630.0, + "step": 8268 + }, + { + "epoch": 1.0519017936649282, + "ewc_loss": 0.00751050841063261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.510508294217288e-05, + "grad_norm": 3.6905384063720703, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8658176064491272, + "num_tokens": 315362887.0, + "step": 8269 + }, + { + "epoch": 1.0520290039435187, + "ewc_loss": 0.007564010098576546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564009865745902e-05, + "grad_norm": 3.6913936138153076, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8829421997070312, + "num_tokens": 315397552.0, + "step": 8270 + }, + { + "epoch": 1.0521562142221093, + "ewc_loss": 0.007535160519182682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.535160693805665e-05, + "grad_norm": 3.643172025680542, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8813652992248535, + "num_tokens": 315436785.0, + "step": 8271 + }, + { + "epoch": 1.0522834245006996, + "ewc_loss": 0.0075132278725504875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.51322804717347e-05, + "grad_norm": 3.7114646434783936, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8813440799713135, + "num_tokens": 315467849.0, + "step": 8272 + }, + { + "epoch": 1.05241063477929, + "ewc_loss": 0.007580755278468132, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.580755482194945e-05, + "grad_norm": 3.700960159301758, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8785042762756348, + "num_tokens": 315501496.0, + "step": 8273 + }, + { + "epoch": 1.0525378450578806, + "ewc_loss": 0.007552650757133961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.552650640718639e-05, + "grad_norm": 3.672663450241089, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8793841600418091, + "num_tokens": 315540878.0, + "step": 8274 + }, + { + "epoch": 1.0526650553364711, + "ewc_loss": 0.007554794196039438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.554794137831777e-05, + "grad_norm": 3.6804754734039307, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8855071663856506, + "num_tokens": 315578139.0, + "step": 8275 + }, + { + "epoch": 1.0527922656150617, + "ewc_loss": 0.007556567434221506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.556567288702354e-05, + "grad_norm": 3.669367790222168, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.871355414390564, + "num_tokens": 315615845.0, + "step": 8276 + }, + { + "epoch": 1.0529194758936522, + "ewc_loss": 0.007533658295869827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.533658208558336e-05, + "grad_norm": 3.7977750301361084, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8646335601806641, + "num_tokens": 315645407.0, + "step": 8277 + }, + { + "epoch": 1.0530466861722427, + "ewc_loss": 0.007625216618180275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.625216676387936e-05, + "grad_norm": 3.636568307876587, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8783202171325684, + "num_tokens": 315687922.0, + "step": 8278 + }, + { + "epoch": 1.0531738964508333, + "ewc_loss": 0.007473444566130638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.473444566130638e-05, + "grad_norm": 3.688316583633423, + "learning_rate": 
1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8735605478286743, + "num_tokens": 315723730.0, + "step": 8279 + }, + { + "epoch": 1.0533011067294238, + "ewc_loss": 0.0075699505396187305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.569950685137883e-05, + "grad_norm": 3.647400379180908, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8826024532318115, + "num_tokens": 315765662.0, + "step": 8280 + }, + { + "epoch": 1.0534283170080143, + "ewc_loss": 0.007515323348343372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.515323522966355e-05, + "grad_norm": 3.7922940254211426, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.847672164440155, + "num_tokens": 315808668.0, + "step": 8281 + }, + { + "epoch": 1.0535555272866048, + "ewc_loss": 0.0075941565446555614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.594156340928748e-05, + "grad_norm": 3.7334177494049072, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.866131603717804, + "num_tokens": 315844878.0, + "step": 8282 + }, + { + "epoch": 1.0536827375651954, + "ewc_loss": 0.007519363891333342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.519363862229511e-05, + "grad_norm": 3.6852123737335205, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8825312852859497, + "num_tokens": 315879472.0, + "step": 8283 + }, + { + "epoch": 1.0538099478437857, + "ewc_loss": 0.0074949427507817745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.494942838093266e-05, + "grad_norm": 3.710986852645874, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8677091002464294, + "num_tokens": 315915496.0, + "step": 8284 + }, + { + "epoch": 1.0539371581223762, + "ewc_loss": 0.00752370897680521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.523709064116701e-05, + "grad_norm": 3.671112537384033, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8728705644607544, + "num_tokens": 315954317.0, + "step": 8285 + }, 
+ { + "epoch": 1.0540643684009667, + "ewc_loss": 0.007474794052541256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.474794256268069e-05, + "grad_norm": 3.681121349334717, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.868504524230957, + "num_tokens": 315995518.0, + "step": 8286 + }, + { + "epoch": 1.0541915786795573, + "ewc_loss": 0.00750383548438549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.503835513489321e-05, + "grad_norm": 3.6178696155548096, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8850522637367249, + "num_tokens": 316040475.0, + "step": 8287 + }, + { + "epoch": 1.0543187889581478, + "ewc_loss": 0.007447782438248396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.447782263625413e-05, + "grad_norm": 3.666393280029297, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8696697950363159, + "num_tokens": 316083003.0, + "step": 8288 + }, + { + "epoch": 1.0544459992367383, + "ewc_loss": 0.007508500944823027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.508500857511535e-05, + "grad_norm": 3.683344602584839, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8713937997817993, + "num_tokens": 316124687.0, + "step": 8289 + }, + { + "epoch": 1.0545732095153288, + "ewc_loss": 0.007483502849936485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.483502849936485e-05, + "grad_norm": 3.6693406105041504, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.858102023601532, + "num_tokens": 316170052.0, + "step": 8290 + }, + { + "epoch": 1.0547004197939194, + "ewc_loss": 0.007473022677004337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.473022560589015e-05, + "grad_norm": 3.7208473682403564, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8586174845695496, + "num_tokens": 316211941.0, + "step": 8291 + }, + { + "epoch": 1.05482763007251, + "ewc_loss": 0.007509472314268351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.509472197853029e-05, + "grad_norm": 3.697613477706909, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8744859099388123, + "num_tokens": 316252592.0, + "step": 8292 + }, + { + "epoch": 1.0549548403511004, + "ewc_loss": 0.007454756181687117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.454756268998608e-05, + "grad_norm": 3.7055914402008057, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8784921169281006, + "num_tokens": 316287250.0, + "step": 8293 + }, + { + "epoch": 1.055082050629691, + "ewc_loss": 0.007462497800588608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.462497887900099e-05, + "grad_norm": 3.6743178367614746, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8746166825294495, + "num_tokens": 316326308.0, + "step": 8294 + }, + { + "epoch": 1.0552092609082815, + "ewc_loss": 0.007441842928528786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.441842899424955e-05, + "grad_norm": 3.692171812057495, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8759608268737793, + "num_tokens": 316364519.0, + "step": 8295 + }, + { + "epoch": 1.0553364711868718, + "ewc_loss": 0.007455646060407162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.455646118614823e-05, + "grad_norm": 3.682723045349121, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8752846717834473, + "num_tokens": 316401165.0, + "step": 8296 + }, + { + "epoch": 1.0554636814654623, + "ewc_loss": 0.0074406880885362625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.440688204951584e-05, + "grad_norm": 3.70668625831604, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.866946816444397, + "num_tokens": 316440245.0, + "step": 8297 + }, + { + "epoch": 1.0555908917440529, + "ewc_loss": 0.007464625872671604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.464626105502248e-05, + "grad_norm": 3.626680374145508, + "learning_rate": 1e-06, + "loss": 0.3537, + 
"mean_token_accuracy": 0.8791112899780273, + "num_tokens": 316484969.0, + "step": 8298 + }, + { + "epoch": 1.0557181020226434, + "ewc_loss": 0.007413696497678757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.413696584990248e-05, + "grad_norm": 3.7091293334960938, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8892648816108704, + "num_tokens": 316520510.0, + "step": 8299 + }, + { + "epoch": 1.055845312301234, + "ewc_loss": 0.007478397339582443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.478397310478613e-05, + "grad_norm": 3.7134859561920166, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8558147549629211, + "num_tokens": 316561946.0, + "step": 8300 + }, + { + "epoch": 1.0559725225798244, + "ewc_loss": 0.007448636461049318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.448636461049318e-05, + "grad_norm": 3.6304969787597656, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8882813453674316, + "num_tokens": 316605489.0, + "step": 8301 + }, + { + "epoch": 1.056099732858415, + "ewc_loss": 0.0074041737243533134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.404173811664805e-05, + "grad_norm": 3.684353828430176, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8552654385566711, + "num_tokens": 316648590.0, + "step": 8302 + }, + { + "epoch": 1.0562269431370055, + "ewc_loss": 0.007462956476956606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.462956273229793e-05, + "grad_norm": 3.674873113632202, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8835757970809937, + "num_tokens": 316688169.0, + "step": 8303 + }, + { + "epoch": 1.056354153415596, + "ewc_loss": 0.0074228402227163315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.422840280923992e-05, + "grad_norm": 3.757676839828491, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8524004220962524, + "num_tokens": 316724031.0, + "step": 8304 + }, + { + "epoch": 
1.0564813636941865, + "ewc_loss": 0.0074694217182695866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.469421689165756e-05, + "grad_norm": 3.621533155441284, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8716448545455933, + "num_tokens": 316769762.0, + "step": 8305 + }, + { + "epoch": 1.056608573972777, + "ewc_loss": 0.007394520565867424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.394520798698068e-05, + "grad_norm": 3.722886323928833, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8785984516143799, + "num_tokens": 316814156.0, + "step": 8306 + }, + { + "epoch": 1.0567357842513676, + "ewc_loss": 0.007452436722815037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.452436693711206e-05, + "grad_norm": 3.678067922592163, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8790897130966187, + "num_tokens": 316852784.0, + "step": 8307 + }, + { + "epoch": 1.0568629945299581, + "ewc_loss": 0.007394610904157162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.394611020572484e-05, + "grad_norm": 3.695274591445923, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8654210567474365, + "num_tokens": 316891992.0, + "step": 8308 + }, + { + "epoch": 1.0569902048085484, + "ewc_loss": 0.007407904136925936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.407904195133597e-05, + "grad_norm": 3.6319406032562256, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8863447904586792, + "num_tokens": 316932246.0, + "step": 8309 + }, + { + "epoch": 1.057117415087139, + "ewc_loss": 0.007367241196334362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.36724105081521e-05, + "grad_norm": 3.724003791809082, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8641065359115601, + "num_tokens": 316972017.0, + "step": 8310 + }, + { + "epoch": 1.0572446253657295, + "ewc_loss": 0.007441083434969187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.441083289450034e-05, + "grad_norm": 3.7410080432891846, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8669756650924683, + "num_tokens": 317008631.0, + "step": 8311 + }, + { + "epoch": 1.05737183564432, + "ewc_loss": 0.007411608472466469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.411608385154977e-05, + "grad_norm": 3.6400375366210938, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.876603901386261, + "num_tokens": 317048616.0, + "step": 8312 + }, + { + "epoch": 1.0574990459229106, + "ewc_loss": 0.007350538857281208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.350539090111852e-05, + "grad_norm": 3.690477132797241, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8652503490447998, + "num_tokens": 317088675.0, + "step": 8313 + }, + { + "epoch": 1.057626256201501, + "ewc_loss": 0.007425372488796711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.425372314173728e-05, + "grad_norm": 3.689469575881958, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8921277523040771, + "num_tokens": 317120351.0, + "step": 8314 + }, + { + "epoch": 1.0577534664800916, + "ewc_loss": 0.007395187858492136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.395188004011288e-05, + "grad_norm": 3.7383925914764404, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8652768731117249, + "num_tokens": 317159027.0, + "step": 8315 + }, + { + "epoch": 1.0578806767586821, + "ewc_loss": 0.0074339359998703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.433936116285622e-05, + "grad_norm": 3.7653965950012207, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8735483884811401, + "num_tokens": 317188892.0, + "step": 8316 + }, + { + "epoch": 1.0580078870372727, + "ewc_loss": 0.007432876620441675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.432876736856997e-05, + "grad_norm": 3.6748807430267334, + "learning_rate": 1e-06, + "loss": 0.3634, + 
"mean_token_accuracy": 0.8761086463928223, + "num_tokens": 317229026.0, + "step": 8317 + }, + { + "epoch": 1.0581350973158632, + "ewc_loss": 0.007386402226984501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.386402285192162e-05, + "grad_norm": 3.65419864654541, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8689109683036804, + "num_tokens": 317272946.0, + "step": 8318 + }, + { + "epoch": 1.0582623075944537, + "ewc_loss": 0.007414248771965504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.414248830173165e-05, + "grad_norm": 3.6513919830322266, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8724813461303711, + "num_tokens": 317314625.0, + "step": 8319 + }, + { + "epoch": 1.058389517873044, + "ewc_loss": 0.007426588796079159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.42658885428682e-05, + "grad_norm": 3.6914405822753906, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8822855949401855, + "num_tokens": 317351450.0, + "step": 8320 + }, + { + "epoch": 1.0585167281516346, + "ewc_loss": 0.007443309761583805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.443309732479975e-05, + "grad_norm": 3.6977503299713135, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8568795919418335, + "num_tokens": 317391519.0, + "step": 8321 + }, + { + "epoch": 1.058643938430225, + "ewc_loss": 0.007449415046721697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.449414988514036e-05, + "grad_norm": 3.737230062484741, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8656780123710632, + "num_tokens": 317430672.0, + "step": 8322 + }, + { + "epoch": 1.0587711487088156, + "ewc_loss": 0.007471870165318251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.471870048902929e-05, + "grad_norm": 3.6767892837524414, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8734173774719238, + "num_tokens": 317470203.0, + "step": 8323 + }, + { + "epoch": 
1.0588983589874061, + "ewc_loss": 0.0074240416288375854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.424041541526094e-05, + "grad_norm": 3.6940369606018066, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8743970394134521, + "num_tokens": 317509594.0, + "step": 8324 + }, + { + "epoch": 1.0590255692659967, + "ewc_loss": 0.007452606223523617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.452606223523617e-05, + "grad_norm": 3.6886162757873535, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8695650100708008, + "num_tokens": 317549735.0, + "step": 8325 + }, + { + "epoch": 1.0591527795445872, + "ewc_loss": 0.0074594407342374325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.459440530510619e-05, + "grad_norm": 3.718892812728882, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8715466856956482, + "num_tokens": 317585064.0, + "step": 8326 + }, + { + "epoch": 1.0592799898231777, + "ewc_loss": 0.007469165604561567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.469165575457737e-05, + "grad_norm": 3.6920504570007324, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8861314654350281, + "num_tokens": 317622289.0, + "step": 8327 + }, + { + "epoch": 1.0594072001017683, + "ewc_loss": 0.00745303463190794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.453034777427092e-05, + "grad_norm": 3.7022438049316406, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.860844612121582, + "num_tokens": 317662872.0, + "step": 8328 + }, + { + "epoch": 1.0595344103803588, + "ewc_loss": 0.007462947629392147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.462947542080656e-05, + "grad_norm": 3.6952006816864014, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8728379011154175, + "num_tokens": 317701418.0, + "step": 8329 + }, + { + "epoch": 1.0596616206589493, + "ewc_loss": 0.007467435207217932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.467435352737084e-05, + "grad_norm": 3.6668601036071777, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8843616247177124, + "num_tokens": 317741416.0, + "step": 8330 + }, + { + "epoch": 1.0597888309375398, + "ewc_loss": 0.007452161051332951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.452160934917629e-05, + "grad_norm": 3.729801654815674, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.877535343170166, + "num_tokens": 317778623.0, + "step": 8331 + }, + { + "epoch": 1.0599160412161304, + "ewc_loss": 0.007490238174796104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.490238203899935e-05, + "grad_norm": 3.732292413711548, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8738417029380798, + "num_tokens": 317816577.0, + "step": 8332 + }, + { + "epoch": 1.0600432514947207, + "ewc_loss": 0.007465452887117863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.465452654287219e-05, + "grad_norm": 3.7511959075927734, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8699880838394165, + "num_tokens": 317851532.0, + "step": 8333 + }, + { + "epoch": 1.0601704617733112, + "ewc_loss": 0.007491168100386858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.491168071283028e-05, + "grad_norm": 3.6902077198028564, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8730236887931824, + "num_tokens": 317886789.0, + "step": 8334 + }, + { + "epoch": 1.0602976720519017, + "ewc_loss": 0.007466771174222231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.466771057806909e-05, + "grad_norm": 3.7086548805236816, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8727383017539978, + "num_tokens": 317929511.0, + "step": 8335 + }, + { + "epoch": 1.0604248823304923, + "ewc_loss": 0.007487592753022909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.487592665711418e-05, + "grad_norm": 3.7169437408447266, + "learning_rate": 1e-06, + "loss": 0.3782, + 
"mean_token_accuracy": 0.8700441122055054, + "num_tokens": 317963767.0, + "step": 8336 + }, + { + "epoch": 1.0605520926090828, + "ewc_loss": 0.007488365285098553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.488365372410044e-05, + "grad_norm": 3.7260913848876953, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8881320953369141, + "num_tokens": 318000458.0, + "step": 8337 + }, + { + "epoch": 1.0606793028876733, + "ewc_loss": 0.007502464111894369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.502463995479047e-05, + "grad_norm": 3.6518092155456543, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8895159363746643, + "num_tokens": 318040394.0, + "step": 8338 + }, + { + "epoch": 1.0608065131662638, + "ewc_loss": 0.0074747842736542225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.474784069927409e-05, + "grad_norm": 3.724140167236328, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8688666820526123, + "num_tokens": 318080859.0, + "step": 8339 + }, + { + "epoch": 1.0609337234448544, + "ewc_loss": 0.007539290469139814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.539290527347475e-05, + "grad_norm": 3.7368929386138916, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8891407251358032, + "num_tokens": 318115378.0, + "step": 8340 + }, + { + "epoch": 1.061060933723445, + "ewc_loss": 0.007508563343435526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.508563430747017e-05, + "grad_norm": 3.690717935562134, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8642812967300415, + "num_tokens": 318150351.0, + "step": 8341 + }, + { + "epoch": 1.0611881440020354, + "ewc_loss": 0.007495451252907515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.495451427530497e-05, + "grad_norm": 3.6867191791534424, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8711078763008118, + "num_tokens": 318192427.0, + "step": 8342 + }, + { + "epoch": 
1.061315354280626, + "ewc_loss": 0.0074926805682480335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.492680742871016e-05, + "grad_norm": 3.725456953048706, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8597382307052612, + "num_tokens": 318232150.0, + "step": 8343 + }, + { + "epoch": 1.0614425645592165, + "ewc_loss": 0.0075243390165269375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52433916204609e-05, + "grad_norm": 3.7320737838745117, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8683873414993286, + "num_tokens": 318267742.0, + "step": 8344 + }, + { + "epoch": 1.0615697748378068, + "ewc_loss": 0.007522811181843281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.522811210947111e-05, + "grad_norm": 3.688929557800293, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8767552971839905, + "num_tokens": 318307974.0, + "step": 8345 + }, + { + "epoch": 1.0616969851163973, + "ewc_loss": 0.007491682656109333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.49168248148635e-05, + "grad_norm": 3.732330799102783, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8540753722190857, + "num_tokens": 318349880.0, + "step": 8346 + }, + { + "epoch": 1.0618241953949878, + "ewc_loss": 0.007533303927630186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.533303869422525e-05, + "grad_norm": 3.7442421913146973, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8709031939506531, + "num_tokens": 318383869.0, + "step": 8347 + }, + { + "epoch": 1.0619514056735784, + "ewc_loss": 0.007523985579609871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52398555050604e-05, + "grad_norm": 3.6752641201019287, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8872267007827759, + "num_tokens": 318419659.0, + "step": 8348 + }, + { + "epoch": 1.062078615952169, + "ewc_loss": 0.00746050663292408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.460506458301097e-05, + "grad_norm": 3.6179723739624023, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8807033896446228, + "num_tokens": 318460795.0, + "step": 8349 + }, + { + "epoch": 1.0622058262307594, + "ewc_loss": 0.007479091640561819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.479091436835006e-05, + "grad_norm": 3.751204252243042, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8624640107154846, + "num_tokens": 318499451.0, + "step": 8350 + }, + { + "epoch": 1.06233303650935, + "ewc_loss": 0.00756449019536376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564490078948438e-05, + "grad_norm": 3.70161771774292, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.875540018081665, + "num_tokens": 318533413.0, + "step": 8351 + }, + { + "epoch": 1.0624602467879405, + "ewc_loss": 0.007474065292626619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.474065205315128e-05, + "grad_norm": 3.7248694896698, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8619095087051392, + "num_tokens": 318572533.0, + "step": 8352 + }, + { + "epoch": 1.062587457066531, + "ewc_loss": 0.007518966682255268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.518966594943777e-05, + "grad_norm": 3.72078800201416, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8743557333946228, + "num_tokens": 318609442.0, + "step": 8353 + }, + { + "epoch": 1.0627146673451215, + "ewc_loss": 0.007508375681936741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.508375711040571e-05, + "grad_norm": 3.7164433002471924, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8861253261566162, + "num_tokens": 318643475.0, + "step": 8354 + }, + { + "epoch": 1.062841877623712, + "ewc_loss": 0.007496051490306854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.496051694033667e-05, + "grad_norm": 3.670227289199829, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 
0.8758803009986877, + "num_tokens": 318684223.0, + "step": 8355 + }, + { + "epoch": 1.0629690879023026, + "ewc_loss": 0.007487110793590546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.487110997317359e-05, + "grad_norm": 3.767592191696167, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8728903532028198, + "num_tokens": 318717148.0, + "step": 8356 + }, + { + "epoch": 1.0630962981808931, + "ewc_loss": 0.007551002781838179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.551002636319026e-05, + "grad_norm": 3.6550352573394775, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8831121921539307, + "num_tokens": 318758929.0, + "step": 8357 + }, + { + "epoch": 1.0632235084594834, + "ewc_loss": 0.0074628484435379505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.462848589057103e-05, + "grad_norm": 3.7636282444000244, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.869782567024231, + "num_tokens": 318796438.0, + "step": 8358 + }, + { + "epoch": 1.063350718738074, + "ewc_loss": 0.007568673696368933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.568673754576594e-05, + "grad_norm": 3.728606700897217, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8599210977554321, + "num_tokens": 318835732.0, + "step": 8359 + }, + { + "epoch": 1.0634779290166645, + "ewc_loss": 0.007505834102630615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.505834219045937e-05, + "grad_norm": 3.706556797027588, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8713008165359497, + "num_tokens": 318872151.0, + "step": 8360 + }, + { + "epoch": 1.063605139295255, + "ewc_loss": 0.0075163692235946655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.516369078075513e-05, + "grad_norm": 3.6933186054229736, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8605128526687622, + "num_tokens": 318911956.0, + "step": 8361 + }, + { + "epoch": 1.0637323495738455, + "ewc_loss": 
0.007517159916460514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.517159974668175e-05, + "grad_norm": 3.746293544769287, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8725643157958984, + "num_tokens": 318945593.0, + "step": 8362 + }, + { + "epoch": 1.063859559852436, + "ewc_loss": 0.007568418979644775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.568419096060097e-05, + "grad_norm": 3.7503952980041504, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8836867809295654, + "num_tokens": 318976670.0, + "step": 8363 + }, + { + "epoch": 1.0639867701310266, + "ewc_loss": 0.0075462376698851585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.546237611677498e-05, + "grad_norm": 3.7475087642669678, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8732389211654663, + "num_tokens": 319007798.0, + "step": 8364 + }, + { + "epoch": 1.0641139804096171, + "ewc_loss": 0.0075449044816195965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.544904656242579e-05, + "grad_norm": 3.6977546215057373, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8618859052658081, + "num_tokens": 319044808.0, + "step": 8365 + }, + { + "epoch": 1.0642411906882077, + "ewc_loss": 0.007534179370850325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.534179167123511e-05, + "grad_norm": 3.6772329807281494, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8916116952896118, + "num_tokens": 319083387.0, + "step": 8366 + }, + { + "epoch": 1.0643684009667982, + "ewc_loss": 0.007534930948168039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.534930773545057e-05, + "grad_norm": 3.650716543197632, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8870148658752441, + "num_tokens": 319121897.0, + "step": 8367 + }, + { + "epoch": 1.0644956112453887, + "ewc_loss": 0.00751836271956563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.5183626904618e-05, + "grad_norm": 
3.733762502670288, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8605254888534546, + "num_tokens": 319157190.0, + "step": 8368 + }, + { + "epoch": 1.064622821523979, + "ewc_loss": 0.007596852257847786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.596852083224803e-05, + "grad_norm": 3.744197368621826, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8830841779708862, + "num_tokens": 319188353.0, + "step": 8369 + }, + { + "epoch": 1.0647500318025696, + "ewc_loss": 0.007558400742709637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.558400830021128e-05, + "grad_norm": 3.6958982944488525, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.874215304851532, + "num_tokens": 319225536.0, + "step": 8370 + }, + { + "epoch": 1.06487724208116, + "ewc_loss": 0.007543954532593489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.543954416178167e-05, + "grad_norm": 3.6916496753692627, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.879733145236969, + "num_tokens": 319265114.0, + "step": 8371 + }, + { + "epoch": 1.0650044523597506, + "ewc_loss": 0.007535176817327738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.535176700912416e-05, + "grad_norm": 3.645220994949341, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8828797340393066, + "num_tokens": 319309280.0, + "step": 8372 + }, + { + "epoch": 1.0651316626383411, + "ewc_loss": 0.007507342379540205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.507342525059357e-05, + "grad_norm": 3.6909468173980713, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8831723928451538, + "num_tokens": 319346114.0, + "step": 8373 + }, + { + "epoch": 1.0652588729169317, + "ewc_loss": 0.007557589560747147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.557589560747147e-05, + "grad_norm": 3.684372663497925, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.878325879573822, + "num_tokens": 
319389001.0, + "step": 8374 + }, + { + "epoch": 1.0653860831955222, + "ewc_loss": 0.007509635761380196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.509635906899348e-05, + "grad_norm": 3.601609706878662, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8799502849578857, + "num_tokens": 319431464.0, + "step": 8375 + }, + { + "epoch": 1.0655132934741127, + "ewc_loss": 0.00746536161750555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.465361704817042e-05, + "grad_norm": 3.712902545928955, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8494353294372559, + "num_tokens": 319475638.0, + "step": 8376 + }, + { + "epoch": 1.0656405037527032, + "ewc_loss": 0.007537981495261192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.537981582572684e-05, + "grad_norm": 3.667311906814575, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8820785880088806, + "num_tokens": 319515983.0, + "step": 8377 + }, + { + "epoch": 1.0657677140312938, + "ewc_loss": 0.007464855909347534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.464856025762856e-05, + "grad_norm": 3.763378620147705, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8695800304412842, + "num_tokens": 319552152.0, + "step": 8378 + }, + { + "epoch": 1.0658949243098843, + "ewc_loss": 0.0075356625020504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.535662734881043e-05, + "grad_norm": 3.715884208679199, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8641892075538635, + "num_tokens": 319592963.0, + "step": 8379 + }, + { + "epoch": 1.0660221345884748, + "ewc_loss": 0.007470414042472839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.470414129784331e-05, + "grad_norm": 3.7141380310058594, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8597555756568909, + "num_tokens": 319635136.0, + "step": 8380 + }, + { + "epoch": 1.0661493448670654, + "ewc_loss": 0.007473448757082224, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 7.473448931705207e-05, + "grad_norm": 3.750546455383301, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8754585385322571, + "num_tokens": 319670118.0, + "step": 8381 + }, + { + "epoch": 1.0662765551456557, + "ewc_loss": 0.007494994439184666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.494994497392327e-05, + "grad_norm": 3.6796317100524902, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8785600662231445, + "num_tokens": 319711057.0, + "step": 8382 + }, + { + "epoch": 1.0664037654242462, + "ewc_loss": 0.007446270436048508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.446270319633186e-05, + "grad_norm": 3.7275102138519287, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8712981343269348, + "num_tokens": 319746451.0, + "step": 8383 + }, + { + "epoch": 1.0665309757028367, + "ewc_loss": 0.007494330871850252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.494330930057913e-05, + "grad_norm": 3.6673336029052734, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8676818609237671, + "num_tokens": 319792149.0, + "step": 8384 + }, + { + "epoch": 1.0666581859814273, + "ewc_loss": 0.007435375824570656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.435376028297469e-05, + "grad_norm": 3.7419981956481934, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8640754222869873, + "num_tokens": 319826508.0, + "step": 8385 + }, + { + "epoch": 1.0667853962600178, + "ewc_loss": 0.007512620650231838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.512620504712686e-05, + "grad_norm": 3.688180685043335, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.874190628528595, + "num_tokens": 319867000.0, + "step": 8386 + }, + { + "epoch": 1.0669126065386083, + "ewc_loss": 0.007442422676831484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.442422793246806e-05, + "grad_norm": 3.818531036376953, + "learning_rate": 1e-06, + 
"loss": 0.4215, + "mean_token_accuracy": 0.8528403043746948, + "num_tokens": 319904476.0, + "step": 8387 + }, + { + "epoch": 1.0670398168171988, + "ewc_loss": 0.007532128132879734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.532128074672073e-05, + "grad_norm": 3.7419257164001465, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8513801097869873, + "num_tokens": 319943279.0, + "step": 8388 + }, + { + "epoch": 1.0671670270957894, + "ewc_loss": 0.007451774552464485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.451774581568316e-05, + "grad_norm": 3.6675167083740234, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8699674606323242, + "num_tokens": 319986610.0, + "step": 8389 + }, + { + "epoch": 1.06729423737438, + "ewc_loss": 0.007448794320225716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.448794349329546e-05, + "grad_norm": 3.7170071601867676, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8674017190933228, + "num_tokens": 320022702.0, + "step": 8390 + }, + { + "epoch": 1.0674214476529704, + "ewc_loss": 0.007509285118430853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.509285205742344e-05, + "grad_norm": 3.636040687561035, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.874943196773529, + "num_tokens": 320067918.0, + "step": 8391 + }, + { + "epoch": 1.067548657931561, + "ewc_loss": 0.00744680967181921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.446809468092397e-05, + "grad_norm": 3.7067372798919678, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8632605671882629, + "num_tokens": 320110455.0, + "step": 8392 + }, + { + "epoch": 1.0676758682101515, + "ewc_loss": 0.007521615829318762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.521615771111101e-05, + "grad_norm": 3.689744234085083, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8885741233825684, + "num_tokens": 320147675.0, + "step": 8393 + }, + { + 
"epoch": 1.0678030784887418, + "ewc_loss": 0.007479202933609486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.479202758986503e-05, + "grad_norm": 3.736753225326538, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8596938848495483, + "num_tokens": 320183882.0, + "step": 8394 + }, + { + "epoch": 1.0679302887673323, + "ewc_loss": 0.0075259157456457615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.525915862061083e-05, + "grad_norm": 3.735520124435425, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8618541955947876, + "num_tokens": 320222215.0, + "step": 8395 + }, + { + "epoch": 1.0680574990459228, + "ewc_loss": 0.007514461874961853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.514462049584836e-05, + "grad_norm": 3.7185473442077637, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.865395188331604, + "num_tokens": 320262418.0, + "step": 8396 + }, + { + "epoch": 1.0681847093245134, + "ewc_loss": 0.007499748840928078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.499748608097434e-05, + "grad_norm": 3.7515883445739746, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8633631467819214, + "num_tokens": 320297304.0, + "step": 8397 + }, + { + "epoch": 1.068311919603104, + "ewc_loss": 0.007530520670115948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.5305208156351e-05, + "grad_norm": 3.7514290809631348, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8755585551261902, + "num_tokens": 320330919.0, + "step": 8398 + }, + { + "epoch": 1.0684391298816944, + "ewc_loss": 0.007524344138801098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52434425521642e-05, + "grad_norm": 3.710691213607788, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8726497888565063, + "num_tokens": 320367602.0, + "step": 8399 + }, + { + "epoch": 1.068566340160285, + "ewc_loss": 0.00750326132401824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.503261440433562e-05, + "grad_norm": 3.7084481716156006, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8725523352622986, + "num_tokens": 320403875.0, + "step": 8400 + }, + { + "epoch": 1.0686935504388755, + "ewc_loss": 0.007521524094045162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.521524094045162e-05, + "grad_norm": 3.7278387546539307, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8761975169181824, + "num_tokens": 320439149.0, + "step": 8401 + }, + { + "epoch": 1.068820760717466, + "ewc_loss": 0.0075204139575362206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.520413782913238e-05, + "grad_norm": 3.672950506210327, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8845449686050415, + "num_tokens": 320476786.0, + "step": 8402 + }, + { + "epoch": 1.0689479709960565, + "ewc_loss": 0.00749721797183156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.497218030039221e-05, + "grad_norm": 3.7041633129119873, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8567808866500854, + "num_tokens": 320516945.0, + "step": 8403 + }, + { + "epoch": 1.069075181274647, + "ewc_loss": 0.007538135629147291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.538135832874104e-05, + "grad_norm": 3.6886682510375977, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8867829442024231, + "num_tokens": 320552801.0, + "step": 8404 + }, + { + "epoch": 1.0692023915532376, + "ewc_loss": 0.007501807063817978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.501806976506487e-05, + "grad_norm": 3.679302453994751, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8750160932540894, + "num_tokens": 320593605.0, + "step": 8405 + }, + { + "epoch": 1.0693296018318281, + "ewc_loss": 0.007507717236876488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.507717236876488e-05, + "grad_norm": 3.7519445419311523, + "learning_rate": 1e-06, + "loss": 0.3598, + 
"mean_token_accuracy": 0.8773607015609741, + "num_tokens": 320624476.0, + "step": 8406 + }, + { + "epoch": 1.0694568121104184, + "ewc_loss": 0.007557719945907593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.55771980038844e-05, + "grad_norm": 3.6675939559936523, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8796474933624268, + "num_tokens": 320667000.0, + "step": 8407 + }, + { + "epoch": 1.069584022389009, + "ewc_loss": 0.007487965747714043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.487965922337025e-05, + "grad_norm": 3.7741260528564453, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8765080571174622, + "num_tokens": 320699286.0, + "step": 8408 + }, + { + "epoch": 1.0697112326675995, + "ewc_loss": 0.0075883399695158005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58833994041197e-05, + "grad_norm": 3.744385004043579, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8797723054885864, + "num_tokens": 320732520.0, + "step": 8409 + }, + { + "epoch": 1.06983844294619, + "ewc_loss": 0.007531667593866587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.531667506555095e-05, + "grad_norm": 3.7175557613372803, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8612806797027588, + "num_tokens": 320773425.0, + "step": 8410 + }, + { + "epoch": 1.0699656532247805, + "ewc_loss": 0.00754434522241354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.544345135102049e-05, + "grad_norm": 3.7552425861358643, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8511396646499634, + "num_tokens": 320810859.0, + "step": 8411 + }, + { + "epoch": 1.070092863503371, + "ewc_loss": 0.007562223821878433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.56222361815162e-05, + "grad_norm": 3.6967575550079346, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8761581182479858, + "num_tokens": 320850266.0, + "step": 8412 + }, + { + "epoch": 
1.0702200737819616, + "ewc_loss": 0.007510470226407051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.510470459237695e-05, + "grad_norm": 3.6537628173828125, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8703217506408691, + "num_tokens": 320892812.0, + "step": 8413 + }, + { + "epoch": 1.0703472840605521, + "ewc_loss": 0.007519671693444252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.519671635236591e-05, + "grad_norm": 3.7202322483062744, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8834285736083984, + "num_tokens": 320929563.0, + "step": 8414 + }, + { + "epoch": 1.0704744943391427, + "ewc_loss": 0.007577579468488693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.577579526696354e-05, + "grad_norm": 3.7532129287719727, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8789589405059814, + "num_tokens": 320964010.0, + "step": 8415 + }, + { + "epoch": 1.0706017046177332, + "ewc_loss": 0.007550768554210663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.55076835048385e-05, + "grad_norm": 3.648622989654541, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8683845400810242, + "num_tokens": 321006405.0, + "step": 8416 + }, + { + "epoch": 1.0707289148963237, + "ewc_loss": 0.00747811421751976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.478114275727421e-05, + "grad_norm": 3.653618097305298, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8674967885017395, + "num_tokens": 321050146.0, + "step": 8417 + }, + { + "epoch": 1.070856125174914, + "ewc_loss": 0.007524690590798855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.524690590798855e-05, + "grad_norm": 3.7590537071228027, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8719068169593811, + "num_tokens": 321086857.0, + "step": 8418 + }, + { + "epoch": 1.0709833354535045, + "ewc_loss": 0.007562899496406317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.562899554613978e-05, + "grad_norm": 3.6848769187927246, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8780292272567749, + "num_tokens": 321127310.0, + "step": 8419 + }, + { + "epoch": 1.071110545732095, + "ewc_loss": 0.007490949705243111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.490949792554602e-05, + "grad_norm": 3.7206687927246094, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8623106479644775, + "num_tokens": 321165881.0, + "step": 8420 + }, + { + "epoch": 1.0712377560106856, + "ewc_loss": 0.007539789658039808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.539789658039808e-05, + "grad_norm": 3.7134671211242676, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8654507398605347, + "num_tokens": 321204494.0, + "step": 8421 + }, + { + "epoch": 1.0713649662892761, + "ewc_loss": 0.007504456676542759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.504456880269572e-05, + "grad_norm": 3.7068710327148438, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8635282516479492, + "num_tokens": 321243639.0, + "step": 8422 + }, + { + "epoch": 1.0714921765678667, + "ewc_loss": 0.0075249867513775826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.524986722273752e-05, + "grad_norm": 3.76054310798645, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8662505745887756, + "num_tokens": 321281399.0, + "step": 8423 + }, + { + "epoch": 1.0716193868464572, + "ewc_loss": 0.007533429190516472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.533429015893489e-05, + "grad_norm": 3.770754814147949, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.859755277633667, + "num_tokens": 321316539.0, + "step": 8424 + }, + { + "epoch": 1.0717465971250477, + "ewc_loss": 0.007524055428802967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.524055399699137e-05, + "grad_norm": 3.7875375747680664, + "learning_rate": 1e-06, + "loss": 0.3835, + 
"mean_token_accuracy": 0.8682271242141724, + "num_tokens": 321348618.0, + "step": 8425 + }, + { + "epoch": 1.0718738074036382, + "ewc_loss": 0.007541203871369362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.541204104200006e-05, + "grad_norm": 3.7278964519500732, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8773667812347412, + "num_tokens": 321381674.0, + "step": 8426 + }, + { + "epoch": 1.0720010176822288, + "ewc_loss": 0.007521776482462883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.521776569774374e-05, + "grad_norm": 3.699916124343872, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.888549268245697, + "num_tokens": 321420342.0, + "step": 8427 + }, + { + "epoch": 1.0721282279608193, + "ewc_loss": 0.007527370937168598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52737105358392e-05, + "grad_norm": 3.7774384021759033, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8691824674606323, + "num_tokens": 321450770.0, + "step": 8428 + }, + { + "epoch": 1.0722554382394098, + "ewc_loss": 0.007571463938802481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.571464084321633e-05, + "grad_norm": 3.703437089920044, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8750010132789612, + "num_tokens": 321491450.0, + "step": 8429 + }, + { + "epoch": 1.0723826485180004, + "ewc_loss": 0.007517082616686821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.517082849517465e-05, + "grad_norm": 3.69978404045105, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8677778840065002, + "num_tokens": 321531774.0, + "step": 8430 + }, + { + "epoch": 1.0725098587965907, + "ewc_loss": 0.007550643756985664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.550643931608647e-05, + "grad_norm": 3.66886568069458, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.873640775680542, + "num_tokens": 321567692.0, + "step": 8431 + }, + { + "epoch": 
1.0726370690751812, + "ewc_loss": 0.0075312042608857155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.531204028055072e-05, + "grad_norm": 3.782165765762329, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8680237531661987, + "num_tokens": 321601168.0, + "step": 8432 + }, + { + "epoch": 1.0727642793537717, + "ewc_loss": 0.007617530412971973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.617530354764313e-05, + "grad_norm": 3.73297381401062, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8544007539749146, + "num_tokens": 321636100.0, + "step": 8433 + }, + { + "epoch": 1.0728914896323622, + "ewc_loss": 0.007560598198324442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.560598169220611e-05, + "grad_norm": 3.6887245178222656, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8729304075241089, + "num_tokens": 321674365.0, + "step": 8434 + }, + { + "epoch": 1.0730186999109528, + "ewc_loss": 0.007547353859990835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.547353743575513e-05, + "grad_norm": 3.6704792976379395, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8808954954147339, + "num_tokens": 321712267.0, + "step": 8435 + }, + { + "epoch": 1.0731459101895433, + "ewc_loss": 0.007550352718681097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.55035289330408e-05, + "grad_norm": 3.722435474395752, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8701406121253967, + "num_tokens": 321749955.0, + "step": 8436 + }, + { + "epoch": 1.0732731204681338, + "ewc_loss": 0.007585177198052406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.585177081637084e-05, + "grad_norm": 3.706256628036499, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8799883127212524, + "num_tokens": 321788522.0, + "step": 8437 + }, + { + "epoch": 1.0734003307467244, + "ewc_loss": 0.007548333145678043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.548333087470382e-05, + "grad_norm": 3.7091240882873535, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.869988739490509, + "num_tokens": 321826314.0, + "step": 8438 + }, + { + "epoch": 1.073527541025315, + "ewc_loss": 0.007549314759671688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.549314614152536e-05, + "grad_norm": 3.7217137813568115, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8677972555160522, + "num_tokens": 321864851.0, + "step": 8439 + }, + { + "epoch": 1.0736547513039054, + "ewc_loss": 0.007543968968093395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.543968968093395e-05, + "grad_norm": 3.6997737884521484, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8815007209777832, + "num_tokens": 321901001.0, + "step": 8440 + }, + { + "epoch": 1.073781961582496, + "ewc_loss": 0.007538489531725645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.538489444414154e-05, + "grad_norm": 3.697640895843506, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8706828355789185, + "num_tokens": 321941368.0, + "step": 8441 + }, + { + "epoch": 1.0739091718610865, + "ewc_loss": 0.007529604248702526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.529604044975713e-05, + "grad_norm": 3.7660624980926514, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.875946581363678, + "num_tokens": 321978268.0, + "step": 8442 + }, + { + "epoch": 1.0740363821396768, + "ewc_loss": 0.007566623389720917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566623389720917e-05, + "grad_norm": 3.6782281398773193, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.878463864326477, + "num_tokens": 322016969.0, + "step": 8443 + }, + { + "epoch": 1.0741635924182673, + "ewc_loss": 0.0074919466860592365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.491946598747745e-05, + "grad_norm": 3.7015063762664795, + "learning_rate": 1e-06, + "loss": 0.4053, + 
"mean_token_accuracy": 0.8658478260040283, + "num_tokens": 322059362.0, + "step": 8444 + }, + { + "epoch": 1.0742908026968578, + "ewc_loss": 0.007527250796556473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.527251000283286e-05, + "grad_norm": 3.7933318614959717, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8517881631851196, + "num_tokens": 322097334.0, + "step": 8445 + }, + { + "epoch": 1.0744180129754484, + "ewc_loss": 0.007558491080999374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.558491051895544e-05, + "grad_norm": 3.8122799396514893, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.892095148563385, + "num_tokens": 322121988.0, + "step": 8446 + }, + { + "epoch": 1.074545223254039, + "ewc_loss": 0.007549273315817118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.549273141194135e-05, + "grad_norm": 3.7542994022369385, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8674173355102539, + "num_tokens": 322156602.0, + "step": 8447 + }, + { + "epoch": 1.0746724335326294, + "ewc_loss": 0.007518772967159748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.51877305447124e-05, + "grad_norm": 3.6988494396209717, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.868204653263092, + "num_tokens": 322194475.0, + "step": 8448 + }, + { + "epoch": 1.07479964381122, + "ewc_loss": 0.007510816678404808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.51081679482013e-05, + "grad_norm": 3.7462081909179688, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8802710771560669, + "num_tokens": 322232158.0, + "step": 8449 + }, + { + "epoch": 1.0749268540898105, + "ewc_loss": 0.007546952925622463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.546952838310972e-05, + "grad_norm": 3.6683149337768555, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.880841851234436, + "num_tokens": 322272170.0, + "step": 8450 + }, + { + "epoch": 
1.075054064368401, + "ewc_loss": 0.0074899508617818356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.489950803574175e-05, + "grad_norm": 3.789073944091797, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.877119243144989, + "num_tokens": 322305496.0, + "step": 8451 + }, + { + "epoch": 1.0751812746469915, + "ewc_loss": 0.007579477038234472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.579477096442133e-05, + "grad_norm": 3.677212715148926, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8759745359420776, + "num_tokens": 322347552.0, + "step": 8452 + }, + { + "epoch": 1.075308484925582, + "ewc_loss": 0.007467459887266159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.467460090992972e-05, + "grad_norm": 3.6675102710723877, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8766075372695923, + "num_tokens": 322386471.0, + "step": 8453 + }, + { + "epoch": 1.0754356952041726, + "ewc_loss": 0.007498797494918108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.49879764043726e-05, + "grad_norm": 3.6989235877990723, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8381041288375854, + "num_tokens": 322431019.0, + "step": 8454 + }, + { + "epoch": 1.0755629054827631, + "ewc_loss": 0.0075148832984268665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.514883327530697e-05, + "grad_norm": 3.6625497341156006, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8912434577941895, + "num_tokens": 322471525.0, + "step": 8455 + }, + { + "epoch": 1.0756901157613534, + "ewc_loss": 0.007479741703718901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.479741907445714e-05, + "grad_norm": 3.6629035472869873, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.874579906463623, + "num_tokens": 322516594.0, + "step": 8456 + }, + { + "epoch": 1.075817326039944, + "ewc_loss": 0.00746915303170681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.469153206329793e-05, + "grad_norm": 3.6375370025634766, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8761828541755676, + "num_tokens": 322557837.0, + "step": 8457 + }, + { + "epoch": 1.0759445363185345, + "ewc_loss": 0.007466507144272327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.466506940545514e-05, + "grad_norm": 3.7252914905548096, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8669269680976868, + "num_tokens": 322594165.0, + "step": 8458 + }, + { + "epoch": 1.076071746597125, + "ewc_loss": 0.007512520998716354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.512520824093372e-05, + "grad_norm": 3.6944661140441895, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8542135953903198, + "num_tokens": 322637083.0, + "step": 8459 + }, + { + "epoch": 1.0761989568757155, + "ewc_loss": 0.0074542551301419735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.454254955518991e-05, + "grad_norm": 3.7345616817474365, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8701989650726318, + "num_tokens": 322673216.0, + "step": 8460 + }, + { + "epoch": 1.076326167154306, + "ewc_loss": 0.0074867079965770245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.486707909265533e-05, + "grad_norm": 3.705993890762329, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8746241927146912, + "num_tokens": 322709150.0, + "step": 8461 + }, + { + "epoch": 1.0764533774328966, + "ewc_loss": 0.007454329170286655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.454329170286655e-05, + "grad_norm": 3.6777889728546143, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8787447810173035, + "num_tokens": 322747893.0, + "step": 8462 + }, + { + "epoch": 1.0765805877114871, + "ewc_loss": 0.007429021410644054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.429021206917241e-05, + "grad_norm": 3.8067123889923096, + "learning_rate": 1e-06, + "loss": 0.4282, + 
"mean_token_accuracy": 0.8562374114990234, + "num_tokens": 322781667.0, + "step": 8463 + }, + { + "epoch": 1.0767077979900777, + "ewc_loss": 0.007534197997301817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.534198084613308e-05, + "grad_norm": 3.773566484451294, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8718437552452087, + "num_tokens": 322818008.0, + "step": 8464 + }, + { + "epoch": 1.0768350082686682, + "ewc_loss": 0.007454270031303167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.45427023502998e-05, + "grad_norm": 3.6712238788604736, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8520074486732483, + "num_tokens": 322856528.0, + "step": 8465 + }, + { + "epoch": 1.0769622185472587, + "ewc_loss": 0.007436610758304596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.436610758304596e-05, + "grad_norm": 3.792121410369873, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8757857084274292, + "num_tokens": 322888798.0, + "step": 8466 + }, + { + "epoch": 1.077089428825849, + "ewc_loss": 0.007544329855591059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.544329855591059e-05, + "grad_norm": 3.7021377086639404, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.872168779373169, + "num_tokens": 322923301.0, + "step": 8467 + }, + { + "epoch": 1.0772166391044395, + "ewc_loss": 0.007458494044840336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.458493928425014e-05, + "grad_norm": 3.679609775543213, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8835744261741638, + "num_tokens": 322964253.0, + "step": 8468 + }, + { + "epoch": 1.07734384938303, + "ewc_loss": 0.007477314677089453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.477314647985622e-05, + "grad_norm": 3.66860294342041, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8708118200302124, + "num_tokens": 323006926.0, + "step": 8469 + }, + { + "epoch": 1.0774710596616206, 
+ "ewc_loss": 0.0074863010086119175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.4863011832349e-05, + "grad_norm": 3.667027235031128, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8840392231941223, + "num_tokens": 323047541.0, + "step": 8470 + }, + { + "epoch": 1.0775982699402111, + "ewc_loss": 0.007494214456528425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.494214514736086e-05, + "grad_norm": 3.7545671463012695, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8642247915267944, + "num_tokens": 323084272.0, + "step": 8471 + }, + { + "epoch": 1.0777254802188017, + "ewc_loss": 0.007544279098510742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.54427892388776e-05, + "grad_norm": 3.7463526725769043, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8832436800003052, + "num_tokens": 323116698.0, + "step": 8472 + }, + { + "epoch": 1.0778526904973922, + "ewc_loss": 0.0075074718333780766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.50747203710489e-05, + "grad_norm": 3.655648946762085, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8796626925468445, + "num_tokens": 323158836.0, + "step": 8473 + }, + { + "epoch": 1.0779799007759827, + "ewc_loss": 0.007479699794203043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.479699706891552e-05, + "grad_norm": 3.7272164821624756, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8677539825439453, + "num_tokens": 323193777.0, + "step": 8474 + }, + { + "epoch": 1.0781071110545732, + "ewc_loss": 0.00755449291318655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.55449291318655e-05, + "grad_norm": 3.7181055545806885, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8730523586273193, + "num_tokens": 323232546.0, + "step": 8475 + }, + { + "epoch": 1.0782343213331638, + "ewc_loss": 0.007520153187215328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52015330363065e-05, + "grad_norm": 
3.7165005207061768, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8726534843444824, + "num_tokens": 323269403.0, + "step": 8476 + }, + { + "epoch": 1.0783615316117543, + "ewc_loss": 0.007503683678805828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.503683445975184e-05, + "grad_norm": 3.674018383026123, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8848937153816223, + "num_tokens": 323305982.0, + "step": 8477 + }, + { + "epoch": 1.0784887418903448, + "ewc_loss": 0.007489082869142294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.489082781830803e-05, + "grad_norm": 3.7314181327819824, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8682526350021362, + "num_tokens": 323341032.0, + "step": 8478 + }, + { + "epoch": 1.0786159521689354, + "ewc_loss": 0.0075382571667432785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.538257341366261e-05, + "grad_norm": 3.7087981700897217, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8663663864135742, + "num_tokens": 323381647.0, + "step": 8479 + }, + { + "epoch": 1.0787431624475257, + "ewc_loss": 0.0075187380425632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.518738129874691e-05, + "grad_norm": 3.7281742095947266, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.87845379114151, + "num_tokens": 323417185.0, + "step": 8480 + }, + { + "epoch": 1.0788703727261162, + "ewc_loss": 0.007537757512181997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.537757483078167e-05, + "grad_norm": 3.719975233078003, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8701083660125732, + "num_tokens": 323454624.0, + "step": 8481 + }, + { + "epoch": 1.0789975830047067, + "ewc_loss": 0.007524187210947275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.524187094531953e-05, + "grad_norm": 3.6920974254608154, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.871691882610321, + 
"num_tokens": 323492862.0, + "step": 8482 + }, + { + "epoch": 1.0791247932832972, + "ewc_loss": 0.007508855778723955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.508855924243107e-05, + "grad_norm": 3.7170896530151367, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8686648011207581, + "num_tokens": 323536012.0, + "step": 8483 + }, + { + "epoch": 1.0792520035618878, + "ewc_loss": 0.007537988014519215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.537988130934536e-05, + "grad_norm": 3.7031447887420654, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8764876127243042, + "num_tokens": 323573495.0, + "step": 8484 + }, + { + "epoch": 1.0793792138404783, + "ewc_loss": 0.007515412289649248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.515412289649248e-05, + "grad_norm": 3.7399134635925293, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8819231986999512, + "num_tokens": 323610900.0, + "step": 8485 + }, + { + "epoch": 1.0795064241190688, + "ewc_loss": 0.007532518822699785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.532518793595955e-05, + "grad_norm": 3.6822845935821533, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8704966306686401, + "num_tokens": 323650983.0, + "step": 8486 + }, + { + "epoch": 1.0796336343976594, + "ewc_loss": 0.007495260797441006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.495260797441006e-05, + "grad_norm": 3.7490758895874023, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8656586408615112, + "num_tokens": 323690155.0, + "step": 8487 + }, + { + "epoch": 1.0797608446762499, + "ewc_loss": 0.007525710854679346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.525710680056363e-05, + "grad_norm": 3.6430509090423584, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8851627111434937, + "num_tokens": 323732212.0, + "step": 8488 + }, + { + "epoch": 1.0798880549548404, + "ewc_loss": 0.007444947492331266, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.444947550538927e-05, + "grad_norm": 3.6876888275146484, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8803623914718628, + "num_tokens": 323773400.0, + "step": 8489 + }, + { + "epoch": 1.080015265233431, + "ewc_loss": 0.007485693786293268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.485693640774116e-05, + "grad_norm": 3.6995630264282227, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8790158033370972, + "num_tokens": 323812753.0, + "step": 8490 + }, + { + "epoch": 1.0801424755120215, + "ewc_loss": 0.007490011863410473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.490011921618134e-05, + "grad_norm": 3.6894078254699707, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8748636245727539, + "num_tokens": 323855835.0, + "step": 8491 + }, + { + "epoch": 1.0802696857906118, + "ewc_loss": 0.007457729894667864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.457729952875525e-05, + "grad_norm": 3.667819023132324, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8845842480659485, + "num_tokens": 323896065.0, + "step": 8492 + }, + { + "epoch": 1.0803968960692023, + "ewc_loss": 0.007440862245857716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.440862100338563e-05, + "grad_norm": 3.703653335571289, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.875044047832489, + "num_tokens": 323934351.0, + "step": 8493 + }, + { + "epoch": 1.0805241063477928, + "ewc_loss": 0.007463189773261547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.463189831469208e-05, + "grad_norm": 3.7535181045532227, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8620017766952515, + "num_tokens": 323968878.0, + "step": 8494 + }, + { + "epoch": 1.0806513166263834, + "ewc_loss": 0.007479961030185223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.479960913769901e-05, + "grad_norm": 3.6793434619903564, + 
"learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8715466260910034, + "num_tokens": 324004838.0, + "step": 8495 + }, + { + "epoch": 1.080778526904974, + "ewc_loss": 0.007424705196171999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.424705108860508e-05, + "grad_norm": 3.7205357551574707, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8875440359115601, + "num_tokens": 324040851.0, + "step": 8496 + }, + { + "epoch": 1.0809057371835644, + "ewc_loss": 0.007471055723726749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.471055869245902e-05, + "grad_norm": 3.684548854827881, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8594682216644287, + "num_tokens": 324084675.0, + "step": 8497 + }, + { + "epoch": 1.081032947462155, + "ewc_loss": 0.0074286870658397675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.42868724046275e-05, + "grad_norm": 3.7228736877441406, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8774129152297974, + "num_tokens": 324122136.0, + "step": 8498 + }, + { + "epoch": 1.0811601577407455, + "ewc_loss": 0.007468363735824823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.468363764928654e-05, + "grad_norm": 3.7491626739501953, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8727738857269287, + "num_tokens": 324153142.0, + "step": 8499 + }, + { + "epoch": 1.081287368019336, + "ewc_loss": 0.007480226457118988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.480226486222818e-05, + "grad_norm": 3.7641475200653076, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8530523180961609, + "num_tokens": 324187244.0, + "step": 8500 + }, + { + "epoch": 1.0814145782979265, + "ewc_loss": 0.007481992244720459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.481992361135781e-05, + "grad_norm": 3.721876621246338, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8750385642051697, + "num_tokens": 324225644.0, + 
"step": 8501 + }, + { + "epoch": 1.081541788576517, + "ewc_loss": 0.0074698589742183685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.469858974218369e-05, + "grad_norm": 3.667295455932617, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8802991509437561, + "num_tokens": 324270394.0, + "step": 8502 + }, + { + "epoch": 1.0816689988551076, + "ewc_loss": 0.0074585434049367905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.45854340493679e-05, + "grad_norm": 3.640916585922241, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.883117139339447, + "num_tokens": 324311244.0, + "step": 8503 + }, + { + "epoch": 1.0817962091336981, + "ewc_loss": 0.007455982267856598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.455982267856598e-05, + "grad_norm": 3.7311108112335205, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.854498028755188, + "num_tokens": 324350821.0, + "step": 8504 + }, + { + "epoch": 1.0819234194122884, + "ewc_loss": 0.007517638150602579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.517638005083427e-05, + "grad_norm": 3.686957359313965, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8703817129135132, + "num_tokens": 324388255.0, + "step": 8505 + }, + { + "epoch": 1.082050629690879, + "ewc_loss": 0.007449333555996418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.449333497788757e-05, + "grad_norm": 3.7327539920806885, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8487260341644287, + "num_tokens": 324430068.0, + "step": 8506 + }, + { + "epoch": 1.0821778399694695, + "ewc_loss": 0.007508908398449421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.50890831113793e-05, + "grad_norm": 3.7538199424743652, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8798313736915588, + "num_tokens": 324465341.0, + "step": 8507 + }, + { + "epoch": 1.08230505024806, + "ewc_loss": 0.007479948457330465, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.479948544641957e-05, + "grad_norm": 3.6912169456481934, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.879074215888977, + "num_tokens": 324502199.0, + "step": 8508 + }, + { + "epoch": 1.0824322605266505, + "ewc_loss": 0.007459910586476326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.459910557372496e-05, + "grad_norm": 3.723433017730713, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8740180730819702, + "num_tokens": 324535898.0, + "step": 8509 + }, + { + "epoch": 1.082559470805241, + "ewc_loss": 0.007496452424675226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.496452599298209e-05, + "grad_norm": 3.7283637523651123, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8618454933166504, + "num_tokens": 324574404.0, + "step": 8510 + }, + { + "epoch": 1.0826866810838316, + "ewc_loss": 0.007492451462894678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.49245155020617e-05, + "grad_norm": 3.688196897506714, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8720811605453491, + "num_tokens": 324611743.0, + "step": 8511 + }, + { + "epoch": 1.0828138913624221, + "ewc_loss": 0.007463526912033558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.463526708306745e-05, + "grad_norm": 3.680856227874756, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8631407022476196, + "num_tokens": 324653126.0, + "step": 8512 + }, + { + "epoch": 1.0829411016410126, + "ewc_loss": 0.007479261141270399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.479260966647416e-05, + "grad_norm": 3.663691759109497, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8784510493278503, + "num_tokens": 324692957.0, + "step": 8513 + }, + { + "epoch": 1.0830683119196032, + "ewc_loss": 0.007468266412615776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.468266267096624e-05, + "grad_norm": 3.682356119155884, + "learning_rate": 1e-06, + "loss": 0.3874, 
+ "mean_token_accuracy": 0.8674623966217041, + "num_tokens": 324737214.0, + "step": 8514 + }, + { + "epoch": 1.0831955221981937, + "ewc_loss": 0.007483482360839844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.483482477255166e-05, + "grad_norm": 3.675787925720215, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.881827175617218, + "num_tokens": 324773098.0, + "step": 8515 + }, + { + "epoch": 1.083322732476784, + "ewc_loss": 0.0074806516058743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.480651402147487e-05, + "grad_norm": 3.6814093589782715, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8621247410774231, + "num_tokens": 324815846.0, + "step": 8516 + }, + { + "epoch": 1.0834499427553745, + "ewc_loss": 0.007477388717234135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.477388862753287e-05, + "grad_norm": 3.6822290420532227, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8694032430648804, + "num_tokens": 324855912.0, + "step": 8517 + }, + { + "epoch": 1.083577153033965, + "ewc_loss": 0.007472632452845573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.472632569260895e-05, + "grad_norm": 3.7194674015045166, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8670569062232971, + "num_tokens": 324893090.0, + "step": 8518 + }, + { + "epoch": 1.0837043633125556, + "ewc_loss": 0.007507665082812309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.507664849981666e-05, + "grad_norm": 3.740539073944092, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8718565702438354, + "num_tokens": 324929779.0, + "step": 8519 + }, + { + "epoch": 1.0838315735911461, + "ewc_loss": 0.007495620287954807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.495620229747146e-05, + "grad_norm": 3.7454512119293213, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.875809907913208, + "num_tokens": 324961349.0, + "step": 8520 + }, + { + "epoch": 
1.0839587838697367, + "ewc_loss": 0.007506147027015686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.506147085223347e-05, + "grad_norm": 3.7073333263397217, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8733521103858948, + "num_tokens": 324998688.0, + "step": 8521 + }, + { + "epoch": 1.0840859941483272, + "ewc_loss": 0.0074854642152786255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.485464448109269e-05, + "grad_norm": 3.7150754928588867, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8665608167648315, + "num_tokens": 325040601.0, + "step": 8522 + }, + { + "epoch": 1.0842132044269177, + "ewc_loss": 0.007498844061046839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.498844206565991e-05, + "grad_norm": 3.6471266746520996, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8766124248504639, + "num_tokens": 325081729.0, + "step": 8523 + }, + { + "epoch": 1.0843404147055082, + "ewc_loss": 0.0074638803489506245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.463880319846794e-05, + "grad_norm": 3.7469942569732666, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8614908456802368, + "num_tokens": 325118886.0, + "step": 8524 + }, + { + "epoch": 1.0844676249840988, + "ewc_loss": 0.007544500287622213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.54450011299923e-05, + "grad_norm": 3.70711088180542, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8902732729911804, + "num_tokens": 325156417.0, + "step": 8525 + }, + { + "epoch": 1.0845948352626893, + "ewc_loss": 0.007469760254025459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.469760021194816e-05, + "grad_norm": 3.617560386657715, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8841829895973206, + "num_tokens": 325200902.0, + "step": 8526 + }, + { + "epoch": 1.0847220455412798, + "ewc_loss": 0.0074408226646482944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.440822810167447e-05, + "grad_norm": 3.7601325511932373, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8752866983413696, + "num_tokens": 325233362.0, + "step": 8527 + }, + { + "epoch": 1.0848492558198704, + "ewc_loss": 0.00756492093205452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564920815639198e-05, + "grad_norm": 3.8096892833709717, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8707390427589417, + "num_tokens": 325274646.0, + "step": 8528 + }, + { + "epoch": 1.0849764660984607, + "ewc_loss": 0.007508996408432722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.508996350225061e-05, + "grad_norm": 3.714752674102783, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8877776265144348, + "num_tokens": 325304735.0, + "step": 8529 + }, + { + "epoch": 1.0851036763770512, + "ewc_loss": 0.007468618452548981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.46861842344515e-05, + "grad_norm": 3.696070432662964, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8751569986343384, + "num_tokens": 325339833.0, + "step": 8530 + }, + { + "epoch": 1.0852308866556417, + "ewc_loss": 0.007486213929951191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.48621387174353e-05, + "grad_norm": 3.76418137550354, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8602862358093262, + "num_tokens": 325373838.0, + "step": 8531 + }, + { + "epoch": 1.0853580969342322, + "ewc_loss": 0.007550886366516352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.5508862209972e-05, + "grad_norm": 3.6684751510620117, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8928358554840088, + "num_tokens": 325412913.0, + "step": 8532 + }, + { + "epoch": 1.0854853072128228, + "ewc_loss": 0.007469650823622942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.469650881830603e-05, + "grad_norm": 3.703125238418579, + "learning_rate": 1e-06, + "loss": 0.3568, + 
"mean_token_accuracy": 0.8777036666870117, + "num_tokens": 325451830.0, + "step": 8533 + }, + { + "epoch": 1.0856125174914133, + "ewc_loss": 0.007529327645897865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.529327558586374e-05, + "grad_norm": 3.785991668701172, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8547024726867676, + "num_tokens": 325487162.0, + "step": 8534 + }, + { + "epoch": 1.0857397277700038, + "ewc_loss": 0.007569247856736183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.569247827632353e-05, + "grad_norm": 3.692580461502075, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8806100487709045, + "num_tokens": 325525906.0, + "step": 8535 + }, + { + "epoch": 1.0858669380485944, + "ewc_loss": 0.007487534545361996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.487534458050504e-05, + "grad_norm": 3.7385828495025635, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8833988308906555, + "num_tokens": 325559925.0, + "step": 8536 + }, + { + "epoch": 1.0859941483271849, + "ewc_loss": 0.0075602661818265915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.560266385553405e-05, + "grad_norm": 3.6915388107299805, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8721187114715576, + "num_tokens": 325599980.0, + "step": 8537 + }, + { + "epoch": 1.0861213586057754, + "ewc_loss": 0.007525804452598095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.525804539909586e-05, + "grad_norm": 3.8018150329589844, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8592973947525024, + "num_tokens": 325631745.0, + "step": 8538 + }, + { + "epoch": 1.086248568884366, + "ewc_loss": 0.007598846219480038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.598846423206851e-05, + "grad_norm": 3.677452325820923, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8689277172088623, + "num_tokens": 325673583.0, + "step": 8539 + }, + { + "epoch": 
1.0863757791629565, + "ewc_loss": 0.0074912472628057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.491247379221022e-05, + "grad_norm": 3.719496250152588, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.865668535232544, + "num_tokens": 325712995.0, + "step": 8540 + }, + { + "epoch": 1.0865029894415468, + "ewc_loss": 0.007567724213004112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.567724242107943e-05, + "grad_norm": 3.722323417663574, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.878294825553894, + "num_tokens": 325748259.0, + "step": 8541 + }, + { + "epoch": 1.0866301997201373, + "ewc_loss": 0.007554837968200445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.554837793577462e-05, + "grad_norm": 3.679704189300537, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.87871915102005, + "num_tokens": 325787676.0, + "step": 8542 + }, + { + "epoch": 1.0867574099987278, + "ewc_loss": 0.0075269038788974285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52690393710509e-05, + "grad_norm": 3.695788860321045, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.849002480506897, + "num_tokens": 325827819.0, + "step": 8543 + }, + { + "epoch": 1.0868846202773184, + "ewc_loss": 0.007556907832622528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.556907803518698e-05, + "grad_norm": 3.7306230068206787, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8805059194564819, + "num_tokens": 325866328.0, + "step": 8544 + }, + { + "epoch": 1.0870118305559089, + "ewc_loss": 0.007555403281003237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.555403135484084e-05, + "grad_norm": 3.6817421913146973, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8670740127563477, + "num_tokens": 325906752.0, + "step": 8545 + }, + { + "epoch": 1.0871390408344994, + "ewc_loss": 0.007521864026784897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.521863881265745e-05, + 
"grad_norm": 3.737368106842041, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.86555415391922, + "num_tokens": 325943765.0, + "step": 8546 + }, + { + "epoch": 1.08726625111309, + "ewc_loss": 0.007559084799140692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.559084770036861e-05, + "grad_norm": 3.7098686695098877, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.879669189453125, + "num_tokens": 325981471.0, + "step": 8547 + }, + { + "epoch": 1.0873934613916805, + "ewc_loss": 0.007506814785301685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.506815018132329e-05, + "grad_norm": 3.7531027793884277, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8767359852790833, + "num_tokens": 326017253.0, + "step": 8548 + }, + { + "epoch": 1.087520671670271, + "ewc_loss": 0.0075620077550411224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.562007522210479e-05, + "grad_norm": 3.7197999954223633, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8739113807678223, + "num_tokens": 326053484.0, + "step": 8549 + }, + { + "epoch": 1.0876478819488615, + "ewc_loss": 0.007509407587349415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.509407441830263e-05, + "grad_norm": 3.7742550373077393, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8793125152587891, + "num_tokens": 326084779.0, + "step": 8550 + }, + { + "epoch": 1.087775092227452, + "ewc_loss": 0.007567672058939934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.56767185521312e-05, + "grad_norm": 3.704662322998047, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8706318140029907, + "num_tokens": 326120795.0, + "step": 8551 + }, + { + "epoch": 1.0879023025060426, + "ewc_loss": 0.007509477902203798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.50947801861912e-05, + "grad_norm": 3.729367733001709, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8703309297561646, + 
"num_tokens": 326155991.0, + "step": 8552 + }, + { + "epoch": 1.0880295127846331, + "ewc_loss": 0.007534731179475784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.534731412306428e-05, + "grad_norm": 3.678070545196533, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8674898147583008, + "num_tokens": 326195738.0, + "step": 8553 + }, + { + "epoch": 1.0881567230632234, + "ewc_loss": 0.007505577988922596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.505578105337918e-05, + "grad_norm": 3.6776201725006104, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8728761672973633, + "num_tokens": 326237773.0, + "step": 8554 + }, + { + "epoch": 1.088283933341814, + "ewc_loss": 0.007523986976593733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.523987005697563e-05, + "grad_norm": 3.62661075592041, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8864253759384155, + "num_tokens": 326278589.0, + "step": 8555 + }, + { + "epoch": 1.0884111436204045, + "ewc_loss": 0.0075028096325695515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.502809603465721e-05, + "grad_norm": 3.7308108806610107, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8666573166847229, + "num_tokens": 326316862.0, + "step": 8556 + }, + { + "epoch": 1.088538353898995, + "ewc_loss": 0.007580489851534367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.580489909742028e-05, + "grad_norm": 3.7480266094207764, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8701019883155823, + "num_tokens": 326353892.0, + "step": 8557 + }, + { + "epoch": 1.0886655641775855, + "ewc_loss": 0.007539046928286552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.5390467827674e-05, + "grad_norm": 3.7161965370178223, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8763334155082703, + "num_tokens": 326389478.0, + "step": 8558 + }, + { + "epoch": 1.088792774456176, + "ewc_loss": 0.007515375968068838, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.515375909861177e-05, + "grad_norm": 3.7313783168792725, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8676944971084595, + "num_tokens": 326428072.0, + "step": 8559 + }, + { + "epoch": 1.0889199847347666, + "ewc_loss": 0.007541203871369362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.541204104200006e-05, + "grad_norm": 3.71393084526062, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8748766183853149, + "num_tokens": 326466374.0, + "step": 8560 + }, + { + "epoch": 1.0890471950133571, + "ewc_loss": 0.0075222160667181015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.522216037614271e-05, + "grad_norm": 3.804551362991333, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8674928545951843, + "num_tokens": 326501081.0, + "step": 8561 + }, + { + "epoch": 1.0891744052919476, + "ewc_loss": 0.007585603278130293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.585603452753276e-05, + "grad_norm": 3.787777900695801, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8819370269775391, + "num_tokens": 326532539.0, + "step": 8562 + }, + { + "epoch": 1.0893016155705382, + "ewc_loss": 0.007554974406957626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.554974581580609e-05, + "grad_norm": 3.7248594760894775, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8778183460235596, + "num_tokens": 326569481.0, + "step": 8563 + }, + { + "epoch": 1.0894288258491287, + "ewc_loss": 0.007512995973229408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.512995944125578e-05, + "grad_norm": 3.672203779220581, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8706635236740112, + "num_tokens": 326609746.0, + "step": 8564 + }, + { + "epoch": 1.089556036127719, + "ewc_loss": 0.007510465569794178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.510465366067365e-05, + "grad_norm": 3.712568759918213, + "learning_rate": 
1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8628393411636353, + "num_tokens": 326652379.0, + "step": 8565 + }, + { + "epoch": 1.0896832464063095, + "ewc_loss": 0.007567457854747772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.567457942059264e-05, + "grad_norm": 3.7710554599761963, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8681679964065552, + "num_tokens": 326687617.0, + "step": 8566 + }, + { + "epoch": 1.0898104566849, + "ewc_loss": 0.007571501191705465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.571501191705465e-05, + "grad_norm": 3.7203421592712402, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8664696216583252, + "num_tokens": 326724155.0, + "step": 8567 + }, + { + "epoch": 1.0899376669634906, + "ewc_loss": 0.007532045245170593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.532045128755271e-05, + "grad_norm": 3.645090341567993, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8699091672897339, + "num_tokens": 326768234.0, + "step": 8568 + }, + { + "epoch": 1.0900648772420811, + "ewc_loss": 0.007534192409366369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.534192263847217e-05, + "grad_norm": 3.6734683513641357, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8759340047836304, + "num_tokens": 326810423.0, + "step": 8569 + }, + { + "epoch": 1.0901920875206716, + "ewc_loss": 0.007555531803518534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.555531919933856e-05, + "grad_norm": 3.7129762172698975, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8697096109390259, + "num_tokens": 326851834.0, + "step": 8570 + }, + { + "epoch": 1.0903192977992622, + "ewc_loss": 0.007562733720988035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.562733662780374e-05, + "grad_norm": 3.7278084754943848, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.860220193862915, + "num_tokens": 326889334.0, + "step": 8571 + }, + { 
+ "epoch": 1.0904465080778527, + "ewc_loss": 0.007555059157311916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.555058982688934e-05, + "grad_norm": 3.6881463527679443, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8609411716461182, + "num_tokens": 326931884.0, + "step": 8572 + }, + { + "epoch": 1.0905737183564432, + "ewc_loss": 0.007532299961894751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.532299787271768e-05, + "grad_norm": 3.7217862606048584, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8783818483352661, + "num_tokens": 326969043.0, + "step": 8573 + }, + { + "epoch": 1.0907009286350338, + "ewc_loss": 0.007582268211990595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.582268153782934e-05, + "grad_norm": 3.8169355392456055, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8722190856933594, + "num_tokens": 326998369.0, + "step": 8574 + }, + { + "epoch": 1.0908281389136243, + "ewc_loss": 0.007598242722451687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.598242518724874e-05, + "grad_norm": 3.6839630603790283, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8717217445373535, + "num_tokens": 327036909.0, + "step": 8575 + }, + { + "epoch": 1.0909553491922148, + "ewc_loss": 0.007512539159506559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.512539013987407e-05, + "grad_norm": 3.7653768062591553, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8631538152694702, + "num_tokens": 327076053.0, + "step": 8576 + }, + { + "epoch": 1.0910825594708053, + "ewc_loss": 0.007588370703160763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58837049943395e-05, + "grad_norm": 3.6848340034484863, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8780136704444885, + "num_tokens": 327112285.0, + "step": 8577 + }, + { + "epoch": 1.0912097697493957, + "ewc_loss": 0.007528300862759352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.528300920967013e-05, + "grad_norm": 3.7245728969573975, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8793039917945862, + "num_tokens": 327151591.0, + "step": 8578 + }, + { + "epoch": 1.0913369800279862, + "ewc_loss": 0.007570934481918812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57093439460732e-05, + "grad_norm": 3.729417562484741, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8577684164047241, + "num_tokens": 327193600.0, + "step": 8579 + }, + { + "epoch": 1.0914641903065767, + "ewc_loss": 0.007543519139289856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.543519313912839e-05, + "grad_norm": 3.7497479915618896, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8637640476226807, + "num_tokens": 327230249.0, + "step": 8580 + }, + { + "epoch": 1.0915914005851672, + "ewc_loss": 0.007557203061878681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.557203207397833e-05, + "grad_norm": 3.6812729835510254, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8836219310760498, + "num_tokens": 327270043.0, + "step": 8581 + }, + { + "epoch": 1.0917186108637578, + "ewc_loss": 0.007507705595344305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.507705595344305e-05, + "grad_norm": 3.7020938396453857, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8744940757751465, + "num_tokens": 327309396.0, + "step": 8582 + }, + { + "epoch": 1.0918458211423483, + "ewc_loss": 0.007538176607340574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.538176578236744e-05, + "grad_norm": 3.6506125926971436, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8805190324783325, + "num_tokens": 327349497.0, + "step": 8583 + }, + { + "epoch": 1.0919730314209388, + "ewc_loss": 0.007507550064474344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.507549889851362e-05, + "grad_norm": 3.7914013862609863, + "learning_rate": 1e-06, + "loss": 0.397, + 
"mean_token_accuracy": 0.8648171424865723, + "num_tokens": 327385648.0, + "step": 8584 + }, + { + "epoch": 1.0921002416995294, + "ewc_loss": 0.007592865731567144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.592865586047992e-05, + "grad_norm": 3.6772873401641846, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8734046220779419, + "num_tokens": 327426379.0, + "step": 8585 + }, + { + "epoch": 1.0922274519781199, + "ewc_loss": 0.0074947914108633995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.494791498174891e-05, + "grad_norm": 3.726947784423828, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8776212930679321, + "num_tokens": 327467555.0, + "step": 8586 + }, + { + "epoch": 1.0923546622567104, + "ewc_loss": 0.007546654436737299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.54665452404879e-05, + "grad_norm": 3.741260290145874, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.868289589881897, + "num_tokens": 327505303.0, + "step": 8587 + }, + { + "epoch": 1.092481872535301, + "ewc_loss": 0.007539798505604267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.539798389188945e-05, + "grad_norm": 3.705429792404175, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8772605657577515, + "num_tokens": 327547854.0, + "step": 8588 + }, + { + "epoch": 1.0926090828138915, + "ewc_loss": 0.007492139469832182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.492139411624521e-05, + "grad_norm": 3.7615208625793457, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8740750551223755, + "num_tokens": 327584132.0, + "step": 8589 + }, + { + "epoch": 1.0927362930924818, + "ewc_loss": 0.007551341783255339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.551341695943847e-05, + "grad_norm": 3.668396234512329, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8608458042144775, + "num_tokens": 327629250.0, + "step": 8590 + }, + { + "epoch": 
1.0928635033710723, + "ewc_loss": 0.007455881219357252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.455881132045761e-05, + "grad_norm": 3.7516236305236816, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8643922805786133, + "num_tokens": 327666988.0, + "step": 8591 + }, + { + "epoch": 1.0929907136496628, + "ewc_loss": 0.007545725908130407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.54572611185722e-05, + "grad_norm": 3.730043411254883, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8739057779312134, + "num_tokens": 327705955.0, + "step": 8592 + }, + { + "epoch": 1.0931179239282534, + "ewc_loss": 0.007505767047405243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.505767280235887e-05, + "grad_norm": 3.7137985229492188, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.864055335521698, + "num_tokens": 327746362.0, + "step": 8593 + }, + { + "epoch": 1.0932451342068439, + "ewc_loss": 0.007478104904294014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.478104816982523e-05, + "grad_norm": 3.675915241241455, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8707528710365295, + "num_tokens": 327789034.0, + "step": 8594 + }, + { + "epoch": 1.0933723444854344, + "ewc_loss": 0.007474839221686125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.474839367205277e-05, + "grad_norm": 3.8201887607574463, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8627752661705017, + "num_tokens": 327824203.0, + "step": 8595 + }, + { + "epoch": 1.093499554764025, + "ewc_loss": 0.007562560494989157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.562560494989157e-05, + "grad_norm": 3.735356330871582, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8725739121437073, + "num_tokens": 327860340.0, + "step": 8596 + }, + { + "epoch": 1.0936267650426155, + "ewc_loss": 0.007474185433238745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.474185258615762e-05, + "grad_norm": 3.6997101306915283, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8648682236671448, + "num_tokens": 327901709.0, + "step": 8597 + }, + { + "epoch": 1.093753975321206, + "ewc_loss": 0.007479919586330652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.4799194408115e-05, + "grad_norm": 3.677955389022827, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8735105991363525, + "num_tokens": 327940327.0, + "step": 8598 + }, + { + "epoch": 1.0938811855997965, + "ewc_loss": 0.007476476486772299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.476476457668468e-05, + "grad_norm": 3.7683279514312744, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8764801025390625, + "num_tokens": 327977966.0, + "step": 8599 + }, + { + "epoch": 1.094008395878387, + "ewc_loss": 0.00753837451338768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.53837448428385e-05, + "grad_norm": 3.6635377407073975, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8686187267303467, + "num_tokens": 328021143.0, + "step": 8600 + }, + { + "epoch": 1.0941356061569776, + "ewc_loss": 0.007446335628628731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.446335803251714e-05, + "grad_norm": 3.6951801776885986, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.876957356929779, + "num_tokens": 328060792.0, + "step": 8601 + }, + { + "epoch": 1.094262816435568, + "ewc_loss": 0.007505964487791061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.505964458687231e-05, + "grad_norm": 3.710787296295166, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8715075254440308, + "num_tokens": 328099698.0, + "step": 8602 + }, + { + "epoch": 1.0943900267141584, + "ewc_loss": 0.007505673915147781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.505674147978425e-05, + "grad_norm": 3.6625711917877197, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 
0.8762427568435669, + "num_tokens": 328141075.0, + "step": 8603 + }, + { + "epoch": 1.094517236992749, + "ewc_loss": 0.007455609738826752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.455609738826752e-05, + "grad_norm": 3.738663911819458, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8665647506713867, + "num_tokens": 328178945.0, + "step": 8604 + }, + { + "epoch": 1.0946444472713395, + "ewc_loss": 0.007510321214795113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.510321302106604e-05, + "grad_norm": 3.775869607925415, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8467054963111877, + "num_tokens": 328214423.0, + "step": 8605 + }, + { + "epoch": 1.09477165754993, + "ewc_loss": 0.007523761596530676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.523761451011524e-05, + "grad_norm": 3.765784740447998, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8714025020599365, + "num_tokens": 328248640.0, + "step": 8606 + }, + { + "epoch": 1.0948988678285205, + "ewc_loss": 0.007491917349398136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.491917494917288e-05, + "grad_norm": 3.7771801948547363, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8515291213989258, + "num_tokens": 328283016.0, + "step": 8607 + }, + { + "epoch": 1.095026078107111, + "ewc_loss": 0.007523011416196823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.523011299781501e-05, + "grad_norm": 3.6748509407043457, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8749598264694214, + "num_tokens": 328322377.0, + "step": 8608 + }, + { + "epoch": 1.0951532883857016, + "ewc_loss": 0.007450337056070566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.450336852343753e-05, + "grad_norm": 3.747746706008911, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.873691976070404, + "num_tokens": 328359409.0, + "step": 8609 + }, + { + "epoch": 1.0952804986642921, + "ewc_loss": 
0.007550343405455351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.550343434559181e-05, + "grad_norm": 3.6863138675689697, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8750559091567993, + "num_tokens": 328401257.0, + "step": 8610 + }, + { + "epoch": 1.0954077089428826, + "ewc_loss": 0.007473852951079607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.473852747352794e-05, + "grad_norm": 3.7440974712371826, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8741602897644043, + "num_tokens": 328435886.0, + "step": 8611 + }, + { + "epoch": 1.0955349192214732, + "ewc_loss": 0.007538557983934879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.538557838415727e-05, + "grad_norm": 3.7579846382141113, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8620390892028809, + "num_tokens": 328472091.0, + "step": 8612 + }, + { + "epoch": 1.0956621295000637, + "ewc_loss": 0.007518650032579899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.51865009078756e-05, + "grad_norm": 3.710590124130249, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8651621341705322, + "num_tokens": 328509453.0, + "step": 8613 + }, + { + "epoch": 1.095789339778654, + "ewc_loss": 0.007517112884670496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.517112680943683e-05, + "grad_norm": 3.724609375, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8720704317092896, + "num_tokens": 328545590.0, + "step": 8614 + }, + { + "epoch": 1.0959165500572445, + "ewc_loss": 0.007539665326476097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.539665239164606e-05, + "grad_norm": 3.730778217315674, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8615769147872925, + "num_tokens": 328583093.0, + "step": 8615 + }, + { + "epoch": 1.096043760335835, + "ewc_loss": 0.007542463019490242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.542462844867259e-05, + "grad_norm": 3.7293314933776855, 
+ "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8561668395996094, + "num_tokens": 328625312.0, + "step": 8616 + }, + { + "epoch": 1.0961709706144256, + "ewc_loss": 0.007543702609837055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.543702668044716e-05, + "grad_norm": 3.7941036224365234, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8664827346801758, + "num_tokens": 328655392.0, + "step": 8617 + }, + { + "epoch": 1.0962981808930161, + "ewc_loss": 0.00757997902110219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.579979137517512e-05, + "grad_norm": 3.716752767562866, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8552912473678589, + "num_tokens": 328693686.0, + "step": 8618 + }, + { + "epoch": 1.0964253911716066, + "ewc_loss": 0.007532855495810509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.532855670433491e-05, + "grad_norm": 3.7122838497161865, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8817275762557983, + "num_tokens": 328728051.0, + "step": 8619 + }, + { + "epoch": 1.0965526014501972, + "ewc_loss": 0.007570214569568634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.570214802399278e-05, + "grad_norm": 3.7144784927368164, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8612730503082275, + "num_tokens": 328768091.0, + "step": 8620 + }, + { + "epoch": 1.0966798117287877, + "ewc_loss": 0.007568273693323135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.568273576907814e-05, + "grad_norm": 3.722339391708374, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8691187500953674, + "num_tokens": 328803851.0, + "step": 8621 + }, + { + "epoch": 1.0968070220073782, + "ewc_loss": 0.007564240600913763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564240513602272e-05, + "grad_norm": 3.738359212875366, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8824000358581543, + "num_tokens": 328837429.0, + 
"step": 8622 + }, + { + "epoch": 1.0969342322859688, + "ewc_loss": 0.007576559670269489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.576559437438846e-05, + "grad_norm": 3.6991305351257324, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8698722124099731, + "num_tokens": 328874264.0, + "step": 8623 + }, + { + "epoch": 1.0970614425645593, + "ewc_loss": 0.007566159125417471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566159183625132e-05, + "grad_norm": 3.724153518676758, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8733400702476501, + "num_tokens": 328911627.0, + "step": 8624 + }, + { + "epoch": 1.0971886528431498, + "ewc_loss": 0.00759280938655138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.592809561174363e-05, + "grad_norm": 3.7584879398345947, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8650181293487549, + "num_tokens": 328944471.0, + "step": 8625 + }, + { + "epoch": 1.0973158631217403, + "ewc_loss": 0.007606962695717812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.606962753925472e-05, + "grad_norm": 3.6511082649230957, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8785183429718018, + "num_tokens": 328983826.0, + "step": 8626 + }, + { + "epoch": 1.0974430734003306, + "ewc_loss": 0.007535090204328299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.535090117016807e-05, + "grad_norm": 3.7668683528900146, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.87092524766922, + "num_tokens": 329019385.0, + "step": 8627 + }, + { + "epoch": 1.0975702836789212, + "ewc_loss": 0.007643686607480049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.64368669479154e-05, + "grad_norm": 3.667848825454712, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8590998649597168, + "num_tokens": 329065179.0, + "step": 8628 + }, + { + "epoch": 1.0976974939575117, + "ewc_loss": 0.00755411759018898, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.554117473773658e-05, + "grad_norm": 3.714918375015259, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8764082193374634, + "num_tokens": 329101077.0, + "step": 8629 + }, + { + "epoch": 1.0978247042361022, + "ewc_loss": 0.007613943424075842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.61394330766052e-05, + "grad_norm": 3.7116124629974365, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8777222037315369, + "num_tokens": 329137538.0, + "step": 8630 + }, + { + "epoch": 1.0979519145146928, + "ewc_loss": 0.007591061759740114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.591061876155436e-05, + "grad_norm": 3.748338222503662, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8860710859298706, + "num_tokens": 329171952.0, + "step": 8631 + }, + { + "epoch": 1.0980791247932833, + "ewc_loss": 0.007613789290189743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6137890573591e-05, + "grad_norm": 3.70306658744812, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8795074224472046, + "num_tokens": 329214956.0, + "step": 8632 + }, + { + "epoch": 1.0982063350718738, + "ewc_loss": 0.0075739044696092606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57390444050543e-05, + "grad_norm": 3.670788049697876, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8624982237815857, + "num_tokens": 329254228.0, + "step": 8633 + }, + { + "epoch": 1.0983335453504643, + "ewc_loss": 0.007560457102954388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.560457015642896e-05, + "grad_norm": 3.7901668548583984, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8738987445831299, + "num_tokens": 329288337.0, + "step": 8634 + }, + { + "epoch": 1.0984607556290549, + "ewc_loss": 0.007638487499207258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.638487295480445e-05, + "grad_norm": 3.735501766204834, + "learning_rate": 1e-06, + "loss": 0.4084, + 
"mean_token_accuracy": 0.8653554320335388, + "num_tokens": 329329118.0, + "step": 8635 + }, + { + "epoch": 1.0985879659076454, + "ewc_loss": 0.007547651883214712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.547652057837695e-05, + "grad_norm": 3.774540901184082, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8587349653244019, + "num_tokens": 329366301.0, + "step": 8636 + }, + { + "epoch": 1.098715176186236, + "ewc_loss": 0.0075790309347212315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.579031080240384e-05, + "grad_norm": 3.76273512840271, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8608242273330688, + "num_tokens": 329401612.0, + "step": 8637 + }, + { + "epoch": 1.0988423864648265, + "ewc_loss": 0.007564437575638294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564437692053616e-05, + "grad_norm": 3.6724796295166016, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8735011219978333, + "num_tokens": 329440403.0, + "step": 8638 + }, + { + "epoch": 1.0989695967434168, + "ewc_loss": 0.007515884470194578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.515884499298409e-05, + "grad_norm": 3.6489155292510986, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8813053369522095, + "num_tokens": 329478699.0, + "step": 8639 + }, + { + "epoch": 1.0990968070220073, + "ewc_loss": 0.007536840159446001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.536839984823018e-05, + "grad_norm": 3.7146341800689697, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8700199127197266, + "num_tokens": 329517860.0, + "step": 8640 + }, + { + "epoch": 1.0992240173005978, + "ewc_loss": 0.0075771925039589405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57719244575128e-05, + "grad_norm": 3.752697229385376, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8769184350967407, + "num_tokens": 329555385.0, + "step": 8641 + }, + { + "epoch": 
1.0993512275791884, + "ewc_loss": 0.007567398250102997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.567398279206827e-05, + "grad_norm": 3.681034564971924, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8701286315917969, + "num_tokens": 329594775.0, + "step": 8642 + }, + { + "epoch": 1.0994784378577789, + "ewc_loss": 0.007519885431975126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.519885548390448e-05, + "grad_norm": 3.7171497344970703, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8753089904785156, + "num_tokens": 329629872.0, + "step": 8643 + }, + { + "epoch": 1.0996056481363694, + "ewc_loss": 0.007557648699730635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.557648496003821e-05, + "grad_norm": 3.7505578994750977, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8663277626037598, + "num_tokens": 329665263.0, + "step": 8644 + }, + { + "epoch": 1.09973285841496, + "ewc_loss": 0.007580096833407879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.580097008030862e-05, + "grad_norm": 3.674560070037842, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8620280027389526, + "num_tokens": 329705853.0, + "step": 8645 + }, + { + "epoch": 1.0998600686935505, + "ewc_loss": 0.007517675869166851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.51767584006302e-05, + "grad_norm": 3.7505877017974854, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.87824547290802, + "num_tokens": 329741690.0, + "step": 8646 + }, + { + "epoch": 1.099987278972141, + "ewc_loss": 0.0075975689105689526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.5975687650498e-05, + "grad_norm": 3.7113797664642334, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8634785413742065, + "num_tokens": 329781548.0, + "step": 8647 + }, + { + "epoch": 1.1001144892507315, + "ewc_loss": 0.007555053103715181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.555053161922842e-05, 
+ "grad_norm": 3.7198381423950195, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8784431219100952, + "num_tokens": 329822304.0, + "step": 8648 + }, + { + "epoch": 1.100241699529322, + "ewc_loss": 0.007548459805548191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.548459689132869e-05, + "grad_norm": 3.750998020172119, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8727420568466187, + "num_tokens": 329856065.0, + "step": 8649 + }, + { + "epoch": 1.1003689098079126, + "ewc_loss": 0.007576259784400463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.576259667985141e-05, + "grad_norm": 3.752988576889038, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8550665974617004, + "num_tokens": 329895002.0, + "step": 8650 + }, + { + "epoch": 1.100496120086503, + "ewc_loss": 0.007569171488285065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.569171430077404e-05, + "grad_norm": 3.6769700050354004, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8657366037368774, + "num_tokens": 329938810.0, + "step": 8651 + }, + { + "epoch": 1.1006233303650934, + "ewc_loss": 0.007543298881500959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.543298852397129e-05, + "grad_norm": 3.6917145252227783, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8838596343994141, + "num_tokens": 329976513.0, + "step": 8652 + }, + { + "epoch": 1.100750540643684, + "ewc_loss": 0.007570228073745966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.570227899122983e-05, + "grad_norm": 3.699262857437134, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8793735504150391, + "num_tokens": 330014013.0, + "step": 8653 + }, + { + "epoch": 1.1008777509222745, + "ewc_loss": 0.007555392105132341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.555392221547663e-05, + "grad_norm": 3.7086102962493896, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 
0.8675045371055603, + "num_tokens": 330052577.0, + "step": 8654 + }, + { + "epoch": 1.101004961200865, + "ewc_loss": 0.007589062675833702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.589062443003058e-05, + "grad_norm": 3.7395339012145996, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8701919317245483, + "num_tokens": 330091385.0, + "step": 8655 + }, + { + "epoch": 1.1011321714794555, + "ewc_loss": 0.007583220489323139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58322057663463e-05, + "grad_norm": 3.7152628898620605, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8811026811599731, + "num_tokens": 330130285.0, + "step": 8656 + }, + { + "epoch": 1.101259381758046, + "ewc_loss": 0.007559876423329115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.559876394225284e-05, + "grad_norm": 3.7647242546081543, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8577466607093811, + "num_tokens": 330166823.0, + "step": 8657 + }, + { + "epoch": 1.1013865920366366, + "ewc_loss": 0.007600758690387011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.600758544867858e-05, + "grad_norm": 3.7905123233795166, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8731403946876526, + "num_tokens": 330200655.0, + "step": 8658 + }, + { + "epoch": 1.101513802315227, + "ewc_loss": 0.007593383546918631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.593383634230122e-05, + "grad_norm": 3.6344082355499268, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8709800243377686, + "num_tokens": 330243656.0, + "step": 8659 + }, + { + "epoch": 1.1016410125938176, + "ewc_loss": 0.007488915231078863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.488915434805676e-05, + "grad_norm": 3.7195112705230713, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8634728193283081, + "num_tokens": 330287200.0, + "step": 8660 + }, + { + "epoch": 1.1017682228724082, + "ewc_loss": 
0.0076002622954547405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.600262324558571e-05, + "grad_norm": 3.722121477127075, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8738726377487183, + "num_tokens": 330324794.0, + "step": 8661 + }, + { + "epoch": 1.1018954331509987, + "ewc_loss": 0.007559331599622965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.559331424999982e-05, + "grad_norm": 3.7148261070251465, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8741707801818848, + "num_tokens": 330365297.0, + "step": 8662 + }, + { + "epoch": 1.102022643429589, + "ewc_loss": 0.007521636318415403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52163614379242e-05, + "grad_norm": 3.627835750579834, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8877028226852417, + "num_tokens": 330408046.0, + "step": 8663 + }, + { + "epoch": 1.1021498537081795, + "ewc_loss": 0.007499980740249157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.499980711145326e-05, + "grad_norm": 3.7036590576171875, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8665655851364136, + "num_tokens": 330447815.0, + "step": 8664 + }, + { + "epoch": 1.10227706398677, + "ewc_loss": 0.007563354447484016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.563354301964864e-05, + "grad_norm": 3.723999261856079, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8598247766494751, + "num_tokens": 330487796.0, + "step": 8665 + }, + { + "epoch": 1.1024042742653606, + "ewc_loss": 0.007541131228208542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.541131344623864e-05, + "grad_norm": 3.68760085105896, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8817543983459473, + "num_tokens": 330531155.0, + "step": 8666 + }, + { + "epoch": 1.1025314845439511, + "ewc_loss": 0.00750286178663373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.502861990360543e-05, + "grad_norm": 3.729071617126465, 
+ "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.881014347076416, + "num_tokens": 330568611.0, + "step": 8667 + }, + { + "epoch": 1.1026586948225416, + "ewc_loss": 0.007523376494646072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.523376552853733e-05, + "grad_norm": 3.7963438034057617, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.866466224193573, + "num_tokens": 330608085.0, + "step": 8668 + }, + { + "epoch": 1.1027859051011322, + "ewc_loss": 0.007535042706876993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.535042823292315e-05, + "grad_norm": 3.7874889373779297, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8565025925636292, + "num_tokens": 330644776.0, + "step": 8669 + }, + { + "epoch": 1.1029131153797227, + "ewc_loss": 0.007518845144659281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.51884508645162e-05, + "grad_norm": 3.7466700077056885, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8632813692092896, + "num_tokens": 330686401.0, + "step": 8670 + }, + { + "epoch": 1.1030403256583132, + "ewc_loss": 0.0074896421283483505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.489642302971333e-05, + "grad_norm": 3.7426533699035645, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8755651116371155, + "num_tokens": 330723115.0, + "step": 8671 + }, + { + "epoch": 1.1031675359369038, + "ewc_loss": 0.00750218378379941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.502183871110901e-05, + "grad_norm": 3.7250003814697266, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.886452317237854, + "num_tokens": 330761170.0, + "step": 8672 + }, + { + "epoch": 1.1032947462154943, + "ewc_loss": 0.007484912872314453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.484912930522114e-05, + "grad_norm": 3.8625168800354004, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.873033881187439, + "num_tokens": 330790230.0, + 
"step": 8673 + }, + { + "epoch": 1.1034219564940848, + "ewc_loss": 0.0075804186053574085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.580418605357409e-05, + "grad_norm": 3.737727403640747, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8809834718704224, + "num_tokens": 330826178.0, + "step": 8674 + }, + { + "epoch": 1.1035491667726753, + "ewc_loss": 0.007453559897840023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.453560101566836e-05, + "grad_norm": 3.688293933868408, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.877412736415863, + "num_tokens": 330866341.0, + "step": 8675 + }, + { + "epoch": 1.1036763770512656, + "ewc_loss": 0.007482689339667559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.48268939787522e-05, + "grad_norm": 3.7615554332733154, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8586583137512207, + "num_tokens": 330903955.0, + "step": 8676 + }, + { + "epoch": 1.1038035873298562, + "ewc_loss": 0.007548894267529249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.548894063802436e-05, + "grad_norm": 3.71460223197937, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8685027956962585, + "num_tokens": 330943024.0, + "step": 8677 + }, + { + "epoch": 1.1039307976084467, + "ewc_loss": 0.007468067109584808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.468066905857995e-05, + "grad_norm": 3.697726249694824, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8666822910308838, + "num_tokens": 330981004.0, + "step": 8678 + }, + { + "epoch": 1.1040580078870372, + "ewc_loss": 0.007497528102248907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.497527985833585e-05, + "grad_norm": 3.7189276218414307, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8719772100448608, + "num_tokens": 331021532.0, + "step": 8679 + }, + { + "epoch": 1.1041852181656278, + "ewc_loss": 0.007511774078011513, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.511774310842156e-05, + "grad_norm": 3.7990882396698, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8794454336166382, + "num_tokens": 331058010.0, + "step": 8680 + }, + { + "epoch": 1.1043124284442183, + "ewc_loss": 0.007532995194196701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.532995368819684e-05, + "grad_norm": 3.751667022705078, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8812587857246399, + "num_tokens": 331090713.0, + "step": 8681 + }, + { + "epoch": 1.1044396387228088, + "ewc_loss": 0.007487253751605749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.487253606086597e-05, + "grad_norm": 3.7266745567321777, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8776940107345581, + "num_tokens": 331127532.0, + "step": 8682 + }, + { + "epoch": 1.1045668490013993, + "ewc_loss": 0.007484725210815668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.484725210815668e-05, + "grad_norm": 3.7113037109375, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8531489372253418, + "num_tokens": 331172088.0, + "step": 8683 + }, + { + "epoch": 1.1046940592799899, + "ewc_loss": 0.00750902621075511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.50902618165128e-05, + "grad_norm": 3.7393815517425537, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8670897483825684, + "num_tokens": 331213419.0, + "step": 8684 + }, + { + "epoch": 1.1048212695585804, + "ewc_loss": 0.00751820532605052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.518205529777333e-05, + "grad_norm": 3.7449378967285156, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8713997602462769, + "num_tokens": 331250102.0, + "step": 8685 + }, + { + "epoch": 1.104948479837171, + "ewc_loss": 0.0075197299011051655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.519729842897505e-05, + "grad_norm": 3.7832424640655518, + "learning_rate": 1e-06, + "loss": 0.3839, + 
"mean_token_accuracy": 0.8719154596328735, + "num_tokens": 331284597.0, + "step": 8686 + }, + { + "epoch": 1.1050756901157615, + "ewc_loss": 0.0075523764826357365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.552376337116584e-05, + "grad_norm": 3.668307065963745, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8634151220321655, + "num_tokens": 331330688.0, + "step": 8687 + }, + { + "epoch": 1.1052029003943518, + "ewc_loss": 0.007470933720469475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.470933633157983e-05, + "grad_norm": 3.715874671936035, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8733166456222534, + "num_tokens": 331368777.0, + "step": 8688 + }, + { + "epoch": 1.1053301106729423, + "ewc_loss": 0.0075543890707194805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.554388866992667e-05, + "grad_norm": 3.666264772415161, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8760370016098022, + "num_tokens": 331414369.0, + "step": 8689 + }, + { + "epoch": 1.1054573209515328, + "ewc_loss": 0.007499299012124538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.499298953916878e-05, + "grad_norm": 3.7642815113067627, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8779687881469727, + "num_tokens": 331452788.0, + "step": 8690 + }, + { + "epoch": 1.1055845312301233, + "ewc_loss": 0.007572893984615803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57289380999282e-05, + "grad_norm": 3.71056866645813, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8616698980331421, + "num_tokens": 331489575.0, + "step": 8691 + }, + { + "epoch": 1.1057117415087139, + "ewc_loss": 0.00750506017357111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.505060057155788e-05, + "grad_norm": 3.6892542839050293, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8887954950332642, + "num_tokens": 331529129.0, + "step": 8692 + }, + { + "epoch": 
1.1058389517873044, + "ewc_loss": 0.007510847877711058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.510848081437871e-05, + "grad_norm": 3.750013828277588, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8665224313735962, + "num_tokens": 331564989.0, + "step": 8693 + }, + { + "epoch": 1.105966162065895, + "ewc_loss": 0.0075541394762694836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.554139301646501e-05, + "grad_norm": 3.7553093433380127, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8722530603408813, + "num_tokens": 331602561.0, + "step": 8694 + }, + { + "epoch": 1.1060933723444855, + "ewc_loss": 0.007537351455539465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.537351484643295e-05, + "grad_norm": 3.679326057434082, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8735631704330444, + "num_tokens": 331646902.0, + "step": 8695 + }, + { + "epoch": 1.106220582623076, + "ewc_loss": 0.007499344181269407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.499344064854085e-05, + "grad_norm": 3.775178909301758, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8707382678985596, + "num_tokens": 331682347.0, + "step": 8696 + }, + { + "epoch": 1.1063477929016665, + "ewc_loss": 0.007571270689368248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.571270543849096e-05, + "grad_norm": 3.7063069343566895, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8637608289718628, + "num_tokens": 331725813.0, + "step": 8697 + }, + { + "epoch": 1.106475003180257, + "ewc_loss": 0.0074851019307971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.485102105420083e-05, + "grad_norm": 3.6787924766540527, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8524488210678101, + "num_tokens": 331767856.0, + "step": 8698 + }, + { + "epoch": 1.1066022134588476, + "ewc_loss": 0.007519308477640152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.519308564951643e-05, + "grad_norm": 3.7417776584625244, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8792921900749207, + "num_tokens": 331804238.0, + "step": 8699 + }, + { + "epoch": 1.106729423737438, + "ewc_loss": 0.007540766149759293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.540766091551632e-05, + "grad_norm": 3.6887481212615967, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8717161417007446, + "num_tokens": 331839963.0, + "step": 8700 + }, + { + "epoch": 1.1068566340160284, + "ewc_loss": 0.007489196490496397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.489196286769584e-05, + "grad_norm": 3.675103187561035, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8754972815513611, + "num_tokens": 331881563.0, + "step": 8701 + }, + { + "epoch": 1.106983844294619, + "ewc_loss": 0.0074989586137235165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.498958439100534e-05, + "grad_norm": 3.685771942138672, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8683052062988281, + "num_tokens": 331925540.0, + "step": 8702 + }, + { + "epoch": 1.1071110545732095, + "ewc_loss": 0.007508428767323494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.508428825531155e-05, + "grad_norm": 3.7653450965881348, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8798918724060059, + "num_tokens": 331963171.0, + "step": 8703 + }, + { + "epoch": 1.1072382648518, + "ewc_loss": 0.007521515246480703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.521515362896025e-05, + "grad_norm": 3.7227797508239746, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.891282856464386, + "num_tokens": 331998310.0, + "step": 8704 + }, + { + "epoch": 1.1073654751303905, + "ewc_loss": 0.007485212758183479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.485212699975818e-05, + "grad_norm": 3.718717575073242, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 
0.8775894641876221, + "num_tokens": 332034181.0, + "step": 8705 + }, + { + "epoch": 1.107492685408981, + "ewc_loss": 0.007478835992515087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.478836050722748e-05, + "grad_norm": 3.6756749153137207, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8798946142196655, + "num_tokens": 332075368.0, + "step": 8706 + }, + { + "epoch": 1.1076198956875716, + "ewc_loss": 0.007453084457665682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.453084253938869e-05, + "grad_norm": 3.683087110519409, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8826086521148682, + "num_tokens": 332113281.0, + "step": 8707 + }, + { + "epoch": 1.107747105966162, + "ewc_loss": 0.0074694505892694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.469450792996213e-05, + "grad_norm": 3.7538673877716064, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8647427558898926, + "num_tokens": 332154206.0, + "step": 8708 + }, + { + "epoch": 1.1078743162447526, + "ewc_loss": 0.007490314543247223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.490314601454884e-05, + "grad_norm": 3.741258144378662, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8579428791999817, + "num_tokens": 332195409.0, + "step": 8709 + }, + { + "epoch": 1.1080015265233432, + "ewc_loss": 0.007447222247719765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.447222014889121e-05, + "grad_norm": 3.6695005893707275, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8801121711730957, + "num_tokens": 332233836.0, + "step": 8710 + }, + { + "epoch": 1.1081287368019337, + "ewc_loss": 0.007430399302393198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.430399273289368e-05, + "grad_norm": 3.6884524822235107, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8763664960861206, + "num_tokens": 332274167.0, + "step": 8711 + }, + { + "epoch": 1.108255947080524, + "ewc_loss": 
0.007464893162250519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.464893133146688e-05, + "grad_norm": 3.702016592025757, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8690381646156311, + "num_tokens": 332315856.0, + "step": 8712 + }, + { + "epoch": 1.1083831573591145, + "ewc_loss": 0.007463622838258743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.463622750947252e-05, + "grad_norm": 3.6441383361816406, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8713032007217407, + "num_tokens": 332366365.0, + "step": 8713 + }, + { + "epoch": 1.108510367637705, + "ewc_loss": 0.007424938026815653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.424937939504161e-05, + "grad_norm": 3.8022096157073975, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8695352077484131, + "num_tokens": 332400411.0, + "step": 8714 + }, + { + "epoch": 1.1086375779162956, + "ewc_loss": 0.007534376345574856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.534376345574856e-05, + "grad_norm": 3.683736562728882, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8751919269561768, + "num_tokens": 332441959.0, + "step": 8715 + }, + { + "epoch": 1.108764788194886, + "ewc_loss": 0.0073987566865980625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.398756861221045e-05, + "grad_norm": 3.777045726776123, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8589562177658081, + "num_tokens": 332479545.0, + "step": 8716 + }, + { + "epoch": 1.1088919984734766, + "ewc_loss": 0.007498481310904026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.498481136281043e-05, + "grad_norm": 3.7223124504089355, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.882521390914917, + "num_tokens": 332516240.0, + "step": 8717 + }, + { + "epoch": 1.1090192087520672, + "ewc_loss": 0.007427265401929617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.427265518344939e-05, + "grad_norm": 
3.694977283477783, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.877394437789917, + "num_tokens": 332553488.0, + "step": 8718 + }, + { + "epoch": 1.1091464190306577, + "ewc_loss": 0.007435825653374195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.435825682478026e-05, + "grad_norm": 3.753448486328125, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8619139194488525, + "num_tokens": 332588646.0, + "step": 8719 + }, + { + "epoch": 1.1092736293092482, + "ewc_loss": 0.007480123080313206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.480123167624697e-05, + "grad_norm": 3.7169156074523926, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.862350583076477, + "num_tokens": 332629103.0, + "step": 8720 + }, + { + "epoch": 1.1094008395878387, + "ewc_loss": 0.007452125195413828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.452125282725319e-05, + "grad_norm": 3.7095746994018555, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8773868680000305, + "num_tokens": 332664820.0, + "step": 8721 + }, + { + "epoch": 1.1095280498664293, + "ewc_loss": 0.007466446608304977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.466446550097317e-05, + "grad_norm": 3.6972341537475586, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8741553425788879, + "num_tokens": 332699772.0, + "step": 8722 + }, + { + "epoch": 1.1096552601450198, + "ewc_loss": 0.007461042609065771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.461042696377262e-05, + "grad_norm": 3.7106130123138428, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8802586197853088, + "num_tokens": 332733478.0, + "step": 8723 + }, + { + "epoch": 1.1097824704236103, + "ewc_loss": 0.007496330421417952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.49633036321029e-05, + "grad_norm": 3.647592306137085, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8692066073417664, + 
"num_tokens": 332774733.0, + "step": 8724 + }, + { + "epoch": 1.1099096807022006, + "ewc_loss": 0.00746167404577136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.461674249498174e-05, + "grad_norm": 3.7590622901916504, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8485008478164673, + "num_tokens": 332812970.0, + "step": 8725 + }, + { + "epoch": 1.1100368909807912, + "ewc_loss": 0.007553713861852884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.553713658126071e-05, + "grad_norm": 3.690181016921997, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8675414323806763, + "num_tokens": 332854974.0, + "step": 8726 + }, + { + "epoch": 1.1101641012593817, + "ewc_loss": 0.007482358254492283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.482358341803774e-05, + "grad_norm": 3.7934889793395996, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8582930564880371, + "num_tokens": 332886462.0, + "step": 8727 + }, + { + "epoch": 1.1102913115379722, + "ewc_loss": 0.007571719586849213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.571719470433891e-05, + "grad_norm": 3.6736621856689453, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8676736950874329, + "num_tokens": 332928614.0, + "step": 8728 + }, + { + "epoch": 1.1104185218165628, + "ewc_loss": 0.007474845740944147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.47484591556713e-05, + "grad_norm": 3.7797837257385254, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8737356662750244, + "num_tokens": 332962447.0, + "step": 8729 + }, + { + "epoch": 1.1105457320951533, + "ewc_loss": 0.007580372970551252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.580372766824439e-05, + "grad_norm": 3.8196706771850586, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8640732169151306, + "num_tokens": 332991395.0, + "step": 8730 + }, + { + "epoch": 1.1106729423737438, + "ewc_loss": 0.0075677926652133465, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.567792636109516e-05, + "grad_norm": 3.7278409004211426, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8653020262718201, + "num_tokens": 333027852.0, + "step": 8731 + }, + { + "epoch": 1.1108001526523343, + "ewc_loss": 0.007509429939091206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.509429997298867e-05, + "grad_norm": 3.689850091934204, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8746674060821533, + "num_tokens": 333063337.0, + "step": 8732 + }, + { + "epoch": 1.1109273629309249, + "ewc_loss": 0.007538885343819857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.538885256508365e-05, + "grad_norm": 3.777942180633545, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8641383647918701, + "num_tokens": 333094632.0, + "step": 8733 + }, + { + "epoch": 1.1110545732095154, + "ewc_loss": 0.00762167340144515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.621673285029829e-05, + "grad_norm": 3.6796703338623047, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8844164609909058, + "num_tokens": 333134071.0, + "step": 8734 + }, + { + "epoch": 1.111181783488106, + "ewc_loss": 0.007528367917984724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.528367859777063e-05, + "grad_norm": 3.6900880336761475, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8776583671569824, + "num_tokens": 333171288.0, + "step": 8735 + }, + { + "epoch": 1.1113089937666965, + "ewc_loss": 0.007586473599076271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.586473657283932e-05, + "grad_norm": 3.704864978790283, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8786895275115967, + "num_tokens": 333212111.0, + "step": 8736 + }, + { + "epoch": 1.1114362040452868, + "ewc_loss": 0.007583035156130791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.583035039715469e-05, + "grad_norm": 3.7407402992248535, + 
"learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8787175416946411, + "num_tokens": 333247557.0, + "step": 8737 + }, + { + "epoch": 1.1115634143238773, + "ewc_loss": 0.007609892170876265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.609892054460943e-05, + "grad_norm": 3.802124500274658, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8703285455703735, + "num_tokens": 333278705.0, + "step": 8738 + }, + { + "epoch": 1.1116906246024678, + "ewc_loss": 0.007626890204846859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.626890146639198e-05, + "grad_norm": 3.735830545425415, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8732583522796631, + "num_tokens": 333316493.0, + "step": 8739 + }, + { + "epoch": 1.1118178348810583, + "ewc_loss": 0.007566036190837622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566036219941452e-05, + "grad_norm": 3.6480531692504883, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8748750686645508, + "num_tokens": 333360384.0, + "step": 8740 + }, + { + "epoch": 1.1119450451596489, + "ewc_loss": 0.007537188474088907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.537188503192738e-05, + "grad_norm": 3.743436574935913, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8738564848899841, + "num_tokens": 333395568.0, + "step": 8741 + }, + { + "epoch": 1.1120722554382394, + "ewc_loss": 0.007624803576618433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62480340199545e-05, + "grad_norm": 3.800248861312866, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8764867782592773, + "num_tokens": 333429267.0, + "step": 8742 + }, + { + "epoch": 1.11219946571683, + "ewc_loss": 0.00762633141130209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.626331353094429e-05, + "grad_norm": 3.6812503337860107, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8769423961639404, + "num_tokens": 333469908.0, + "step": 
8743 + }, + { + "epoch": 1.1123266759954205, + "ewc_loss": 0.007521705236285925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.521705265389755e-05, + "grad_norm": 3.70900297164917, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8751041293144226, + "num_tokens": 333507865.0, + "step": 8744 + }, + { + "epoch": 1.112453886274011, + "ewc_loss": 0.007591590750962496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.591590838273987e-05, + "grad_norm": 3.7394227981567383, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8824771642684937, + "num_tokens": 333544011.0, + "step": 8745 + }, + { + "epoch": 1.1125810965526015, + "ewc_loss": 0.007593194488435984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.593194459332153e-05, + "grad_norm": 3.7253215312957764, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8741679191589355, + "num_tokens": 333581949.0, + "step": 8746 + }, + { + "epoch": 1.112708306831192, + "ewc_loss": 0.007561683189123869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.561683014500886e-05, + "grad_norm": 3.7209458351135254, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8802827596664429, + "num_tokens": 333615994.0, + "step": 8747 + }, + { + "epoch": 1.1128355171097826, + "ewc_loss": 0.007597191259264946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.597191142849624e-05, + "grad_norm": 3.739515542984009, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8877835273742676, + "num_tokens": 333654454.0, + "step": 8748 + }, + { + "epoch": 1.112962727388373, + "ewc_loss": 0.0075873578898608685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.587357686134055e-05, + "grad_norm": 3.6551153659820557, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8648077845573425, + "num_tokens": 333700206.0, + "step": 8749 + }, + { + "epoch": 1.1130899376669634, + "ewc_loss": 0.007519436068832874, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.519435894209892e-05, + "grad_norm": 3.7217814922332764, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8760610818862915, + "num_tokens": 333737538.0, + "step": 8750 + }, + { + "epoch": 1.113217147945554, + "ewc_loss": 0.007607751991599798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.607752195326611e-05, + "grad_norm": 3.7535629272460938, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8745633363723755, + "num_tokens": 333771237.0, + "step": 8751 + }, + { + "epoch": 1.1133443582241445, + "ewc_loss": 0.007594828028231859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.594827911816537e-05, + "grad_norm": 3.741672992706299, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8676068782806396, + "num_tokens": 333806522.0, + "step": 8752 + }, + { + "epoch": 1.113471568502735, + "ewc_loss": 0.007581376004964113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.581376121379435e-05, + "grad_norm": 3.674623489379883, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8720831871032715, + "num_tokens": 333846656.0, + "step": 8753 + }, + { + "epoch": 1.1135987787813255, + "ewc_loss": 0.007547892164438963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.547892164438963e-05, + "grad_norm": 3.7111687660217285, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8644310832023621, + "num_tokens": 333886294.0, + "step": 8754 + }, + { + "epoch": 1.113725989059916, + "ewc_loss": 0.007597833406180143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.597833609906957e-05, + "grad_norm": 3.7175278663635254, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8687583208084106, + "num_tokens": 333923868.0, + "step": 8755 + }, + { + "epoch": 1.1138531993385066, + "ewc_loss": 0.007580933161079884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.580933015560731e-05, + "grad_norm": 3.7551844120025635, + "learning_rate": 1e-06, + "loss": 
0.3685, + "mean_token_accuracy": 0.8746799230575562, + "num_tokens": 333957187.0, + "step": 8756 + }, + { + "epoch": 1.113980409617097, + "ewc_loss": 0.007597504649311304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.597504736622795e-05, + "grad_norm": 3.684619188308716, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8850147128105164, + "num_tokens": 333999061.0, + "step": 8757 + }, + { + "epoch": 1.1141076198956876, + "ewc_loss": 0.0075422124937176704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.542212551925331e-05, + "grad_norm": 3.677985191345215, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8660115003585815, + "num_tokens": 334043595.0, + "step": 8758 + }, + { + "epoch": 1.1142348301742782, + "ewc_loss": 0.007566624321043491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566624117316678e-05, + "grad_norm": 3.7277989387512207, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8770168423652649, + "num_tokens": 334078487.0, + "step": 8759 + }, + { + "epoch": 1.1143620404528687, + "ewc_loss": 0.0075911711901426315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.591171015519649e-05, + "grad_norm": 3.681471586227417, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8800355792045593, + "num_tokens": 334119610.0, + "step": 8760 + }, + { + "epoch": 1.114489250731459, + "ewc_loss": 0.0075477357022464275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.547735731350258e-05, + "grad_norm": 3.7099151611328125, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8685675859451294, + "num_tokens": 334160149.0, + "step": 8761 + }, + { + "epoch": 1.1146164610100495, + "ewc_loss": 0.007588965818285942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58896567276679e-05, + "grad_norm": 3.7208900451660156, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8728196620941162, + "num_tokens": 334198667.0, + "step": 8762 + }, + { + "epoch": 
1.11474367128864, + "ewc_loss": 0.007579050026834011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57904999773018e-05, + "grad_norm": 3.7964584827423096, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8744144439697266, + "num_tokens": 334233838.0, + "step": 8763 + }, + { + "epoch": 1.1148708815672306, + "ewc_loss": 0.0075988988392055035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.598898810101673e-05, + "grad_norm": 3.6900148391723633, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8655366897583008, + "num_tokens": 334275703.0, + "step": 8764 + }, + { + "epoch": 1.114998091845821, + "ewc_loss": 0.007508100476115942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.508100679842755e-05, + "grad_norm": 3.7328004837036133, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8834112286567688, + "num_tokens": 334311119.0, + "step": 8765 + }, + { + "epoch": 1.1151253021244116, + "ewc_loss": 0.0075752343982458115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.575234485557303e-05, + "grad_norm": 3.7838566303253174, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8644277453422546, + "num_tokens": 334348298.0, + "step": 8766 + }, + { + "epoch": 1.1152525124030022, + "ewc_loss": 0.0075854090973734856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.585409184684977e-05, + "grad_norm": 3.728517770767212, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8607962131500244, + "num_tokens": 334386986.0, + "step": 8767 + }, + { + "epoch": 1.1153797226815927, + "ewc_loss": 0.007521336432546377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.521336374338716e-05, + "grad_norm": 3.729628562927246, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8766813278198242, + "num_tokens": 334423263.0, + "step": 8768 + }, + { + "epoch": 1.1155069329601832, + "ewc_loss": 0.007542981766164303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.54298162064515e-05, + "grad_norm": 3.697268009185791, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8880325555801392, + "num_tokens": 334460363.0, + "step": 8769 + }, + { + "epoch": 1.1156341432387737, + "ewc_loss": 0.007518861908465624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.518861821154132e-05, + "grad_norm": 3.712725877761841, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8758947849273682, + "num_tokens": 334501883.0, + "step": 8770 + }, + { + "epoch": 1.1157613535173643, + "ewc_loss": 0.007539374753832817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.5393749284558e-05, + "grad_norm": 3.790163516998291, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8721206188201904, + "num_tokens": 334536242.0, + "step": 8771 + }, + { + "epoch": 1.1158885637959548, + "ewc_loss": 0.007572621572762728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57262168917805e-05, + "grad_norm": 3.7535665035247803, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8828217387199402, + "num_tokens": 334569685.0, + "step": 8772 + }, + { + "epoch": 1.1160157740745453, + "ewc_loss": 0.007518238853663206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.518238999182358e-05, + "grad_norm": 3.7661709785461426, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8732579946517944, + "num_tokens": 334604946.0, + "step": 8773 + }, + { + "epoch": 1.1161429843531356, + "ewc_loss": 0.007549575995653868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.549575821030885e-05, + "grad_norm": 3.7016971111297607, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8785330057144165, + "num_tokens": 334645495.0, + "step": 8774 + }, + { + "epoch": 1.1162701946317262, + "ewc_loss": 0.007498577702790499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.498577906517312e-05, + "grad_norm": 3.7568511962890625, + "learning_rate": 1e-06, + "loss": 0.391, + 
"mean_token_accuracy": 0.867275059223175, + "num_tokens": 334684239.0, + "step": 8775 + }, + { + "epoch": 1.1163974049103167, + "ewc_loss": 0.007579423952847719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57942398195155e-05, + "grad_norm": 3.8347055912017822, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8783475160598755, + "num_tokens": 334713799.0, + "step": 8776 + }, + { + "epoch": 1.1165246151889072, + "ewc_loss": 0.00759090855717659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.590908353449777e-05, + "grad_norm": 3.6444852352142334, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8921332955360413, + "num_tokens": 334753374.0, + "step": 8777 + }, + { + "epoch": 1.1166518254674977, + "ewc_loss": 0.007469926495105028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.46992664062418e-05, + "grad_norm": 3.7062630653381348, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8743947148323059, + "num_tokens": 334792867.0, + "step": 8778 + }, + { + "epoch": 1.1167790357460883, + "ewc_loss": 0.00756970327347517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.569703302579e-05, + "grad_norm": 3.73221755027771, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8694530725479126, + "num_tokens": 334832664.0, + "step": 8779 + }, + { + "epoch": 1.1169062460246788, + "ewc_loss": 0.007562305312603712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.562305108876899e-05, + "grad_norm": 3.723695993423462, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8650506734848022, + "num_tokens": 334876763.0, + "step": 8780 + }, + { + "epoch": 1.1170334563032693, + "ewc_loss": 0.007533019408583641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.53301937947981e-05, + "grad_norm": 3.8138790130615234, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8736562728881836, + "num_tokens": 334909431.0, + "step": 8781 + }, + { + "epoch": 1.1171606665818599, 
+ "ewc_loss": 0.007611381355673075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.611381442984566e-05, + "grad_norm": 3.8267083168029785, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8717635273933411, + "num_tokens": 334941939.0, + "step": 8782 + }, + { + "epoch": 1.1172878768604504, + "ewc_loss": 0.007566370069980621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566370186395943e-05, + "grad_norm": 3.6978652477264404, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8812105655670166, + "num_tokens": 334981338.0, + "step": 8783 + }, + { + "epoch": 1.117415087139041, + "ewc_loss": 0.007506838999688625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.506839028792456e-05, + "grad_norm": 3.7137975692749023, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8725932836532593, + "num_tokens": 335020660.0, + "step": 8784 + }, + { + "epoch": 1.1175422974176314, + "ewc_loss": 0.007550857029855251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.550857117166743e-05, + "grad_norm": 3.7375285625457764, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8749579191207886, + "num_tokens": 335057377.0, + "step": 8785 + }, + { + "epoch": 1.1176695076962218, + "ewc_loss": 0.0075607094913721085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.560709491372108e-05, + "grad_norm": 3.7666237354278564, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8626880049705505, + "num_tokens": 335092912.0, + "step": 8786 + }, + { + "epoch": 1.1177967179748123, + "ewc_loss": 0.00757561856880784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.575618656119332e-05, + "grad_norm": 3.7553999423980713, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8876215219497681, + "num_tokens": 335129157.0, + "step": 8787 + }, + { + "epoch": 1.1179239282534028, + "ewc_loss": 0.007545338943600655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.545339030912146e-05, + 
"grad_norm": 3.7042651176452637, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.873258113861084, + "num_tokens": 335168065.0, + "step": 8788 + }, + { + "epoch": 1.1180511385319933, + "ewc_loss": 0.00752997724339366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52997730160132e-05, + "grad_norm": 3.6693975925445557, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8723306655883789, + "num_tokens": 335218876.0, + "step": 8789 + }, + { + "epoch": 1.1181783488105839, + "ewc_loss": 0.007516087498515844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.516087498515844e-05, + "grad_norm": 3.6698904037475586, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8831086158752441, + "num_tokens": 335256300.0, + "step": 8790 + }, + { + "epoch": 1.1183055590891744, + "ewc_loss": 0.007543955929577351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.54395587136969e-05, + "grad_norm": 3.7474679946899414, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8836652040481567, + "num_tokens": 335291035.0, + "step": 8791 + }, + { + "epoch": 1.118432769367765, + "ewc_loss": 0.007566206622868776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566206477349624e-05, + "grad_norm": 3.7055816650390625, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8838256597518921, + "num_tokens": 335330092.0, + "step": 8792 + }, + { + "epoch": 1.1185599796463555, + "ewc_loss": 0.007506862282752991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.506862311856821e-05, + "grad_norm": 3.803656816482544, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8594209551811218, + "num_tokens": 335367146.0, + "step": 8793 + }, + { + "epoch": 1.118687189924946, + "ewc_loss": 0.007590885739773512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.590885797981173e-05, + "grad_norm": 3.7152676582336426, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8733962774276733, + 
"num_tokens": 335405069.0, + "step": 8794 + }, + { + "epoch": 1.1188144002035365, + "ewc_loss": 0.007500750012695789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.500749779865146e-05, + "grad_norm": 3.719987154006958, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.862951934337616, + "num_tokens": 335443779.0, + "step": 8795 + }, + { + "epoch": 1.118941610482127, + "ewc_loss": 0.0075413924641907215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.541392551502213e-05, + "grad_norm": 3.7154457569122314, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8663269281387329, + "num_tokens": 335485978.0, + "step": 8796 + }, + { + "epoch": 1.1190688207607176, + "ewc_loss": 0.007532857824116945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.532857853220776e-05, + "grad_norm": 3.6728858947753906, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8767191171646118, + "num_tokens": 335530375.0, + "step": 8797 + }, + { + "epoch": 1.119196031039308, + "ewc_loss": 0.007500828243792057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.500828360207379e-05, + "grad_norm": 3.739074230194092, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.878432035446167, + "num_tokens": 335569109.0, + "step": 8798 + }, + { + "epoch": 1.1193232413178984, + "ewc_loss": 0.00755112711340189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.551127055194229e-05, + "grad_norm": 3.835562229156494, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8600588440895081, + "num_tokens": 335599283.0, + "step": 8799 + }, + { + "epoch": 1.119450451596489, + "ewc_loss": 0.007583640515804291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.583640399388969e-05, + "grad_norm": 3.744701385498047, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8790287971496582, + "num_tokens": 335634792.0, + "step": 8800 + }, + { + "epoch": 1.1195776618750795, + "ewc_loss": 0.007498202379792929, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.49820246710442e-05, + "grad_norm": 3.734036684036255, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8852938413619995, + "num_tokens": 335672869.0, + "step": 8801 + }, + { + "epoch": 1.11970487215367, + "ewc_loss": 0.007534870877861977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.53487111069262e-05, + "grad_norm": 3.7226979732513428, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8789071440696716, + "num_tokens": 335714996.0, + "step": 8802 + }, + { + "epoch": 1.1198320824322605, + "ewc_loss": 0.007525422610342503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.525422552134842e-05, + "grad_norm": 3.704678535461426, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8648679256439209, + "num_tokens": 335756884.0, + "step": 8803 + }, + { + "epoch": 1.119959292710851, + "ewc_loss": 0.007514516822993755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.514516619266942e-05, + "grad_norm": 3.699026107788086, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8823795318603516, + "num_tokens": 335793935.0, + "step": 8804 + }, + { + "epoch": 1.1200865029894416, + "ewc_loss": 0.007526465691626072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.526465924456716e-05, + "grad_norm": 3.6720120906829834, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8843961954116821, + "num_tokens": 335834169.0, + "step": 8805 + }, + { + "epoch": 1.120213713268032, + "ewc_loss": 0.0075015113689005375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.501511572627351e-05, + "grad_norm": 3.715003490447998, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8909741640090942, + "num_tokens": 335872302.0, + "step": 8806 + }, + { + "epoch": 1.1203409235466226, + "ewc_loss": 0.007529626600444317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.529626600444317e-05, + "grad_norm": 3.740525484085083, + "learning_rate": 
1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.872295618057251, + "num_tokens": 335908074.0, + "step": 8807 + }, + { + "epoch": 1.1204681338252132, + "ewc_loss": 0.007538991514593363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.538991485489532e-05, + "grad_norm": 3.7036871910095215, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8851333856582642, + "num_tokens": 335946268.0, + "step": 8808 + }, + { + "epoch": 1.1205953441038037, + "ewc_loss": 0.007495774421840906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.495774480048567e-05, + "grad_norm": 3.728055715560913, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8738415241241455, + "num_tokens": 335985364.0, + "step": 8809 + }, + { + "epoch": 1.120722554382394, + "ewc_loss": 0.0075285350903868675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52853520680219e-05, + "grad_norm": 3.729008197784424, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8771517276763916, + "num_tokens": 336025300.0, + "step": 8810 + }, + { + "epoch": 1.1208497646609845, + "ewc_loss": 0.007518122438341379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.518122583860531e-05, + "grad_norm": 3.7229321002960205, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8767951130867004, + "num_tokens": 336062105.0, + "step": 8811 + }, + { + "epoch": 1.120976974939575, + "ewc_loss": 0.007500186562538147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.500186620745808e-05, + "grad_norm": 3.6984739303588867, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.864006519317627, + "num_tokens": 336102944.0, + "step": 8812 + }, + { + "epoch": 1.1211041852181656, + "ewc_loss": 0.007504961919039488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.504961831727996e-05, + "grad_norm": 3.662809371948242, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8974229097366333, + "num_tokens": 336142964.0, + "step": 8813 + }, + { 
+ "epoch": 1.121231395496756, + "ewc_loss": 0.00748024694621563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.480246858904138e-05, + "grad_norm": 3.673334836959839, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8769916296005249, + "num_tokens": 336186962.0, + "step": 8814 + }, + { + "epoch": 1.1213586057753466, + "ewc_loss": 0.007503677625209093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.503677625209093e-05, + "grad_norm": 3.740931272506714, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8823838829994202, + "num_tokens": 336220775.0, + "step": 8815 + }, + { + "epoch": 1.1214858160539372, + "ewc_loss": 0.0075264559127390385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.526455738116056e-05, + "grad_norm": 3.7008023262023926, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.870346188545227, + "num_tokens": 336263067.0, + "step": 8816 + }, + { + "epoch": 1.1216130263325277, + "ewc_loss": 0.00747953075915575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.479530904674903e-05, + "grad_norm": 3.6907219886779785, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8767620325088501, + "num_tokens": 336308463.0, + "step": 8817 + }, + { + "epoch": 1.1217402366111182, + "ewc_loss": 0.007498505990952253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.498505874536932e-05, + "grad_norm": 3.7098746299743652, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8865564465522766, + "num_tokens": 336346965.0, + "step": 8818 + }, + { + "epoch": 1.1218674468897087, + "ewc_loss": 0.00748397083953023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.483970694011077e-05, + "grad_norm": 3.7570347785949707, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8754311800003052, + "num_tokens": 336382114.0, + "step": 8819 + }, + { + "epoch": 1.1219946571682993, + "ewc_loss": 0.0075164176523685455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.516417826991528e-05, + "grad_norm": 3.7646143436431885, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8772127628326416, + "num_tokens": 336414020.0, + "step": 8820 + }, + { + "epoch": 1.1221218674468898, + "ewc_loss": 0.00751134380698204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.511343574151397e-05, + "grad_norm": 3.7375354766845703, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8714848756790161, + "num_tokens": 336452522.0, + "step": 8821 + }, + { + "epoch": 1.1222490777254803, + "ewc_loss": 0.007493364159017801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.493363955290988e-05, + "grad_norm": 3.761817216873169, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8833187818527222, + "num_tokens": 336484593.0, + "step": 8822 + }, + { + "epoch": 1.1223762880040706, + "ewc_loss": 0.0075210873037576675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.521087536588311e-05, + "grad_norm": 3.831447124481201, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8584380149841309, + "num_tokens": 336518964.0, + "step": 8823 + }, + { + "epoch": 1.1225034982826612, + "ewc_loss": 0.007559692952781916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.559693040093407e-05, + "grad_norm": 3.74499773979187, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8612984418869019, + "num_tokens": 336556320.0, + "step": 8824 + }, + { + "epoch": 1.1226307085612517, + "ewc_loss": 0.007491276599466801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.491276483051479e-05, + "grad_norm": 3.7028748989105225, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8636376857757568, + "num_tokens": 336595473.0, + "step": 8825 + }, + { + "epoch": 1.1227579188398422, + "ewc_loss": 0.007497226819396019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.497226761188358e-05, + "grad_norm": 3.7632663249969482, + "learning_rate": 1e-06, + "loss": 0.3693, + 
"mean_token_accuracy": 0.8735275268554688, + "num_tokens": 336631793.0, + "step": 8826 + }, + { + "epoch": 1.1228851291184327, + "ewc_loss": 0.0075670708902180195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.567070861114189e-05, + "grad_norm": 3.7636215686798096, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8598753809928894, + "num_tokens": 336671230.0, + "step": 8827 + }, + { + "epoch": 1.1230123393970233, + "ewc_loss": 0.007540276274085045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.540276419604197e-05, + "grad_norm": 3.688535451889038, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8730208277702332, + "num_tokens": 336712015.0, + "step": 8828 + }, + { + "epoch": 1.1231395496756138, + "ewc_loss": 0.0075045484118163586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.504548557335511e-05, + "grad_norm": 3.8565757274627686, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8589214086532593, + "num_tokens": 336741206.0, + "step": 8829 + }, + { + "epoch": 1.1232667599542043, + "ewc_loss": 0.007648150436580181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.648150494787842e-05, + "grad_norm": 3.733745813369751, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8664385080337524, + "num_tokens": 336778838.0, + "step": 8830 + }, + { + "epoch": 1.1233939702327949, + "ewc_loss": 0.007533050142228603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.53304993850179e-05, + "grad_norm": 3.696467399597168, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8796118497848511, + "num_tokens": 336815471.0, + "step": 8831 + }, + { + "epoch": 1.1235211805113854, + "ewc_loss": 0.007565765641629696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.565765554318205e-05, + "grad_norm": 3.72467303276062, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8762818574905396, + "num_tokens": 336850870.0, + "step": 8832 + }, + { + "epoch": 
1.123648390789976, + "ewc_loss": 0.007610544562339783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.610544707858935e-05, + "grad_norm": 3.7106194496154785, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8782558441162109, + "num_tokens": 336891779.0, + "step": 8833 + }, + { + "epoch": 1.1237756010685664, + "ewc_loss": 0.007572685834020376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.572685717605054e-05, + "grad_norm": 3.6695573329925537, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8840517997741699, + "num_tokens": 336931171.0, + "step": 8834 + }, + { + "epoch": 1.1239028113471567, + "ewc_loss": 0.0075643970631062984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564396946690977e-05, + "grad_norm": 3.7542057037353516, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8835680484771729, + "num_tokens": 336963080.0, + "step": 8835 + }, + { + "epoch": 1.1240300216257473, + "ewc_loss": 0.0076220110058784485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.622010889463127e-05, + "grad_norm": 3.752511739730835, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8611247539520264, + "num_tokens": 337000817.0, + "step": 8836 + }, + { + "epoch": 1.1241572319043378, + "ewc_loss": 0.007583917118608952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.583916885778308e-05, + "grad_norm": 3.7050349712371826, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8672324419021606, + "num_tokens": 337037249.0, + "step": 8837 + }, + { + "epoch": 1.1242844421829283, + "ewc_loss": 0.007563296239823103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.56329609430395e-05, + "grad_norm": 3.6721079349517822, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8812603950500488, + "num_tokens": 337077752.0, + "step": 8838 + }, + { + "epoch": 1.1244116524615189, + "ewc_loss": 0.007584563922137022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.584563718410209e-05, + "grad_norm": 3.721858263015747, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8698749542236328, + "num_tokens": 337119649.0, + "step": 8839 + }, + { + "epoch": 1.1245388627401094, + "ewc_loss": 0.007608724292367697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.608724263263866e-05, + "grad_norm": 3.809783935546875, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8576928973197937, + "num_tokens": 337154376.0, + "step": 8840 + }, + { + "epoch": 1.1246660730187, + "ewc_loss": 0.007646259386092424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.646259473403916e-05, + "grad_norm": 3.6678099632263184, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8826701641082764, + "num_tokens": 337193377.0, + "step": 8841 + }, + { + "epoch": 1.1247932832972904, + "ewc_loss": 0.007528526708483696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.528526475653052e-05, + "grad_norm": 3.7154417037963867, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8656889796257019, + "num_tokens": 337233176.0, + "step": 8842 + }, + { + "epoch": 1.124920493575881, + "ewc_loss": 0.0076058413833379745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.605841528857127e-05, + "grad_norm": 3.8282954692840576, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8581068515777588, + "num_tokens": 337262995.0, + "step": 8843 + }, + { + "epoch": 1.1250477038544715, + "ewc_loss": 0.007657565642148256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.657565583940595e-05, + "grad_norm": 3.7148048877716064, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8730784058570862, + "num_tokens": 337302466.0, + "step": 8844 + }, + { + "epoch": 1.125174914133062, + "ewc_loss": 0.00755335483700037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.553354953415692e-05, + "grad_norm": 3.6986918449401855, + "learning_rate": 1e-06, + "loss": 0.334, + 
"mean_token_accuracy": 0.8832827210426331, + "num_tokens": 337341197.0, + "step": 8845 + }, + { + "epoch": 1.1253021244116526, + "ewc_loss": 0.0075867390260100365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58673922973685e-05, + "grad_norm": 3.740650177001953, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8743090629577637, + "num_tokens": 337375602.0, + "step": 8846 + }, + { + "epoch": 1.125429334690243, + "ewc_loss": 0.007613298017531633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.613297930220142e-05, + "grad_norm": 3.811065435409546, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8633148670196533, + "num_tokens": 337406685.0, + "step": 8847 + }, + { + "epoch": 1.1255565449688334, + "ewc_loss": 0.007642453536391258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.642453419975936e-05, + "grad_norm": 3.679473400115967, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8681035041809082, + "num_tokens": 337447364.0, + "step": 8848 + }, + { + "epoch": 1.125683755247424, + "ewc_loss": 0.007543574552983046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.543574611190706e-05, + "grad_norm": 3.671420097351074, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8727284669876099, + "num_tokens": 337487840.0, + "step": 8849 + }, + { + "epoch": 1.1258109655260145, + "ewc_loss": 0.007599769160151482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.59976901463233e-05, + "grad_norm": 3.7255992889404297, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8765342235565186, + "num_tokens": 337525117.0, + "step": 8850 + }, + { + "epoch": 1.125938175804605, + "ewc_loss": 0.007621098309755325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.621098484378308e-05, + "grad_norm": 3.6864001750946045, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8664845824241638, + "num_tokens": 337567024.0, + "step": 8851 + }, + { + "epoch": 
1.1260653860831955, + "ewc_loss": 0.007561744190752506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.561744132544845e-05, + "grad_norm": 3.7731924057006836, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8714529275894165, + "num_tokens": 337600117.0, + "step": 8852 + }, + { + "epoch": 1.126192596361786, + "ewc_loss": 0.0076297977939248085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.629797619301826e-05, + "grad_norm": 3.741626024246216, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8593034148216248, + "num_tokens": 337637188.0, + "step": 8853 + }, + { + "epoch": 1.1263198066403766, + "ewc_loss": 0.007585725747048855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.585725688841194e-05, + "grad_norm": 3.701146364212036, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.868928849697113, + "num_tokens": 337676492.0, + "step": 8854 + }, + { + "epoch": 1.126447016918967, + "ewc_loss": 0.007565795909613371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.565796113340184e-05, + "grad_norm": 3.701712131500244, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8841181993484497, + "num_tokens": 337713468.0, + "step": 8855 + }, + { + "epoch": 1.1265742271975576, + "ewc_loss": 0.007585366256535053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.585366256535053e-05, + "grad_norm": 3.761876344680786, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8609532117843628, + "num_tokens": 337750944.0, + "step": 8856 + }, + { + "epoch": 1.1267014374761481, + "ewc_loss": 0.007617841009050608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.617841038154438e-05, + "grad_norm": 3.6727027893066406, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8874804973602295, + "num_tokens": 337793802.0, + "step": 8857 + }, + { + "epoch": 1.1268286477547387, + "ewc_loss": 0.007531930226832628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.531930168624967e-05, + "grad_norm": 3.7564077377319336, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8858308792114258, + "num_tokens": 337828550.0, + "step": 8858 + }, + { + "epoch": 1.126955858033329, + "ewc_loss": 0.007631421554833651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.631421613041312e-05, + "grad_norm": 3.782456398010254, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.882225751876831, + "num_tokens": 337862168.0, + "step": 8859 + }, + { + "epoch": 1.1270830683119195, + "ewc_loss": 0.007592173758894205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.592173642478883e-05, + "grad_norm": 3.7312545776367188, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8701298236846924, + "num_tokens": 337898837.0, + "step": 8860 + }, + { + "epoch": 1.12721027859051, + "ewc_loss": 0.007552191615104675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.552191527793184e-05, + "grad_norm": 3.7310101985931396, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8612583875656128, + "num_tokens": 337939608.0, + "step": 8861 + }, + { + "epoch": 1.1273374888691006, + "ewc_loss": 0.0075760819017887115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.576082134619355e-05, + "grad_norm": 3.6980438232421875, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8727601170539856, + "num_tokens": 337983412.0, + "step": 8862 + }, + { + "epoch": 1.127464699147691, + "ewc_loss": 0.007532866671681404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.532866584369913e-05, + "grad_norm": 3.7000958919525146, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8815274834632874, + "num_tokens": 338022171.0, + "step": 8863 + }, + { + "epoch": 1.1275919094262816, + "ewc_loss": 0.0075537594966590405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.55375949665904e-05, + "grad_norm": 3.7643518447875977, + "learning_rate": 1e-06, + "loss": 0.377, + 
"mean_token_accuracy": 0.8701343536376953, + "num_tokens": 338058838.0, + "step": 8864 + }, + { + "epoch": 1.1277191197048722, + "ewc_loss": 0.007560841739177704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.560841913800687e-05, + "grad_norm": 3.7503809928894043, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8712818622589111, + "num_tokens": 338095080.0, + "step": 8865 + }, + { + "epoch": 1.1278463299834627, + "ewc_loss": 0.00753373745828867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.53373751649633e-05, + "grad_norm": 3.7305524349212646, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8552715182304382, + "num_tokens": 338134206.0, + "step": 8866 + }, + { + "epoch": 1.1279735402620532, + "ewc_loss": 0.007542090490460396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.542090315837413e-05, + "grad_norm": 3.7179951667785645, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8807213306427002, + "num_tokens": 338173794.0, + "step": 8867 + }, + { + "epoch": 1.1281007505406437, + "ewc_loss": 0.007540958933532238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.540958904428408e-05, + "grad_norm": 3.7658798694610596, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8708764314651489, + "num_tokens": 338206591.0, + "step": 8868 + }, + { + "epoch": 1.1282279608192343, + "ewc_loss": 0.007586980238556862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.586980063933879e-05, + "grad_norm": 3.7956435680389404, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8844127655029297, + "num_tokens": 338238728.0, + "step": 8869 + }, + { + "epoch": 1.1283551710978248, + "ewc_loss": 0.007590387482196093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.590387394884601e-05, + "grad_norm": 3.6703925132751465, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8823735117912292, + "num_tokens": 338282372.0, + "step": 8870 + }, + { + "epoch": 
1.1284823813764153, + "ewc_loss": 0.007506547495722771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.506547262892127e-05, + "grad_norm": 3.7544162273406982, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8690887689590454, + "num_tokens": 338319175.0, + "step": 8871 + }, + { + "epoch": 1.1286095916550056, + "ewc_loss": 0.007604280021041632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.604280108353123e-05, + "grad_norm": 3.718881368637085, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8702048063278198, + "num_tokens": 338357967.0, + "step": 8872 + }, + { + "epoch": 1.1287368019335962, + "ewc_loss": 0.0075398702174425125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.539870421169326e-05, + "grad_norm": 3.794174909591675, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.880247175693512, + "num_tokens": 338391431.0, + "step": 8873 + }, + { + "epoch": 1.1288640122121867, + "ewc_loss": 0.007613167632371187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.613167690578848e-05, + "grad_norm": 3.7224535942077637, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8805265426635742, + "num_tokens": 338426298.0, + "step": 8874 + }, + { + "epoch": 1.1289912224907772, + "ewc_loss": 0.007546236738562584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.546236884081736e-05, + "grad_norm": 3.744786262512207, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8544166088104248, + "num_tokens": 338468147.0, + "step": 8875 + }, + { + "epoch": 1.1291184327693677, + "ewc_loss": 0.007586832158267498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.586832361994311e-05, + "grad_norm": 3.692934274673462, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8572912216186523, + "num_tokens": 338517095.0, + "step": 8876 + }, + { + "epoch": 1.1292456430479583, + "ewc_loss": 0.007545637898147106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.545638072770089e-05, + "grad_norm": 3.7428486347198486, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8791840672492981, + "num_tokens": 338549925.0, + "step": 8877 + }, + { + "epoch": 1.1293728533265488, + "ewc_loss": 0.00759925227612257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.599252421641722e-05, + "grad_norm": 3.7772104740142822, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8761513829231262, + "num_tokens": 338583966.0, + "step": 8878 + }, + { + "epoch": 1.1295000636051393, + "ewc_loss": 0.007595601491630077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.595601346110925e-05, + "grad_norm": 3.7040069103240967, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8774007558822632, + "num_tokens": 338622715.0, + "step": 8879 + }, + { + "epoch": 1.1296272738837299, + "ewc_loss": 0.007536385208368301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.536385237472132e-05, + "grad_norm": 3.7807514667510986, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8602324724197388, + "num_tokens": 338659920.0, + "step": 8880 + }, + { + "epoch": 1.1297544841623204, + "ewc_loss": 0.007632506545633078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.632506458321586e-05, + "grad_norm": 3.738018035888672, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8687798976898193, + "num_tokens": 338698183.0, + "step": 8881 + }, + { + "epoch": 1.129881694440911, + "ewc_loss": 0.007575754076242447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.575753988930956e-05, + "grad_norm": 3.7995498180389404, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8608025908470154, + "num_tokens": 338735853.0, + "step": 8882 + }, + { + "epoch": 1.1300089047195012, + "ewc_loss": 0.007612655404955149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.61265546316281e-05, + "grad_norm": 3.752769947052002, + "learning_rate": 1e-06, + "loss": 0.4004, + 
"mean_token_accuracy": 0.8639471530914307, + "num_tokens": 338771714.0, + "step": 8883 + }, + { + "epoch": 1.1301361149980917, + "ewc_loss": 0.00757600786164403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57600791985169e-05, + "grad_norm": 3.693624973297119, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8894098997116089, + "num_tokens": 338813823.0, + "step": 8884 + }, + { + "epoch": 1.1302633252766823, + "ewc_loss": 0.007555135991424322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.555136107839644e-05, + "grad_norm": 3.68102765083313, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8761518001556396, + "num_tokens": 338858137.0, + "step": 8885 + }, + { + "epoch": 1.1303905355552728, + "ewc_loss": 0.007570135407149792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.570135494461283e-05, + "grad_norm": 3.7751545906066895, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8637984991073608, + "num_tokens": 338893653.0, + "step": 8886 + }, + { + "epoch": 1.1305177458338633, + "ewc_loss": 0.007631869986653328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.631869812030345e-05, + "grad_norm": 3.7804532051086426, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8542701005935669, + "num_tokens": 338931230.0, + "step": 8887 + }, + { + "epoch": 1.1306449561124539, + "ewc_loss": 0.007575503084808588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.575502968393266e-05, + "grad_norm": 3.7516427040100098, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8728762865066528, + "num_tokens": 338966921.0, + "step": 8888 + }, + { + "epoch": 1.1307721663910444, + "ewc_loss": 0.007584628649055958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.584628474432975e-05, + "grad_norm": 3.8628909587860107, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8548946380615234, + "num_tokens": 339000957.0, + "step": 8889 + }, + { + "epoch": 
1.130899376669635, + "ewc_loss": 0.007672874256968498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.672874198760837e-05, + "grad_norm": 3.7489662170410156, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8655681610107422, + "num_tokens": 339035876.0, + "step": 8890 + }, + { + "epoch": 1.1310265869482254, + "ewc_loss": 0.007570426445454359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57042653276585e-05, + "grad_norm": 3.67331600189209, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8750863075256348, + "num_tokens": 339073700.0, + "step": 8891 + }, + { + "epoch": 1.131153797226816, + "ewc_loss": 0.007587196305394173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58719615987502e-05, + "grad_norm": 3.7750771045684814, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8785358667373657, + "num_tokens": 339108222.0, + "step": 8892 + }, + { + "epoch": 1.1312810075054065, + "ewc_loss": 0.007683835923671722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.683836156502366e-05, + "grad_norm": 3.7150344848632812, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8828999400138855, + "num_tokens": 339145749.0, + "step": 8893 + }, + { + "epoch": 1.131408217783997, + "ewc_loss": 0.007603805046528578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.603804988320917e-05, + "grad_norm": 3.7600345611572266, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8605018854141235, + "num_tokens": 339183852.0, + "step": 8894 + }, + { + "epoch": 1.1315354280625876, + "ewc_loss": 0.007682909723371267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.68290992709808e-05, + "grad_norm": 3.6944427490234375, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8865604400634766, + "num_tokens": 339218766.0, + "step": 8895 + }, + { + "epoch": 1.131662638341178, + "ewc_loss": 0.007617019582539797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.617019582539797e-05, + "grad_norm": 3.6924009323120117, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8743900656700134, + "num_tokens": 339259932.0, + "step": 8896 + }, + { + "epoch": 1.1317898486197684, + "ewc_loss": 0.007645299192517996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.645299046998844e-05, + "grad_norm": 3.685617685317993, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8759409189224243, + "num_tokens": 339302201.0, + "step": 8897 + }, + { + "epoch": 1.131917058898359, + "ewc_loss": 0.007629136089235544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.629136234754696e-05, + "grad_norm": 3.7753729820251465, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8448991775512695, + "num_tokens": 339342091.0, + "step": 8898 + }, + { + "epoch": 1.1320442691769494, + "ewc_loss": 0.007694725412875414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.694725354667753e-05, + "grad_norm": 3.7675139904022217, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8669912815093994, + "num_tokens": 339379951.0, + "step": 8899 + }, + { + "epoch": 1.13217147945554, + "ewc_loss": 0.007637214846909046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.637214730493724e-05, + "grad_norm": 3.6939921379089355, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8677088022232056, + "num_tokens": 339422243.0, + "step": 8900 + }, + { + "epoch": 1.1322986897341305, + "ewc_loss": 0.007589183747768402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.589183951495215e-05, + "grad_norm": 3.682126760482788, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8857746124267578, + "num_tokens": 339464969.0, + "step": 8901 + }, + { + "epoch": 1.132425900012721, + "ewc_loss": 0.0076203374192118645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.620337419211864e-05, + "grad_norm": 3.73157000541687, + "learning_rate": 1e-06, + "loss": 0.3564, + 
"mean_token_accuracy": 0.8760384321212769, + "num_tokens": 339502382.0, + "step": 8902 + }, + { + "epoch": 1.1325531102913116, + "ewc_loss": 0.007631038781255484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.631038897670805e-05, + "grad_norm": 3.7843687534332275, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8582940697669983, + "num_tokens": 339542875.0, + "step": 8903 + }, + { + "epoch": 1.132680320569902, + "ewc_loss": 0.007637546397745609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.637546514160931e-05, + "grad_norm": 3.728628635406494, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8591824173927307, + "num_tokens": 339580776.0, + "step": 8904 + }, + { + "epoch": 1.1328075308484926, + "ewc_loss": 0.007568594999611378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.56859517423436e-05, + "grad_norm": 3.731215476989746, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8843265175819397, + "num_tokens": 339620195.0, + "step": 8905 + }, + { + "epoch": 1.1329347411270831, + "ewc_loss": 0.0075971162877976894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.597116200486198e-05, + "grad_norm": 3.707854747772217, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8796286582946777, + "num_tokens": 339659234.0, + "step": 8906 + }, + { + "epoch": 1.1330619514056737, + "ewc_loss": 0.007565975189208984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.565975101897493e-05, + "grad_norm": 3.7113118171691895, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8640840649604797, + "num_tokens": 339703259.0, + "step": 8907 + }, + { + "epoch": 1.133189161684264, + "ewc_loss": 0.00756677845492959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566778367618099e-05, + "grad_norm": 3.671651601791382, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8792882561683655, + "num_tokens": 339745901.0, + "step": 8908 + }, + { + "epoch": 
1.1333163719628545, + "ewc_loss": 0.007526422385126352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52642226871103e-05, + "grad_norm": 3.705127477645874, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8636939525604248, + "num_tokens": 339790417.0, + "step": 8909 + }, + { + "epoch": 1.133443582241445, + "ewc_loss": 0.007562919985502958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.562919927295297e-05, + "grad_norm": 3.781376838684082, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8665527105331421, + "num_tokens": 339827224.0, + "step": 8910 + }, + { + "epoch": 1.1335707925200356, + "ewc_loss": 0.007594934199005365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.594934140797704e-05, + "grad_norm": 3.7773256301879883, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8680901527404785, + "num_tokens": 339859932.0, + "step": 8911 + }, + { + "epoch": 1.133698002798626, + "ewc_loss": 0.007554142270237207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.554142212029546e-05, + "grad_norm": 3.7487356662750244, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8603399991989136, + "num_tokens": 339902416.0, + "step": 8912 + }, + { + "epoch": 1.1338252130772166, + "ewc_loss": 0.007531771436333656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.531771552748978e-05, + "grad_norm": 3.6867518424987793, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8806271553039551, + "num_tokens": 339940495.0, + "step": 8913 + }, + { + "epoch": 1.1339524233558071, + "ewc_loss": 0.007513032294809818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.513032323913649e-05, + "grad_norm": 3.6800646781921387, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8567802906036377, + "num_tokens": 339983010.0, + "step": 8914 + }, + { + "epoch": 1.1340796336343977, + "ewc_loss": 0.007540008053183556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.540007936768234e-05, + "grad_norm": 3.744260549545288, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8724997043609619, + "num_tokens": 340020683.0, + "step": 8915 + }, + { + "epoch": 1.1342068439129882, + "ewc_loss": 0.007555797696113586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.555797492386773e-05, + "grad_norm": 3.7162063121795654, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8641924262046814, + "num_tokens": 340063179.0, + "step": 8916 + }, + { + "epoch": 1.1343340541915787, + "ewc_loss": 0.00753864087164402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.538640784332529e-05, + "grad_norm": 3.723963737487793, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8817070722579956, + "num_tokens": 340100166.0, + "step": 8917 + }, + { + "epoch": 1.1344612644701693, + "ewc_loss": 0.0075539094395935535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.553909381385893e-05, + "grad_norm": 3.70723819732666, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8694310188293457, + "num_tokens": 340144325.0, + "step": 8918 + }, + { + "epoch": 1.1345884747487598, + "ewc_loss": 0.007532563991844654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.532563904533163e-05, + "grad_norm": 3.7910726070404053, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8714391589164734, + "num_tokens": 340180563.0, + "step": 8919 + }, + { + "epoch": 1.1347156850273503, + "ewc_loss": 0.007600420154631138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.600420212838799e-05, + "grad_norm": 3.7215490341186523, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8600493669509888, + "num_tokens": 340224621.0, + "step": 8920 + }, + { + "epoch": 1.1348428953059406, + "ewc_loss": 0.0075087957084178925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.50879553379491e-05, + "grad_norm": 3.7215416431427, + "learning_rate": 1e-06, + "loss": 0.3606, + 
"mean_token_accuracy": 0.8770632743835449, + "num_tokens": 340264810.0, + "step": 8921 + }, + { + "epoch": 1.1349701055845312, + "ewc_loss": 0.007548958994448185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.548958819825202e-05, + "grad_norm": 3.7419252395629883, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.885391354560852, + "num_tokens": 340301158.0, + "step": 8922 + }, + { + "epoch": 1.1350973158631217, + "ewc_loss": 0.007550125941634178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.550125883426517e-05, + "grad_norm": 3.714341163635254, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8724021315574646, + "num_tokens": 340339806.0, + "step": 8923 + }, + { + "epoch": 1.1352245261417122, + "ewc_loss": 0.00752599723637104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.525997352786362e-05, + "grad_norm": 3.6855831146240234, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8781823515892029, + "num_tokens": 340386412.0, + "step": 8924 + }, + { + "epoch": 1.1353517364203027, + "ewc_loss": 0.007520281244069338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52028136048466e-05, + "grad_norm": 3.701495885848999, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8807166218757629, + "num_tokens": 340431391.0, + "step": 8925 + }, + { + "epoch": 1.1354789466988933, + "ewc_loss": 0.007527339272201061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.527339039370418e-05, + "grad_norm": 3.7929940223693848, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8743942379951477, + "num_tokens": 340465904.0, + "step": 8926 + }, + { + "epoch": 1.1356061569774838, + "ewc_loss": 0.007564625237137079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564625411760062e-05, + "grad_norm": 3.7008583545684814, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8757112622261047, + "num_tokens": 340504290.0, + "step": 8927 + }, + { + "epoch": 
1.1357333672560743, + "ewc_loss": 0.007485007401555777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.485007517971098e-05, + "grad_norm": 3.759087324142456, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8678414225578308, + "num_tokens": 340539100.0, + "step": 8928 + }, + { + "epoch": 1.1358605775346649, + "ewc_loss": 0.007565958425402641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.56595836719498e-05, + "grad_norm": 3.7964937686920166, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8854166865348816, + "num_tokens": 340569005.0, + "step": 8929 + }, + { + "epoch": 1.1359877878132554, + "ewc_loss": 0.007566245272755623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566245039924979e-05, + "grad_norm": 3.6633708477020264, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8727228045463562, + "num_tokens": 340613476.0, + "step": 8930 + }, + { + "epoch": 1.136114998091846, + "ewc_loss": 0.00748381856828928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.483818626496941e-05, + "grad_norm": 3.8141119480133057, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8651604652404785, + "num_tokens": 340647982.0, + "step": 8931 + }, + { + "epoch": 1.1362422083704362, + "ewc_loss": 0.007636323571205139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.636323425685987e-05, + "grad_norm": 3.79282283782959, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8516483902931213, + "num_tokens": 340683270.0, + "step": 8932 + }, + { + "epoch": 1.1363694186490267, + "ewc_loss": 0.0075663416646420956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566341810161248e-05, + "grad_norm": 3.770214796066284, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8756351470947266, + "num_tokens": 340719955.0, + "step": 8933 + }, + { + "epoch": 1.1364966289276173, + "ewc_loss": 0.007572636008262634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.572636241093278e-05, + "grad_norm": 3.7802536487579346, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8746249079704285, + "num_tokens": 340753934.0, + "step": 8934 + }, + { + "epoch": 1.1366238392062078, + "ewc_loss": 0.007621798664331436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.621798431500793e-05, + "grad_norm": 3.752885580062866, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8792560696601868, + "num_tokens": 340787545.0, + "step": 8935 + }, + { + "epoch": 1.1367510494847983, + "ewc_loss": 0.007590658031404018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.590658060507849e-05, + "grad_norm": 3.6780052185058594, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8816415071487427, + "num_tokens": 340826362.0, + "step": 8936 + }, + { + "epoch": 1.1368782597633889, + "ewc_loss": 0.007556874305009842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.556874334113672e-05, + "grad_norm": 3.854546308517456, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8763954639434814, + "num_tokens": 340857871.0, + "step": 8937 + }, + { + "epoch": 1.1370054700419794, + "ewc_loss": 0.007708259392529726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.708259363425896e-05, + "grad_norm": 3.676837682723999, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8788039684295654, + "num_tokens": 340897736.0, + "step": 8938 + }, + { + "epoch": 1.13713268032057, + "ewc_loss": 0.007529939990490675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.529940194217488e-05, + "grad_norm": 3.756382703781128, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.869476318359375, + "num_tokens": 340935774.0, + "step": 8939 + }, + { + "epoch": 1.1372598905991604, + "ewc_loss": 0.0076519157737493515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.651915802853182e-05, + "grad_norm": 3.756657361984253, + "learning_rate": 1e-06, + "loss": 0.368, + 
"mean_token_accuracy": 0.876200795173645, + "num_tokens": 340971497.0, + "step": 8940 + }, + { + "epoch": 1.137387100877751, + "ewc_loss": 0.007604526821523905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.604526763316244e-05, + "grad_norm": 3.6852004528045654, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8739433288574219, + "num_tokens": 341016209.0, + "step": 8941 + }, + { + "epoch": 1.1375143111563415, + "ewc_loss": 0.007570320274680853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.570320303784683e-05, + "grad_norm": 3.734391689300537, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8687946200370789, + "num_tokens": 341052137.0, + "step": 8942 + }, + { + "epoch": 1.137641521434932, + "ewc_loss": 0.007635471411049366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.635471411049366e-05, + "grad_norm": 3.7816426753997803, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8753678202629089, + "num_tokens": 341083417.0, + "step": 8943 + }, + { + "epoch": 1.1377687317135226, + "ewc_loss": 0.00762554258108139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.625542639289051e-05, + "grad_norm": 3.6914005279541016, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.876063883304596, + "num_tokens": 341127798.0, + "step": 8944 + }, + { + "epoch": 1.137895941992113, + "ewc_loss": 0.007564533036202192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564533007098362e-05, + "grad_norm": 3.7077527046203613, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8601288795471191, + "num_tokens": 341173485.0, + "step": 8945 + }, + { + "epoch": 1.1380231522707034, + "ewc_loss": 0.007614242844283581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.614243077114224e-05, + "grad_norm": 3.7410330772399902, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8616333603858948, + "num_tokens": 341212834.0, + "step": 8946 + }, + { + "epoch": 
1.138150362549294, + "ewc_loss": 0.007619726471602917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.619726238772273e-05, + "grad_norm": 3.790820360183716, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8663368225097656, + "num_tokens": 341246497.0, + "step": 8947 + }, + { + "epoch": 1.1382775728278844, + "ewc_loss": 0.0076344250701367855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.634425128344446e-05, + "grad_norm": 3.7141497135162354, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.859902024269104, + "num_tokens": 341291498.0, + "step": 8948 + }, + { + "epoch": 1.138404783106475, + "ewc_loss": 0.007577506825327873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.577506767120212e-05, + "grad_norm": 3.7480170726776123, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8669992685317993, + "num_tokens": 341328104.0, + "step": 8949 + }, + { + "epoch": 1.1385319933850655, + "ewc_loss": 0.007624550722539425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.624550926266238e-05, + "grad_norm": 3.77056884765625, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8836819529533386, + "num_tokens": 341359042.0, + "step": 8950 + }, + { + "epoch": 1.138659203663656, + "ewc_loss": 0.007619619835168123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.619620009791106e-05, + "grad_norm": 3.6751646995544434, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8736567497253418, + "num_tokens": 341403529.0, + "step": 8951 + }, + { + "epoch": 1.1387864139422466, + "ewc_loss": 0.0075537050142884254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.553704926976934e-05, + "grad_norm": 3.710919141769409, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8795638084411621, + "num_tokens": 341442509.0, + "step": 8952 + }, + { + "epoch": 1.138913624220837, + "ewc_loss": 0.0076072667725384235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.607266888953745e-05, + "grad_norm": 3.702037811279297, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8701823353767395, + "num_tokens": 341483015.0, + "step": 8953 + }, + { + "epoch": 1.1390408344994276, + "ewc_loss": 0.007581784389913082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.581784302601591e-05, + "grad_norm": 3.7425544261932373, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.873160183429718, + "num_tokens": 341519676.0, + "step": 8954 + }, + { + "epoch": 1.1391680447780181, + "ewc_loss": 0.0076064979657530785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.606497820233926e-05, + "grad_norm": 3.706817626953125, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8720458745956421, + "num_tokens": 341563483.0, + "step": 8955 + }, + { + "epoch": 1.1392952550566087, + "ewc_loss": 0.007563781924545765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.563782128272578e-05, + "grad_norm": 3.7482707500457764, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8654496073722839, + "num_tokens": 341598387.0, + "step": 8956 + }, + { + "epoch": 1.139422465335199, + "ewc_loss": 0.007603534962981939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.603535050293431e-05, + "grad_norm": 3.7658896446228027, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8756720423698425, + "num_tokens": 341640253.0, + "step": 8957 + }, + { + "epoch": 1.1395496756137895, + "ewc_loss": 0.00758775882422924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.587758591398597e-05, + "grad_norm": 3.7192766666412354, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8988150954246521, + "num_tokens": 341675788.0, + "step": 8958 + }, + { + "epoch": 1.13967688589238, + "ewc_loss": 0.00756035465747118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.560354424640536e-05, + "grad_norm": 3.6983683109283447, + "learning_rate": 1e-06, + "loss": 0.3654, + 
"mean_token_accuracy": 0.874038577079773, + "num_tokens": 341716917.0, + "step": 8959 + }, + { + "epoch": 1.1398040961709706, + "ewc_loss": 0.0075533571653068066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.553357136202976e-05, + "grad_norm": 3.6908884048461914, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8713967204093933, + "num_tokens": 341758936.0, + "step": 8960 + }, + { + "epoch": 1.139931306449561, + "ewc_loss": 0.00754949264228344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.549492875114083e-05, + "grad_norm": 3.797048807144165, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8501728773117065, + "num_tokens": 341797612.0, + "step": 8961 + }, + { + "epoch": 1.1400585167281516, + "ewc_loss": 0.007624644786119461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.624644786119461e-05, + "grad_norm": 3.7527127265930176, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8680406212806702, + "num_tokens": 341836061.0, + "step": 8962 + }, + { + "epoch": 1.1401857270067421, + "ewc_loss": 0.007555988151580095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.555988122476265e-05, + "grad_norm": 3.731822967529297, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8719704151153564, + "num_tokens": 341875465.0, + "step": 8963 + }, + { + "epoch": 1.1403129372853327, + "ewc_loss": 0.007558681070804596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.558680954389274e-05, + "grad_norm": 3.7213470935821533, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8677504658699036, + "num_tokens": 341919795.0, + "step": 8964 + }, + { + "epoch": 1.1404401475639232, + "ewc_loss": 0.007544291205704212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.544291293015704e-05, + "grad_norm": 3.7653095722198486, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.869382917881012, + "num_tokens": 341953291.0, + "step": 8965 + }, + { + "epoch": 
1.1405673578425137, + "ewc_loss": 0.007585000246763229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58500027586706e-05, + "grad_norm": 3.744319438934326, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8917063474655151, + "num_tokens": 341988364.0, + "step": 8966 + }, + { + "epoch": 1.1406945681211043, + "ewc_loss": 0.007553031202405691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.553031173301861e-05, + "grad_norm": 3.722216844558716, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8758230209350586, + "num_tokens": 342028478.0, + "step": 8967 + }, + { + "epoch": 1.1408217783996948, + "ewc_loss": 0.007558175828307867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.55817600293085e-05, + "grad_norm": 3.704622983932495, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8800268769264221, + "num_tokens": 342067348.0, + "step": 8968 + }, + { + "epoch": 1.1409489886782853, + "ewc_loss": 0.00754106231033802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.541062223026529e-05, + "grad_norm": 3.7487893104553223, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8680515289306641, + "num_tokens": 342105845.0, + "step": 8969 + }, + { + "epoch": 1.1410761989568756, + "ewc_loss": 0.007579862140119076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.579861994599923e-05, + "grad_norm": 3.807076930999756, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8722277879714966, + "num_tokens": 342142072.0, + "step": 8970 + }, + { + "epoch": 1.1412034092354661, + "ewc_loss": 0.0075777252204716206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.577725045848638e-05, + "grad_norm": 3.743788242340088, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8742878437042236, + "num_tokens": 342176592.0, + "step": 8971 + }, + { + "epoch": 1.1413306195140567, + "ewc_loss": 0.007530849892646074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.530849688919261e-05, + "grad_norm": 3.7303082942962646, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8752939701080322, + "num_tokens": 342211896.0, + "step": 8972 + }, + { + "epoch": 1.1414578297926472, + "ewc_loss": 0.007548648864030838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.548648864030838e-05, + "grad_norm": 3.6852610111236572, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8757498264312744, + "num_tokens": 342256078.0, + "step": 8973 + }, + { + "epoch": 1.1415850400712377, + "ewc_loss": 0.007526678964495659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.526679110014811e-05, + "grad_norm": 3.7152090072631836, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8736220598220825, + "num_tokens": 342296766.0, + "step": 8974 + }, + { + "epoch": 1.1417122503498283, + "ewc_loss": 0.0075575304217636585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.557530625490472e-05, + "grad_norm": 3.712313413619995, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8729456663131714, + "num_tokens": 342333689.0, + "step": 8975 + }, + { + "epoch": 1.1418394606284188, + "ewc_loss": 0.007536843419075012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.536843622801825e-05, + "grad_norm": 3.736706256866455, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8825439214706421, + "num_tokens": 342369605.0, + "step": 8976 + }, + { + "epoch": 1.1419666709070093, + "ewc_loss": 0.007559085264801979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.559085497632623e-05, + "grad_norm": 3.7052319049835205, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8818885087966919, + "num_tokens": 342410806.0, + "step": 8977 + }, + { + "epoch": 1.1420938811855998, + "ewc_loss": 0.007528381887823343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52838168409653e-05, + "grad_norm": 3.728184461593628, + "learning_rate": 1e-06, + "loss": 0.2977, + 
"mean_token_accuracy": 0.8982506990432739, + "num_tokens": 342446957.0, + "step": 8978 + }, + { + "epoch": 1.1422210914641904, + "ewc_loss": 0.007540285121649504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.540285150753334e-05, + "grad_norm": 3.837139844894409, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8611550331115723, + "num_tokens": 342480717.0, + "step": 8979 + }, + { + "epoch": 1.142348301742781, + "ewc_loss": 0.007590033113956451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.59003305574879e-05, + "grad_norm": 3.704092264175415, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8793872594833374, + "num_tokens": 342520431.0, + "step": 8980 + }, + { + "epoch": 1.1424755120213712, + "ewc_loss": 0.007470006588846445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.470006676157936e-05, + "grad_norm": 3.6924803256988525, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8692708015441895, + "num_tokens": 342561735.0, + "step": 8981 + }, + { + "epoch": 1.1426027222999617, + "ewc_loss": 0.007522501051425934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.522501255152747e-05, + "grad_norm": 3.788771390914917, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8845341801643372, + "num_tokens": 342593899.0, + "step": 8982 + }, + { + "epoch": 1.1427299325785523, + "ewc_loss": 0.007583060301840305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.583060505567119e-05, + "grad_norm": 3.7857091426849365, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8717272281646729, + "num_tokens": 342628044.0, + "step": 8983 + }, + { + "epoch": 1.1428571428571428, + "ewc_loss": 0.007530123461037874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.530123548349366e-05, + "grad_norm": 3.7884769439697266, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8714003562927246, + "num_tokens": 342665018.0, + "step": 8984 + }, + { + "epoch": 
1.1429843531357333, + "ewc_loss": 0.0075516896322369576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.551689486717805e-05, + "grad_norm": 3.800147533416748, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8626314401626587, + "num_tokens": 342704832.0, + "step": 8985 + }, + { + "epoch": 1.1431115634143239, + "ewc_loss": 0.007540076971054077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.540077058365569e-05, + "grad_norm": 3.7167394161224365, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8789896368980408, + "num_tokens": 342741598.0, + "step": 8986 + }, + { + "epoch": 1.1432387736929144, + "ewc_loss": 0.007507902104407549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.507902046199888e-05, + "grad_norm": 3.779195785522461, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.868371307849884, + "num_tokens": 342777302.0, + "step": 8987 + }, + { + "epoch": 1.143365983971505, + "ewc_loss": 0.007592350244522095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.592350448248908e-05, + "grad_norm": 3.795713424682617, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.85484778881073, + "num_tokens": 342809578.0, + "step": 8988 + }, + { + "epoch": 1.1434931942500954, + "ewc_loss": 0.0075925374403595924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.592537440359592e-05, + "grad_norm": 3.729464292526245, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8740262985229492, + "num_tokens": 342845809.0, + "step": 8989 + }, + { + "epoch": 1.143620404528686, + "ewc_loss": 0.007556434255093336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.556434138678014e-05, + "grad_norm": 3.730365514755249, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8592976331710815, + "num_tokens": 342886118.0, + "step": 8990 + }, + { + "epoch": 1.1437476148072765, + "ewc_loss": 0.007598145864903927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.598145748488605e-05, + "grad_norm": 3.773665189743042, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8682130575180054, + "num_tokens": 342922517.0, + "step": 8991 + }, + { + "epoch": 1.143874825085867, + "ewc_loss": 0.0076362453401088715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.636245572939515e-05, + "grad_norm": 3.727527141571045, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8663815259933472, + "num_tokens": 342959741.0, + "step": 8992 + }, + { + "epoch": 1.1440020353644575, + "ewc_loss": 0.0076004741713404655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.600474054925144e-05, + "grad_norm": 3.7667698860168457, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8847143650054932, + "num_tokens": 342991418.0, + "step": 8993 + }, + { + "epoch": 1.144129245643048, + "ewc_loss": 0.007663014344871044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.663014548597857e-05, + "grad_norm": 3.770664691925049, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8892933130264282, + "num_tokens": 343024263.0, + "step": 8994 + }, + { + "epoch": 1.1442564559216384, + "ewc_loss": 0.007637690752744675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.637690578121692e-05, + "grad_norm": 3.7202231884002686, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.873816728591919, + "num_tokens": 343066388.0, + "step": 8995 + }, + { + "epoch": 1.144383666200229, + "ewc_loss": 0.0076071168296039104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.607117004226893e-05, + "grad_norm": 3.7100870609283447, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8739223480224609, + "num_tokens": 343107631.0, + "step": 8996 + }, + { + "epoch": 1.1445108764788194, + "ewc_loss": 0.007621732074767351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.621732220286503e-05, + "grad_norm": 3.746609687805176, + "learning_rate": 1e-06, + "loss": 0.3506, + 
"mean_token_accuracy": 0.8783495426177979, + "num_tokens": 343143645.0, + "step": 8997 + }, + { + "epoch": 1.14463808675741, + "ewc_loss": 0.0076283481903374195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62834824854508e-05, + "grad_norm": 3.7712416648864746, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8589891791343689, + "num_tokens": 343182076.0, + "step": 8998 + }, + { + "epoch": 1.1447652970360005, + "ewc_loss": 0.007631970103830099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.631970220245421e-05, + "grad_norm": 3.677586555480957, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8905278444290161, + "num_tokens": 343222808.0, + "step": 8999 + }, + { + "epoch": 1.144892507314591, + "ewc_loss": 0.007581877987831831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.581878162454814e-05, + "grad_norm": 3.700213670730591, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8692252039909363, + "num_tokens": 343268229.0, + "step": 9000 + }, + { + "epoch": 1.1450197175931816, + "ewc_loss": 0.007635453715920448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.635453948751092e-05, + "grad_norm": 3.735316514968872, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8665248155593872, + "num_tokens": 343304459.0, + "step": 9001 + }, + { + "epoch": 1.145146927871772, + "ewc_loss": 0.007640128955245018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.640128751518205e-05, + "grad_norm": 3.827646255493164, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8356512784957886, + "num_tokens": 343343211.0, + "step": 9002 + }, + { + "epoch": 1.1452741381503626, + "ewc_loss": 0.007665312848985195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.665313023608178e-05, + "grad_norm": 3.6540350914001465, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8688467741012573, + "num_tokens": 343388498.0, + "step": 9003 + }, + { + "epoch": 
1.1454013484289531, + "ewc_loss": 0.007533220108598471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.533220195909962e-05, + "grad_norm": 3.709792137145996, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8697925806045532, + "num_tokens": 343430101.0, + "step": 9004 + }, + { + "epoch": 1.1455285587075437, + "ewc_loss": 0.0076272799633443356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.627280137967318e-05, + "grad_norm": 3.793030261993408, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8606559038162231, + "num_tokens": 343466454.0, + "step": 9005 + }, + { + "epoch": 1.145655768986134, + "ewc_loss": 0.007639662362635136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.639662362635136e-05, + "grad_norm": 3.717082977294922, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8866366147994995, + "num_tokens": 343506960.0, + "step": 9006 + }, + { + "epoch": 1.1457829792647245, + "ewc_loss": 0.007561842445284128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.561842357972637e-05, + "grad_norm": 3.781061887741089, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8800698518753052, + "num_tokens": 343541133.0, + "step": 9007 + }, + { + "epoch": 1.145910189543315, + "ewc_loss": 0.007636353839188814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.636353984707966e-05, + "grad_norm": 3.7421693801879883, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8809026479721069, + "num_tokens": 343580266.0, + "step": 9008 + }, + { + "epoch": 1.1460373998219056, + "ewc_loss": 0.007589966058731079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58996611693874e-05, + "grad_norm": 3.7661473751068115, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8698506355285645, + "num_tokens": 343611691.0, + "step": 9009 + }, + { + "epoch": 1.146164610100496, + "ewc_loss": 0.007607384584844112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.607384759467095e-05, + "grad_norm": 3.743224620819092, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8779741525650024, + "num_tokens": 343649355.0, + "step": 9010 + }, + { + "epoch": 1.1462918203790866, + "ewc_loss": 0.007611229550093412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.61122937547043e-05, + "grad_norm": 3.751918315887451, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8854130506515503, + "num_tokens": 343681606.0, + "step": 9011 + }, + { + "epoch": 1.1464190306576771, + "ewc_loss": 0.007608681917190552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.608682062709704e-05, + "grad_norm": 3.68572998046875, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8836337924003601, + "num_tokens": 343721592.0, + "step": 9012 + }, + { + "epoch": 1.1465462409362677, + "ewc_loss": 0.007579875644296408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57987581891939e-05, + "grad_norm": 3.6740126609802246, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8710170984268188, + "num_tokens": 343764084.0, + "step": 9013 + }, + { + "epoch": 1.1466734512148582, + "ewc_loss": 0.007596873678267002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.596873911097646e-05, + "grad_norm": 3.7832438945770264, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8734805583953857, + "num_tokens": 343801213.0, + "step": 9014 + }, + { + "epoch": 1.1468006614934487, + "ewc_loss": 0.007647198159247637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.647198071936145e-05, + "grad_norm": 3.7506253719329834, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8731328248977661, + "num_tokens": 343839761.0, + "step": 9015 + }, + { + "epoch": 1.1469278717720393, + "ewc_loss": 0.007596096023917198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.59609611122869e-05, + "grad_norm": 3.728736162185669, + "learning_rate": 1e-06, + "loss": 0.3744, + 
"mean_token_accuracy": 0.8719482421875, + "num_tokens": 343877792.0, + "step": 9016 + }, + { + "epoch": 1.1470550820506298, + "ewc_loss": 0.007576890289783478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.576890493510291e-05, + "grad_norm": 3.670370578765869, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8798404335975647, + "num_tokens": 343915107.0, + "step": 9017 + }, + { + "epoch": 1.1471822923292203, + "ewc_loss": 0.00756449019536376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564490078948438e-05, + "grad_norm": 3.7826995849609375, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8776061534881592, + "num_tokens": 343949783.0, + "step": 9018 + }, + { + "epoch": 1.1473095026078106, + "ewc_loss": 0.007627340033650398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.627339800819755e-05, + "grad_norm": 3.7356131076812744, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8550572395324707, + "num_tokens": 343991058.0, + "step": 9019 + }, + { + "epoch": 1.1474367128864011, + "ewc_loss": 0.007557848002761602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.55784785724245e-05, + "grad_norm": 3.746029853820801, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8737281560897827, + "num_tokens": 344030424.0, + "step": 9020 + }, + { + "epoch": 1.1475639231649917, + "ewc_loss": 0.0075805168598890305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.5805168307852e-05, + "grad_norm": 3.7085654735565186, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8745691180229187, + "num_tokens": 344071385.0, + "step": 9021 + }, + { + "epoch": 1.1476911334435822, + "ewc_loss": 0.007559242658317089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.559242658317089e-05, + "grad_norm": 3.737271547317505, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8822439908981323, + "num_tokens": 344109725.0, + "step": 9022 + }, + { + "epoch": 
1.1478183437221727, + "ewc_loss": 0.007585040293633938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.585040293633938e-05, + "grad_norm": 3.740960121154785, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8817622661590576, + "num_tokens": 344143947.0, + "step": 9023 + }, + { + "epoch": 1.1479455540007633, + "ewc_loss": 0.0075882780365645885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58827809477225e-05, + "grad_norm": 3.8358700275421143, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8627164959907532, + "num_tokens": 344179969.0, + "step": 9024 + }, + { + "epoch": 1.1480727642793538, + "ewc_loss": 0.007632025517523289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.632025517523289e-05, + "grad_norm": 3.7768497467041016, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8845513463020325, + "num_tokens": 344212279.0, + "step": 9025 + }, + { + "epoch": 1.1481999745579443, + "ewc_loss": 0.007551785558462143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.551785529358312e-05, + "grad_norm": 3.752643585205078, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8786019682884216, + "num_tokens": 344248402.0, + "step": 9026 + }, + { + "epoch": 1.1483271848365348, + "ewc_loss": 0.007587437052279711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58743699407205e-05, + "grad_norm": 3.7493627071380615, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8709872364997864, + "num_tokens": 344286557.0, + "step": 9027 + }, + { + "epoch": 1.1484543951151254, + "ewc_loss": 0.0075700837187469006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.570083835162222e-05, + "grad_norm": 3.713425636291504, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8741938471794128, + "num_tokens": 344331014.0, + "step": 9028 + }, + { + "epoch": 1.148581605393716, + "ewc_loss": 0.007550663780421019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.550663576694205e-05, + "grad_norm": 3.7312567234039307, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8702971935272217, + "num_tokens": 344373239.0, + "step": 9029 + }, + { + "epoch": 1.1487088156723062, + "ewc_loss": 0.007578040938824415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.578040822409093e-05, + "grad_norm": 3.7336981296539307, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8874611854553223, + "num_tokens": 344409871.0, + "step": 9030 + }, + { + "epoch": 1.1488360259508967, + "ewc_loss": 0.007564154453575611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564154657302424e-05, + "grad_norm": 3.7288668155670166, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8622006773948669, + "num_tokens": 344452108.0, + "step": 9031 + }, + { + "epoch": 1.1489632362294873, + "ewc_loss": 0.007574280723929405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.574280607514083e-05, + "grad_norm": 3.8052866458892822, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8663029670715332, + "num_tokens": 344489798.0, + "step": 9032 + }, + { + "epoch": 1.1490904465080778, + "ewc_loss": 0.007603428326547146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.603428093716502e-05, + "grad_norm": 3.7456228733062744, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8809270858764648, + "num_tokens": 344528194.0, + "step": 9033 + }, + { + "epoch": 1.1492176567866683, + "ewc_loss": 0.007528350222855806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.528350397478789e-05, + "grad_norm": 3.858274459838867, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8584607243537903, + "num_tokens": 344559699.0, + "step": 9034 + }, + { + "epoch": 1.1493448670652588, + "ewc_loss": 0.007633629720658064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.633629866177216e-05, + "grad_norm": 3.731785774230957, + "learning_rate": 1e-06, + "loss": 0.3985, + 
"mean_token_accuracy": 0.8665388822555542, + "num_tokens": 344598234.0, + "step": 9035 + }, + { + "epoch": 1.1494720773438494, + "ewc_loss": 0.007501133717596531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.501133950427175e-05, + "grad_norm": 3.7276270389556885, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8675792217254639, + "num_tokens": 344634780.0, + "step": 9036 + }, + { + "epoch": 1.14959928762244, + "ewc_loss": 0.007564459461718798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564459519926459e-05, + "grad_norm": 3.698578357696533, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8769254684448242, + "num_tokens": 344677167.0, + "step": 9037 + }, + { + "epoch": 1.1497264979010304, + "ewc_loss": 0.00753893842920661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.538938370998949e-05, + "grad_norm": 3.7928762435913086, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8610584735870361, + "num_tokens": 344713824.0, + "step": 9038 + }, + { + "epoch": 1.149853708179621, + "ewc_loss": 0.007611243054270744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.611243199789897e-05, + "grad_norm": 3.7529044151306152, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8607399463653564, + "num_tokens": 344755814.0, + "step": 9039 + }, + { + "epoch": 1.1499809184582115, + "ewc_loss": 0.00754424836486578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.54424836486578e-05, + "grad_norm": 3.6739583015441895, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8874354958534241, + "num_tokens": 344795022.0, + "step": 9040 + }, + { + "epoch": 1.150108128736802, + "ewc_loss": 0.0075256372801959515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52563719288446e-05, + "grad_norm": 3.718794345855713, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8822881579399109, + "num_tokens": 344833906.0, + "step": 9041 + }, + { + "epoch": 
1.1502353390153925, + "ewc_loss": 0.007576752919703722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.576752977911383e-05, + "grad_norm": 3.6908881664276123, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8774585723876953, + "num_tokens": 344875975.0, + "step": 9042 + }, + { + "epoch": 1.150362549293983, + "ewc_loss": 0.007525987923145294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.525987894041464e-05, + "grad_norm": 3.708254098892212, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8759644031524658, + "num_tokens": 344915433.0, + "step": 9043 + }, + { + "epoch": 1.1504897595725734, + "ewc_loss": 0.007544666528701782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.544666732428595e-05, + "grad_norm": 3.787750244140625, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8776441812515259, + "num_tokens": 344949038.0, + "step": 9044 + }, + { + "epoch": 1.150616969851164, + "ewc_loss": 0.007588833104819059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.588833250338212e-05, + "grad_norm": 3.6933374404907227, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8743900060653687, + "num_tokens": 344988260.0, + "step": 9045 + }, + { + "epoch": 1.1507441801297544, + "ewc_loss": 0.007513758260756731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.513758464483544e-05, + "grad_norm": 3.743471622467041, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8853099346160889, + "num_tokens": 345024355.0, + "step": 9046 + }, + { + "epoch": 1.150871390408345, + "ewc_loss": 0.0075718495063483715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.571849710075185e-05, + "grad_norm": 3.691150426864624, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8821358680725098, + "num_tokens": 345063041.0, + "step": 9047 + }, + { + "epoch": 1.1509986006869355, + "ewc_loss": 0.007523823529481888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.523823296651244e-05, + "grad_norm": 3.734189748764038, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8925313949584961, + "num_tokens": 345097813.0, + "step": 9048 + }, + { + "epoch": 1.151125810965526, + "ewc_loss": 0.0075632864609360695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.563286635559052e-05, + "grad_norm": 3.7644684314727783, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8702102303504944, + "num_tokens": 345134534.0, + "step": 9049 + }, + { + "epoch": 1.1512530212441165, + "ewc_loss": 0.007556501775979996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.556501805083826e-05, + "grad_norm": 3.7733685970306396, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8763096332550049, + "num_tokens": 345171018.0, + "step": 9050 + }, + { + "epoch": 1.151380231522707, + "ewc_loss": 0.0075635528191924095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.563552935607731e-05, + "grad_norm": 3.7525408267974854, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8822613954544067, + "num_tokens": 345208006.0, + "step": 9051 + }, + { + "epoch": 1.1515074418012976, + "ewc_loss": 0.007536410819739103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.536410703323781e-05, + "grad_norm": 3.6775906085968018, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8908362984657288, + "num_tokens": 345246616.0, + "step": 9052 + }, + { + "epoch": 1.1516346520798881, + "ewc_loss": 0.007518772501498461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.518772326875478e-05, + "grad_norm": 3.735525369644165, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8884692192077637, + "num_tokens": 345288095.0, + "step": 9053 + }, + { + "epoch": 1.1517618623584787, + "ewc_loss": 0.007541536819189787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.541536615462974e-05, + "grad_norm": 3.6981451511383057, + "learning_rate": 1e-06, + "loss": 0.4028, + 
"mean_token_accuracy": 0.8592513203620911, + "num_tokens": 345333291.0, + "step": 9054 + }, + { + "epoch": 1.151889072637069, + "ewc_loss": 0.007497293874621391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.497293699998409e-05, + "grad_norm": 3.7148942947387695, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8714715242385864, + "num_tokens": 345375093.0, + "step": 9055 + }, + { + "epoch": 1.1520162829156595, + "ewc_loss": 0.00751375500112772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.513754826504737e-05, + "grad_norm": 3.725050210952759, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8678107261657715, + "num_tokens": 345415547.0, + "step": 9056 + }, + { + "epoch": 1.15214349319425, + "ewc_loss": 0.007504323497414589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.504323730245233e-05, + "grad_norm": 3.7329583168029785, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8782343864440918, + "num_tokens": 345457340.0, + "step": 9057 + }, + { + "epoch": 1.1522707034728406, + "ewc_loss": 0.007492730859667063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.492730946978554e-05, + "grad_norm": 3.7637362480163574, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8685230016708374, + "num_tokens": 345494711.0, + "step": 9058 + }, + { + "epoch": 1.152397913751431, + "ewc_loss": 0.00751158595085144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.511585863539949e-05, + "grad_norm": 3.7478995323181152, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8659093379974365, + "num_tokens": 345534249.0, + "step": 9059 + }, + { + "epoch": 1.1525251240300216, + "ewc_loss": 0.0074854278936982155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.485428068321198e-05, + "grad_norm": 3.748427391052246, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8727666139602661, + "num_tokens": 345570832.0, + "step": 9060 + }, + { + "epoch": 
1.1526523343086121, + "ewc_loss": 0.0074958400800824165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.495839963667095e-05, + "grad_norm": 3.7520411014556885, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8682656288146973, + "num_tokens": 345609007.0, + "step": 9061 + }, + { + "epoch": 1.1527795445872027, + "ewc_loss": 0.00751293683424592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.512937008868903e-05, + "grad_norm": 3.7362284660339355, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8832602500915527, + "num_tokens": 345647979.0, + "step": 9062 + }, + { + "epoch": 1.1529067548657932, + "ewc_loss": 0.007494884543120861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.494884630432352e-05, + "grad_norm": 3.7431468963623047, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8653596639633179, + "num_tokens": 345681821.0, + "step": 9063 + }, + { + "epoch": 1.1530339651443837, + "ewc_loss": 0.007510640658438206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.510640716645867e-05, + "grad_norm": 3.7041735649108887, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8639991283416748, + "num_tokens": 345724274.0, + "step": 9064 + }, + { + "epoch": 1.1531611754229742, + "ewc_loss": 0.007492735516279936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.492735312553123e-05, + "grad_norm": 3.686885118484497, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8895580172538757, + "num_tokens": 345763558.0, + "step": 9065 + }, + { + "epoch": 1.1532883857015648, + "ewc_loss": 0.007501809857785702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.501809886889532e-05, + "grad_norm": 3.781627655029297, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8685085773468018, + "num_tokens": 345798486.0, + "step": 9066 + }, + { + "epoch": 1.1534155959801553, + "ewc_loss": 0.007577775977551937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.577775977551937e-05, + "grad_norm": 3.7446231842041016, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8705958724021912, + "num_tokens": 345839379.0, + "step": 9067 + }, + { + "epoch": 1.1535428062587456, + "ewc_loss": 0.007523040287196636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.523040403611958e-05, + "grad_norm": 3.7460789680480957, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8691554069519043, + "num_tokens": 345876675.0, + "step": 9068 + }, + { + "epoch": 1.1536700165373361, + "ewc_loss": 0.00755036436021328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.550364534836262e-05, + "grad_norm": 3.706141710281372, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.891403317451477, + "num_tokens": 345913482.0, + "step": 9069 + }, + { + "epoch": 1.1537972268159267, + "ewc_loss": 0.007518173661082983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.518173515563831e-05, + "grad_norm": 3.7666361331939697, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8697761297225952, + "num_tokens": 345947671.0, + "step": 9070 + }, + { + "epoch": 1.1539244370945172, + "ewc_loss": 0.007597914431244135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.597914373036474e-05, + "grad_norm": 3.75591778755188, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8766472339630127, + "num_tokens": 345981520.0, + "step": 9071 + }, + { + "epoch": 1.1540516473731077, + "ewc_loss": 0.007569901179522276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.569901208626106e-05, + "grad_norm": 3.7331833839416504, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8571803569793701, + "num_tokens": 346020050.0, + "step": 9072 + }, + { + "epoch": 1.1541788576516983, + "ewc_loss": 0.00757833756506443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.578337681479752e-05, + "grad_norm": 3.805248498916626, + "learning_rate": 1e-06, + "loss": 0.4032, + 
"mean_token_accuracy": 0.8658266067504883, + "num_tokens": 346050000.0, + "step": 9073 + }, + { + "epoch": 1.1543060679302888, + "ewc_loss": 0.007647911086678505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.647911115782335e-05, + "grad_norm": 3.78363299369812, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8683137893676758, + "num_tokens": 346085038.0, + "step": 9074 + }, + { + "epoch": 1.1544332782088793, + "ewc_loss": 0.007587001193314791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58700116421096e-05, + "grad_norm": 3.6400952339172363, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8829519152641296, + "num_tokens": 346125871.0, + "step": 9075 + }, + { + "epoch": 1.1545604884874698, + "ewc_loss": 0.007566121406853199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566121348645538e-05, + "grad_norm": 3.726598024368286, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8818510174751282, + "num_tokens": 346160460.0, + "step": 9076 + }, + { + "epoch": 1.1546876987660604, + "ewc_loss": 0.007676892913877964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.676892710151151e-05, + "grad_norm": 3.7314646244049072, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8639619946479797, + "num_tokens": 346201621.0, + "step": 9077 + }, + { + "epoch": 1.154814909044651, + "ewc_loss": 0.007629456464201212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62945637688972e-05, + "grad_norm": 3.7242047786712646, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.875934362411499, + "num_tokens": 346242680.0, + "step": 9078 + }, + { + "epoch": 1.1549421193232412, + "ewc_loss": 0.007637977134436369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.637977250851691e-05, + "grad_norm": 3.806568145751953, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8632292747497559, + "num_tokens": 346275774.0, + "step": 9079 + }, + { + "epoch": 
1.1550693296018317, + "ewc_loss": 0.007690385449677706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.690385245950893e-05, + "grad_norm": 3.6953909397125244, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8761624693870544, + "num_tokens": 346317648.0, + "step": 9080 + }, + { + "epoch": 1.1551965398804223, + "ewc_loss": 0.007601284421980381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.601284596603364e-05, + "grad_norm": 3.7122445106506348, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8761878609657288, + "num_tokens": 346357736.0, + "step": 9081 + }, + { + "epoch": 1.1553237501590128, + "ewc_loss": 0.007648446597158909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.648446626262739e-05, + "grad_norm": 3.6950974464416504, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8711089491844177, + "num_tokens": 346397961.0, + "step": 9082 + }, + { + "epoch": 1.1554509604376033, + "ewc_loss": 0.007618649397045374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.618649397045374e-05, + "grad_norm": 3.6996278762817383, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8846144676208496, + "num_tokens": 346438268.0, + "step": 9083 + }, + { + "epoch": 1.1555781707161938, + "ewc_loss": 0.007622451055794954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.622451084898785e-05, + "grad_norm": 3.7299704551696777, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.864620566368103, + "num_tokens": 346479024.0, + "step": 9084 + }, + { + "epoch": 1.1557053809947844, + "ewc_loss": 0.007631198037415743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.631198241142556e-05, + "grad_norm": 3.7215921878814697, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8809709548950195, + "num_tokens": 346523526.0, + "step": 9085 + }, + { + "epoch": 1.155832591273375, + "ewc_loss": 0.007617398630827665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.617398659931496e-05, + "grad_norm": 3.7452752590179443, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.881077229976654, + "num_tokens": 346561985.0, + "step": 9086 + }, + { + "epoch": 1.1559598015519654, + "ewc_loss": 0.00760660832747817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.606608414789662e-05, + "grad_norm": 3.7326979637145996, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.859700620174408, + "num_tokens": 346605600.0, + "step": 9087 + }, + { + "epoch": 1.156087011830556, + "ewc_loss": 0.0075867981649935246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.586798164993525e-05, + "grad_norm": 3.728823661804199, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8873189687728882, + "num_tokens": 346640798.0, + "step": 9088 + }, + { + "epoch": 1.1562142221091465, + "ewc_loss": 0.007581097539514303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.581097452202812e-05, + "grad_norm": 3.7210395336151123, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8674352765083313, + "num_tokens": 346678483.0, + "step": 9089 + }, + { + "epoch": 1.156341432387737, + "ewc_loss": 0.007581229787319899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58122987463139e-05, + "grad_norm": 3.7074966430664062, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8897638320922852, + "num_tokens": 346714604.0, + "step": 9090 + }, + { + "epoch": 1.1564686426663275, + "ewc_loss": 0.007560406811535358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.560406811535358e-05, + "grad_norm": 3.7588417530059814, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8847489953041077, + "num_tokens": 346747203.0, + "step": 9091 + }, + { + "epoch": 1.156595852944918, + "ewc_loss": 0.007594688329845667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.594688213430345e-05, + "grad_norm": 3.694434881210327, + "learning_rate": 1e-06, + "loss": 0.3762, + 
"mean_token_accuracy": 0.871752142906189, + "num_tokens": 346790650.0, + "step": 9092 + }, + { + "epoch": 1.1567230632235084, + "ewc_loss": 0.007533046416938305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.533046300522983e-05, + "grad_norm": 3.787341833114624, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8957709074020386, + "num_tokens": 346822872.0, + "step": 9093 + }, + { + "epoch": 1.156850273502099, + "ewc_loss": 0.007599780801683664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.599780656164512e-05, + "grad_norm": 3.7784602642059326, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8795326352119446, + "num_tokens": 346857999.0, + "step": 9094 + }, + { + "epoch": 1.1569774837806894, + "ewc_loss": 0.007566617801785469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566617568954825e-05, + "grad_norm": 3.775888204574585, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8810174465179443, + "num_tokens": 346891804.0, + "step": 9095 + }, + { + "epoch": 1.15710469405928, + "ewc_loss": 0.007556398399174213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.556398486485705e-05, + "grad_norm": 3.7390973567962646, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8715773224830627, + "num_tokens": 346931554.0, + "step": 9096 + }, + { + "epoch": 1.1572319043378705, + "ewc_loss": 0.00753903528675437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.539035141235217e-05, + "grad_norm": 3.8214569091796875, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8613099455833435, + "num_tokens": 346965214.0, + "step": 9097 + }, + { + "epoch": 1.157359114616461, + "ewc_loss": 0.007619167678058147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.619167445227504e-05, + "grad_norm": 3.745028018951416, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8851071000099182, + "num_tokens": 347001219.0, + "step": 9098 + }, + { + "epoch": 
1.1574863248950515, + "ewc_loss": 0.007533902768045664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.533902680734172e-05, + "grad_norm": 3.7343716621398926, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8572611808776855, + "num_tokens": 347043212.0, + "step": 9099 + }, + { + "epoch": 1.157613535173642, + "ewc_loss": 0.007560914847999811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.560914673376828e-05, + "grad_norm": 3.7152509689331055, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8859118223190308, + "num_tokens": 347082249.0, + "step": 9100 + }, + { + "epoch": 1.1577407454522326, + "ewc_loss": 0.007546887267380953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.546887354692444e-05, + "grad_norm": 3.7295031547546387, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8708232641220093, + "num_tokens": 347119286.0, + "step": 9101 + }, + { + "epoch": 1.1578679557308231, + "ewc_loss": 0.007561070378869772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.561070378869772e-05, + "grad_norm": 3.706348419189453, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8856666088104248, + "num_tokens": 347155549.0, + "step": 9102 + }, + { + "epoch": 1.1579951660094137, + "ewc_loss": 0.0075526004657149315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.552600436611101e-05, + "grad_norm": 3.7372751235961914, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8762454986572266, + "num_tokens": 347196846.0, + "step": 9103 + }, + { + "epoch": 1.158122376288004, + "ewc_loss": 0.007561534643173218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.561534584965557e-05, + "grad_norm": 3.728732109069824, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8866724967956543, + "num_tokens": 347234028.0, + "step": 9104 + }, + { + "epoch": 1.1582495865665945, + "ewc_loss": 0.00755069637671113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.550696318503469e-05, + "grad_norm": 3.720841646194458, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.885515570640564, + "num_tokens": 347276931.0, + "step": 9105 + }, + { + "epoch": 1.158376796845185, + "ewc_loss": 0.007559206802397966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.55920700612478e-05, + "grad_norm": 3.73030948638916, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8714503645896912, + "num_tokens": 347317139.0, + "step": 9106 + }, + { + "epoch": 1.1585040071237755, + "ewc_loss": 0.007549290545284748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.549290603492409e-05, + "grad_norm": 3.7251455783843994, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8765532970428467, + "num_tokens": 347355718.0, + "step": 9107 + }, + { + "epoch": 1.158631217402366, + "ewc_loss": 0.007541654631495476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.541654485976323e-05, + "grad_norm": 3.762601375579834, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8650221824645996, + "num_tokens": 347393403.0, + "step": 9108 + }, + { + "epoch": 1.1587584276809566, + "ewc_loss": 0.007566654589027166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566654676338658e-05, + "grad_norm": 3.7503128051757812, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8812103271484375, + "num_tokens": 347433736.0, + "step": 9109 + }, + { + "epoch": 1.1588856379595471, + "ewc_loss": 0.00753253186121583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.53253189031966e-05, + "grad_norm": 3.6940672397613525, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8751034736633301, + "num_tokens": 347474574.0, + "step": 9110 + }, + { + "epoch": 1.1590128482381377, + "ewc_loss": 0.007528647780418396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.528647984145209e-05, + "grad_norm": 3.7109529972076416, + "learning_rate": 1e-06, + "loss": 0.3556, + 
"mean_token_accuracy": 0.878974199295044, + "num_tokens": 347518785.0, + "step": 9111 + }, + { + "epoch": 1.1591400585167282, + "ewc_loss": 0.0075541045516729355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.554104377049953e-05, + "grad_norm": 3.782985210418701, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8627673387527466, + "num_tokens": 347557518.0, + "step": 9112 + }, + { + "epoch": 1.1592672687953187, + "ewc_loss": 0.007577813696116209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.577813812531531e-05, + "grad_norm": 3.798938512802124, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8456584215164185, + "num_tokens": 347600094.0, + "step": 9113 + }, + { + "epoch": 1.1593944790739092, + "ewc_loss": 0.0075558326207101345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.555832416983321e-05, + "grad_norm": 3.796656847000122, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8720187544822693, + "num_tokens": 347633709.0, + "step": 9114 + }, + { + "epoch": 1.1595216893524998, + "ewc_loss": 0.00756911700591445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.569116860395297e-05, + "grad_norm": 3.7422590255737305, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8810732364654541, + "num_tokens": 347670027.0, + "step": 9115 + }, + { + "epoch": 1.1596488996310903, + "ewc_loss": 0.0075402879156172276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.54028806113638e-05, + "grad_norm": 3.7271013259887695, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8786952495574951, + "num_tokens": 347707488.0, + "step": 9116 + }, + { + "epoch": 1.1597761099096806, + "ewc_loss": 0.007549903355538845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.549903239123523e-05, + "grad_norm": 3.739703893661499, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8687130808830261, + "num_tokens": 347746030.0, + "step": 9117 + }, + { + "epoch": 
1.1599033201882711, + "ewc_loss": 0.0075533147901296616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.553314935648814e-05, + "grad_norm": 3.7403454780578613, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8680779933929443, + "num_tokens": 347783409.0, + "step": 9118 + }, + { + "epoch": 1.1600305304668617, + "ewc_loss": 0.007564891595393419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564891711808741e-05, + "grad_norm": 3.7253215312957764, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8806601762771606, + "num_tokens": 347821335.0, + "step": 9119 + }, + { + "epoch": 1.1601577407454522, + "ewc_loss": 0.007569589652121067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.56958979764022e-05, + "grad_norm": 3.7237305641174316, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8846168518066406, + "num_tokens": 347857590.0, + "step": 9120 + }, + { + "epoch": 1.1602849510240427, + "ewc_loss": 0.007564640138298273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.56463996367529e-05, + "grad_norm": 3.6975505352020264, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8852744698524475, + "num_tokens": 347897000.0, + "step": 9121 + }, + { + "epoch": 1.1604121613026332, + "ewc_loss": 0.007553806062787771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.553806062787771e-05, + "grad_norm": 3.8206796646118164, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8742349147796631, + "num_tokens": 347928947.0, + "step": 9122 + }, + { + "epoch": 1.1605393715812238, + "ewc_loss": 0.007667471654713154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.667471800232306e-05, + "grad_norm": 3.8981103897094727, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.844383955001831, + "num_tokens": 347959294.0, + "step": 9123 + }, + { + "epoch": 1.1606665818598143, + "ewc_loss": 0.0076588899828493595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.658889808226377e-05, + "grad_norm": 3.7570838928222656, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8791050314903259, + "num_tokens": 347993103.0, + "step": 9124 + }, + { + "epoch": 1.1607937921384048, + "ewc_loss": 0.007561584003269672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.561584061477333e-05, + "grad_norm": 3.7013370990753174, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8860260844230652, + "num_tokens": 348028412.0, + "step": 9125 + }, + { + "epoch": 1.1609210024169954, + "ewc_loss": 0.0075985947623848915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.5985946750734e-05, + "grad_norm": 3.7360494136810303, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8569892644882202, + "num_tokens": 348071429.0, + "step": 9126 + }, + { + "epoch": 1.161048212695586, + "ewc_loss": 0.007636073045432568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.636073132744059e-05, + "grad_norm": 3.709864854812622, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8779940605163574, + "num_tokens": 348113222.0, + "step": 9127 + }, + { + "epoch": 1.1611754229741762, + "ewc_loss": 0.007600630633533001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.600630488013849e-05, + "grad_norm": 3.781815528869629, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.873577892780304, + "num_tokens": 348150188.0, + "step": 9128 + }, + { + "epoch": 1.1613026332527667, + "ewc_loss": 0.00766666978597641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.666669989703223e-05, + "grad_norm": 3.73974609375, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8778072595596313, + "num_tokens": 348188547.0, + "step": 9129 + }, + { + "epoch": 1.1614298435313573, + "ewc_loss": 0.00760292075574398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.602920959470794e-05, + "grad_norm": 3.7549984455108643, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 
0.8689864873886108, + "num_tokens": 348228624.0, + "step": 9130 + }, + { + "epoch": 1.1615570538099478, + "ewc_loss": 0.0076183779165148735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.618378003826365e-05, + "grad_norm": 3.745065212249756, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8703833222389221, + "num_tokens": 348264932.0, + "step": 9131 + }, + { + "epoch": 1.1616842640885383, + "ewc_loss": 0.007619707845151424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.619708048878238e-05, + "grad_norm": 3.700136661529541, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8758851289749146, + "num_tokens": 348307680.0, + "step": 9132 + }, + { + "epoch": 1.1618114743671288, + "ewc_loss": 0.007576157804578543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.576157804578543e-05, + "grad_norm": 3.714046001434326, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8888694643974304, + "num_tokens": 348346449.0, + "step": 9133 + }, + { + "epoch": 1.1619386846457194, + "ewc_loss": 0.007621479220688343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62147901696153e-05, + "grad_norm": 3.727552652359009, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8835488557815552, + "num_tokens": 348386393.0, + "step": 9134 + }, + { + "epoch": 1.16206589492431, + "ewc_loss": 0.007598850876092911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.59885078878142e-05, + "grad_norm": 3.7388041019439697, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8670454621315002, + "num_tokens": 348429458.0, + "step": 9135 + }, + { + "epoch": 1.1621931052029004, + "ewc_loss": 0.007605026476085186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.605026621604338e-05, + "grad_norm": 3.750473737716675, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8605624437332153, + "num_tokens": 348471365.0, + "step": 9136 + }, + { + "epoch": 1.162320315481491, + "ewc_loss": 
0.007590562105178833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.590562017867342e-05, + "grad_norm": 3.762545585632324, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8598232269287109, + "num_tokens": 348513425.0, + "step": 9137 + }, + { + "epoch": 1.1624475257600815, + "ewc_loss": 0.007588397245854139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.588397420477122e-05, + "grad_norm": 3.7490735054016113, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8724241256713867, + "num_tokens": 348551181.0, + "step": 9138 + }, + { + "epoch": 1.162574736038672, + "ewc_loss": 0.0075658345595002174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.565834675915539e-05, + "grad_norm": 3.7508373260498047, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8947309255599976, + "num_tokens": 348583611.0, + "step": 9139 + }, + { + "epoch": 1.1627019463172625, + "ewc_loss": 0.007573950104415417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.5739502790384e-05, + "grad_norm": 3.707301139831543, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8725110292434692, + "num_tokens": 348624281.0, + "step": 9140 + }, + { + "epoch": 1.162829156595853, + "ewc_loss": 0.0075409370474517345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.540937076555565e-05, + "grad_norm": 3.794779062271118, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8725820183753967, + "num_tokens": 348661416.0, + "step": 9141 + }, + { + "epoch": 1.1629563668744434, + "ewc_loss": 0.007599098142236471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.599098171340302e-05, + "grad_norm": 3.7782657146453857, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.853811502456665, + "num_tokens": 348703907.0, + "step": 9142 + }, + { + "epoch": 1.163083577153034, + "ewc_loss": 0.007536720018833876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.536719931522384e-05, + "grad_norm": 
3.766566038131714, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8865455389022827, + "num_tokens": 348735617.0, + "step": 9143 + }, + { + "epoch": 1.1632107874316244, + "ewc_loss": 0.007561694830656052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.561694656033069e-05, + "grad_norm": 3.733839750289917, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8725610375404358, + "num_tokens": 348773552.0, + "step": 9144 + }, + { + "epoch": 1.163337997710215, + "ewc_loss": 0.007541575934737921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.54157590563409e-05, + "grad_norm": 3.738921880722046, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.876513659954071, + "num_tokens": 348811274.0, + "step": 9145 + }, + { + "epoch": 1.1634652079888055, + "ewc_loss": 0.007549996953457594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.549997098976746e-05, + "grad_norm": 3.74422287940979, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8627579808235168, + "num_tokens": 348847773.0, + "step": 9146 + }, + { + "epoch": 1.163592418267396, + "ewc_loss": 0.007563223131000996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.563223334727809e-05, + "grad_norm": 3.745131015777588, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8676042556762695, + "num_tokens": 348886356.0, + "step": 9147 + }, + { + "epoch": 1.1637196285459865, + "ewc_loss": 0.00756983645260334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.56983645260334e-05, + "grad_norm": 3.7376554012298584, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8838673830032349, + "num_tokens": 348922892.0, + "step": 9148 + }, + { + "epoch": 1.163846838824577, + "ewc_loss": 0.007568520959466696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.568520959466696e-05, + "grad_norm": 3.777972459793091, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8938965201377869, + "num_tokens": 
348954924.0, + "step": 9149 + }, + { + "epoch": 1.1639740491031676, + "ewc_loss": 0.007597469724714756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.597469812026247e-05, + "grad_norm": 3.6741206645965576, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8808671236038208, + "num_tokens": 348997335.0, + "step": 9150 + }, + { + "epoch": 1.1641012593817581, + "ewc_loss": 0.0075301178731024265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.530117727583274e-05, + "grad_norm": 3.786468029022217, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8630971908569336, + "num_tokens": 349038221.0, + "step": 9151 + }, + { + "epoch": 1.1642284696603487, + "ewc_loss": 0.007631951943039894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.631952030351385e-05, + "grad_norm": 3.767781972885132, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8696004748344421, + "num_tokens": 349076559.0, + "step": 9152 + }, + { + "epoch": 1.164355679938939, + "ewc_loss": 0.007560929283499718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.560929225292057e-05, + "grad_norm": 3.79732084274292, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8524683713912964, + "num_tokens": 349112264.0, + "step": 9153 + }, + { + "epoch": 1.1644828902175295, + "ewc_loss": 0.007603818085044622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.603818085044622e-05, + "grad_norm": 3.7609267234802246, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8857447504997253, + "num_tokens": 349146571.0, + "step": 9154 + }, + { + "epoch": 1.16461010049612, + "ewc_loss": 0.007553784176707268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.553784234914929e-05, + "grad_norm": 3.731109619140625, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8801463842391968, + "num_tokens": 349183674.0, + "step": 9155 + }, + { + "epoch": 1.1647373107747105, + "ewc_loss": 0.007562830578535795, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 7.562830433016643e-05, + "grad_norm": 3.731205940246582, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.877612829208374, + "num_tokens": 349227042.0, + "step": 9156 + }, + { + "epoch": 1.164864521053301, + "ewc_loss": 0.007569996640086174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.569996523670852e-05, + "grad_norm": 3.7693064212799072, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8695719242095947, + "num_tokens": 349264072.0, + "step": 9157 + }, + { + "epoch": 1.1649917313318916, + "ewc_loss": 0.007584266364574432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.584266131743789e-05, + "grad_norm": 3.745549440383911, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8734709024429321, + "num_tokens": 349304427.0, + "step": 9158 + }, + { + "epoch": 1.1651189416104821, + "ewc_loss": 0.007561799604445696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.561799429822713e-05, + "grad_norm": 3.7197389602661133, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8736540675163269, + "num_tokens": 349346708.0, + "step": 9159 + }, + { + "epoch": 1.1652461518890727, + "ewc_loss": 0.007566782645881176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566782733192667e-05, + "grad_norm": 3.753305435180664, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8730102181434631, + "num_tokens": 349387716.0, + "step": 9160 + }, + { + "epoch": 1.1653733621676632, + "ewc_loss": 0.007587637286633253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58763708290644e-05, + "grad_norm": 3.7483818531036377, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8705509305000305, + "num_tokens": 349425757.0, + "step": 9161 + }, + { + "epoch": 1.1655005724462537, + "ewc_loss": 0.0075586955063045025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.558695506304502e-05, + "grad_norm": 3.7229270935058594, + "learning_rate": 1e-06, + "loss": 
0.3556, + "mean_token_accuracy": 0.881607711315155, + "num_tokens": 349472097.0, + "step": 9162 + }, + { + "epoch": 1.1656277827248442, + "ewc_loss": 0.0075470698066055775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.54706998122856e-05, + "grad_norm": 3.755005121231079, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8710514307022095, + "num_tokens": 349513809.0, + "step": 9163 + }, + { + "epoch": 1.1657549930034348, + "ewc_loss": 0.007582360412925482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.582360558444634e-05, + "grad_norm": 3.7618656158447266, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8903378844261169, + "num_tokens": 349544906.0, + "step": 9164 + }, + { + "epoch": 1.1658822032820253, + "ewc_loss": 0.007565078791230917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.565078703919426e-05, + "grad_norm": 3.7456939220428467, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8698620796203613, + "num_tokens": 349582529.0, + "step": 9165 + }, + { + "epoch": 1.1660094135606156, + "ewc_loss": 0.007544944528490305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.544944674009457e-05, + "grad_norm": 3.7197136878967285, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8693620562553406, + "num_tokens": 349624871.0, + "step": 9166 + }, + { + "epoch": 1.1661366238392061, + "ewc_loss": 0.007532041519880295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.532041490776464e-05, + "grad_norm": 3.8335249423980713, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.838142991065979, + "num_tokens": 349660783.0, + "step": 9167 + }, + { + "epoch": 1.1662638341177967, + "ewc_loss": 0.007611093111336231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.611093315063044e-05, + "grad_norm": 3.7211709022521973, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8847337961196899, + "num_tokens": 349698430.0, + "step": 9168 + }, + { + "epoch": 
1.1663910443963872, + "ewc_loss": 0.007500082720071077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.500082574551925e-05, + "grad_norm": 3.7443604469299316, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8671611547470093, + "num_tokens": 349734981.0, + "step": 9169 + }, + { + "epoch": 1.1665182546749777, + "ewc_loss": 0.007585431914776564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.585431740153581e-05, + "grad_norm": 3.7741806507110596, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8689990639686584, + "num_tokens": 349774267.0, + "step": 9170 + }, + { + "epoch": 1.1666454649535682, + "ewc_loss": 0.007578202523291111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.578202348668128e-05, + "grad_norm": 3.769958019256592, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8707455992698669, + "num_tokens": 349811400.0, + "step": 9171 + }, + { + "epoch": 1.1667726752321588, + "ewc_loss": 0.007566336076706648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566335989395157e-05, + "grad_norm": 3.7592620849609375, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.866478443145752, + "num_tokens": 349850031.0, + "step": 9172 + }, + { + "epoch": 1.1668998855107493, + "ewc_loss": 0.0075818076729774475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.581807585665956e-05, + "grad_norm": 3.717923879623413, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8768628835678101, + "num_tokens": 349889967.0, + "step": 9173 + }, + { + "epoch": 1.1670270957893398, + "ewc_loss": 0.007553499191999435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.553499017376453e-05, + "grad_norm": 3.748481512069702, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8685225248336792, + "num_tokens": 349930276.0, + "step": 9174 + }, + { + "epoch": 1.1671543060679304, + "ewc_loss": 0.007595650851726532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.595650822622702e-05, + "grad_norm": 3.7463645935058594, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8692359924316406, + "num_tokens": 349969889.0, + "step": 9175 + }, + { + "epoch": 1.1672815163465209, + "ewc_loss": 0.007592210080474615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.592210022266954e-05, + "grad_norm": 3.7903289794921875, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8699995875358582, + "num_tokens": 350005729.0, + "step": 9176 + }, + { + "epoch": 1.1674087266251112, + "ewc_loss": 0.007618534378707409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.61853443691507e-05, + "grad_norm": 3.733455181121826, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8813605904579163, + "num_tokens": 350043130.0, + "step": 9177 + }, + { + "epoch": 1.1675359369037017, + "ewc_loss": 0.007561949081718922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.561949314549565e-05, + "grad_norm": 3.700305700302124, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8908119201660156, + "num_tokens": 350081233.0, + "step": 9178 + }, + { + "epoch": 1.1676631471822922, + "ewc_loss": 0.0075824367813766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.582436955999583e-05, + "grad_norm": 3.7829275131225586, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8598365783691406, + "num_tokens": 350119403.0, + "step": 9179 + }, + { + "epoch": 1.1677903574608828, + "ewc_loss": 0.007640068419277668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.640068361070007e-05, + "grad_norm": 3.762735605239868, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.861883819103241, + "num_tokens": 350159099.0, + "step": 9180 + }, + { + "epoch": 1.1679175677394733, + "ewc_loss": 0.007596193812787533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.59619360906072e-05, + "grad_norm": 3.7623202800750732, + "learning_rate": 1e-06, + "loss": 0.3785, + 
"mean_token_accuracy": 0.874405026435852, + "num_tokens": 350195642.0, + "step": 9181 + }, + { + "epoch": 1.1680447780180638, + "ewc_loss": 0.007602917496114969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.602917321491987e-05, + "grad_norm": 3.700749158859253, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8803365230560303, + "num_tokens": 350236721.0, + "step": 9182 + }, + { + "epoch": 1.1681719882966544, + "ewc_loss": 0.007563108578324318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.563108374597505e-05, + "grad_norm": 3.762577772140503, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8698651790618896, + "num_tokens": 350272691.0, + "step": 9183 + }, + { + "epoch": 1.168299198575245, + "ewc_loss": 0.007633848115801811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.633848144905642e-05, + "grad_norm": 3.6898534297943115, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8722699880599976, + "num_tokens": 350318299.0, + "step": 9184 + }, + { + "epoch": 1.1684264088538354, + "ewc_loss": 0.007582644931972027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.582645048387349e-05, + "grad_norm": 3.728757381439209, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8725844621658325, + "num_tokens": 350360973.0, + "step": 9185 + }, + { + "epoch": 1.168553619132426, + "ewc_loss": 0.007610654458403587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.610654574818909e-05, + "grad_norm": 3.725731611251831, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8815580606460571, + "num_tokens": 350400516.0, + "step": 9186 + }, + { + "epoch": 1.1686808294110165, + "ewc_loss": 0.007597428280860186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.597428339067847e-05, + "grad_norm": 3.757232666015625, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8694261312484741, + "num_tokens": 350438147.0, + "step": 9187 + }, + { + "epoch": 1.168808039689607, 
+ "ewc_loss": 0.007607010658830404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.607010775245726e-05, + "grad_norm": 3.704784631729126, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.878451943397522, + "num_tokens": 350481604.0, + "step": 9188 + }, + { + "epoch": 1.1689352499681975, + "ewc_loss": 0.0075791641138494015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.579164230264723e-05, + "grad_norm": 3.7475175857543945, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8929310441017151, + "num_tokens": 350518225.0, + "step": 9189 + }, + { + "epoch": 1.169062460246788, + "ewc_loss": 0.007610580883920193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.610581087647006e-05, + "grad_norm": 3.8145432472229004, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8722118735313416, + "num_tokens": 350553752.0, + "step": 9190 + }, + { + "epoch": 1.1691896705253784, + "ewc_loss": 0.007609482388943434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.609482418047264e-05, + "grad_norm": 3.7261924743652344, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8793760538101196, + "num_tokens": 350593676.0, + "step": 9191 + }, + { + "epoch": 1.169316880803969, + "ewc_loss": 0.0075313919223845005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.531391747761518e-05, + "grad_norm": 3.7500200271606445, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8733388185501099, + "num_tokens": 350633618.0, + "step": 9192 + }, + { + "epoch": 1.1694440910825594, + "ewc_loss": 0.007582123391330242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.582123362226412e-05, + "grad_norm": 3.713592767715454, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8716094493865967, + "num_tokens": 350678221.0, + "step": 9193 + }, + { + "epoch": 1.16957130136115, + "ewc_loss": 0.0075425743125379086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.542574167018756e-05, + "grad_norm": 
3.740273952484131, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8832169771194458, + "num_tokens": 350718258.0, + "step": 9194 + }, + { + "epoch": 1.1696985116397405, + "ewc_loss": 0.00755902286618948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.559022924397141e-05, + "grad_norm": 3.683340311050415, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8845881223678589, + "num_tokens": 350757370.0, + "step": 9195 + }, + { + "epoch": 1.169825721918331, + "ewc_loss": 0.007527154870331287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.527154957642779e-05, + "grad_norm": 3.757575035095215, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8751696944236755, + "num_tokens": 350801394.0, + "step": 9196 + }, + { + "epoch": 1.1699529321969215, + "ewc_loss": 0.007579371798783541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.579371595056728e-05, + "grad_norm": 3.794968843460083, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8656127452850342, + "num_tokens": 350840635.0, + "step": 9197 + }, + { + "epoch": 1.170080142475512, + "ewc_loss": 0.007538009434938431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.538009231211618e-05, + "grad_norm": 3.712150812149048, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8900405168533325, + "num_tokens": 350879616.0, + "step": 9198 + }, + { + "epoch": 1.1702073527541026, + "ewc_loss": 0.0074951318092644215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.495132012991235e-05, + "grad_norm": 3.7656443119049072, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8739596009254456, + "num_tokens": 350921710.0, + "step": 9199 + }, + { + "epoch": 1.1703345630326931, + "ewc_loss": 0.007533167954534292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.53316780901514e-05, + "grad_norm": 3.7439124584198, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8626564741134644, + "num_tokens": 
350964218.0, + "step": 9200 + }, + { + "epoch": 1.1704617733112836, + "ewc_loss": 0.007502955850213766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.502955850213766e-05, + "grad_norm": 3.7505252361297607, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8830315470695496, + "num_tokens": 351004511.0, + "step": 9201 + }, + { + "epoch": 1.170588983589874, + "ewc_loss": 0.007504650391638279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.50465042074211e-05, + "grad_norm": 3.772247791290283, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.868669331073761, + "num_tokens": 351044583.0, + "step": 9202 + }, + { + "epoch": 1.1707161938684645, + "ewc_loss": 0.007518030237406492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.518030179198831e-05, + "grad_norm": 3.7401962280273438, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8560870289802551, + "num_tokens": 351083613.0, + "step": 9203 + }, + { + "epoch": 1.170843404147055, + "ewc_loss": 0.007486904505640268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.486904360121116e-05, + "grad_norm": 3.806830406188965, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8671997785568237, + "num_tokens": 351117412.0, + "step": 9204 + }, + { + "epoch": 1.1709706144256455, + "ewc_loss": 0.00754028232768178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.540282240370288e-05, + "grad_norm": 3.7757632732391357, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8602770566940308, + "num_tokens": 351154779.0, + "step": 9205 + }, + { + "epoch": 1.171097824704236, + "ewc_loss": 0.007495377212762833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.495377212762833e-05, + "grad_norm": 3.7315168380737305, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8849108219146729, + "num_tokens": 351191412.0, + "step": 9206 + }, + { + "epoch": 1.1712250349828266, + "ewc_loss": 0.007504396606236696, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 7.504396489821374e-05, + "grad_norm": 3.7042572498321533, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8761358261108398, + "num_tokens": 351235465.0, + "step": 9207 + }, + { + "epoch": 1.1713522452614171, + "ewc_loss": 0.007493511773645878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.493511657230556e-05, + "grad_norm": 3.690678358078003, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8794362545013428, + "num_tokens": 351279326.0, + "step": 9208 + }, + { + "epoch": 1.1714794555400077, + "ewc_loss": 0.007491745986044407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.491745782317594e-05, + "grad_norm": 3.721980094909668, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8909952044487, + "num_tokens": 351316678.0, + "step": 9209 + }, + { + "epoch": 1.1716066658185982, + "ewc_loss": 0.00751262903213501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.512629235861823e-05, + "grad_norm": 3.786789655685425, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8706966638565063, + "num_tokens": 351352641.0, + "step": 9210 + }, + { + "epoch": 1.1717338760971887, + "ewc_loss": 0.007541591301560402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.54159118514508e-05, + "grad_norm": 3.868912696838379, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8656505346298218, + "num_tokens": 351385845.0, + "step": 9211 + }, + { + "epoch": 1.1718610863757792, + "ewc_loss": 0.007571344263851643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.571344031020999e-05, + "grad_norm": 3.7329347133636475, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8805989027023315, + "num_tokens": 351425220.0, + "step": 9212 + }, + { + "epoch": 1.1719882966543698, + "ewc_loss": 0.007475908845663071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.475908932974562e-05, + "grad_norm": 3.7445051670074463, + "learning_rate": 1e-06, + "loss": 
0.3901, + "mean_token_accuracy": 0.8687946796417236, + "num_tokens": 351467515.0, + "step": 9213 + }, + { + "epoch": 1.1721155069329603, + "ewc_loss": 0.007540361024439335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.540360820712522e-05, + "grad_norm": 3.7762744426727295, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8718992471694946, + "num_tokens": 351507571.0, + "step": 9214 + }, + { + "epoch": 1.1722427172115506, + "ewc_loss": 0.0075515201315283775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.551519956905395e-05, + "grad_norm": 3.716071844100952, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8724719285964966, + "num_tokens": 351546150.0, + "step": 9215 + }, + { + "epoch": 1.1723699274901411, + "ewc_loss": 0.007510303054004908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.510303112212569e-05, + "grad_norm": 3.7477033138275146, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8678216934204102, + "num_tokens": 351588574.0, + "step": 9216 + }, + { + "epoch": 1.1724971377687317, + "ewc_loss": 0.007567274384200573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.567274587927386e-05, + "grad_norm": 3.743333578109741, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8750009536743164, + "num_tokens": 351626914.0, + "step": 9217 + }, + { + "epoch": 1.1726243480473222, + "ewc_loss": 0.007552486378699541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.552486204076558e-05, + "grad_norm": 3.747650623321533, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8641461133956909, + "num_tokens": 351663574.0, + "step": 9218 + }, + { + "epoch": 1.1727515583259127, + "ewc_loss": 0.007569277659058571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.569277659058571e-05, + "grad_norm": 3.7434797286987305, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8570569753646851, + "num_tokens": 351703275.0, + "step": 9219 + }, + { + "epoch": 
1.1728787686045032, + "ewc_loss": 0.007572466507554054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.572466711280867e-05, + "grad_norm": 3.736079454421997, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8630729913711548, + "num_tokens": 351745713.0, + "step": 9220 + }, + { + "epoch": 1.1730059788830938, + "ewc_loss": 0.007574216462671757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.574216579087079e-05, + "grad_norm": 3.794152021408081, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8585537075996399, + "num_tokens": 351780789.0, + "step": 9221 + }, + { + "epoch": 1.1731331891616843, + "ewc_loss": 0.007619775831699371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.61977571528405e-05, + "grad_norm": 3.7123501300811768, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8827210664749146, + "num_tokens": 351820352.0, + "step": 9222 + }, + { + "epoch": 1.1732603994402748, + "ewc_loss": 0.007570118643343449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57011875975877e-05, + "grad_norm": 3.739689588546753, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8810849189758301, + "num_tokens": 351860011.0, + "step": 9223 + }, + { + "epoch": 1.1733876097188654, + "ewc_loss": 0.007605721242725849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.605721475556493e-05, + "grad_norm": 3.7632107734680176, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8683016896247864, + "num_tokens": 351898742.0, + "step": 9224 + }, + { + "epoch": 1.1735148199974559, + "ewc_loss": 0.007606569677591324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.606569852214307e-05, + "grad_norm": 3.7491414546966553, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8624265193939209, + "num_tokens": 351940201.0, + "step": 9225 + }, + { + "epoch": 1.1736420302760462, + "ewc_loss": 0.007580618839710951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.580618694191799e-05, + "grad_norm": 3.8403966426849365, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8729102611541748, + "num_tokens": 351971041.0, + "step": 9226 + }, + { + "epoch": 1.1737692405546367, + "ewc_loss": 0.007643606513738632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.643606659257784e-05, + "grad_norm": 3.731205463409424, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8562996983528137, + "num_tokens": 352013002.0, + "step": 9227 + }, + { + "epoch": 1.1738964508332272, + "ewc_loss": 0.007543346844613552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.543346873717383e-05, + "grad_norm": 3.7594988346099854, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.872717559337616, + "num_tokens": 352051435.0, + "step": 9228 + }, + { + "epoch": 1.1740236611118178, + "ewc_loss": 0.007611131761223078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6111318776384e-05, + "grad_norm": 3.7412946224212646, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8793568015098572, + "num_tokens": 352093295.0, + "step": 9229 + }, + { + "epoch": 1.1741508713904083, + "ewc_loss": 0.0075757866725325584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57578673074022e-05, + "grad_norm": 3.732513427734375, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8698314428329468, + "num_tokens": 352136401.0, + "step": 9230 + }, + { + "epoch": 1.1742780816689988, + "ewc_loss": 0.0075745475478470325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.574547635158524e-05, + "grad_norm": 3.6984877586364746, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8845977783203125, + "num_tokens": 352178696.0, + "step": 9231 + }, + { + "epoch": 1.1744052919475894, + "ewc_loss": 0.007552896626293659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.552896568085998e-05, + "grad_norm": 3.8145861625671387, + "learning_rate": 1e-06, + "loss": 0.4409, + 
"mean_token_accuracy": 0.8545318841934204, + "num_tokens": 352214759.0, + "step": 9232 + }, + { + "epoch": 1.1745325022261799, + "ewc_loss": 0.00762953283265233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62953277444467e-05, + "grad_norm": 3.7564947605133057, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8871479034423828, + "num_tokens": 352250520.0, + "step": 9233 + }, + { + "epoch": 1.1746597125047704, + "ewc_loss": 0.007550786715000868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.550786540377885e-05, + "grad_norm": 3.7765297889709473, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8723431825637817, + "num_tokens": 352288773.0, + "step": 9234 + }, + { + "epoch": 1.174786922783361, + "ewc_loss": 0.007591745816171169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.591745816171169e-05, + "grad_norm": 3.7558350563049316, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8815658688545227, + "num_tokens": 352326548.0, + "step": 9235 + }, + { + "epoch": 1.1749141330619515, + "ewc_loss": 0.007582937367260456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.582937541883439e-05, + "grad_norm": 3.7369539737701416, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8814406991004944, + "num_tokens": 352366537.0, + "step": 9236 + }, + { + "epoch": 1.175041343340542, + "ewc_loss": 0.007554450538009405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.554450712632388e-05, + "grad_norm": 3.7773001194000244, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8569250106811523, + "num_tokens": 352402390.0, + "step": 9237 + }, + { + "epoch": 1.1751685536191325, + "ewc_loss": 0.007602515164762735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.602514961035922e-05, + "grad_norm": 3.7414164543151855, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8717578649520874, + "num_tokens": 352442357.0, + "step": 9238 + }, + { + "epoch": 
1.175295763897723, + "ewc_loss": 0.007573163136839867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.573163020424545e-05, + "grad_norm": 3.760681390762329, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8720475435256958, + "num_tokens": 352476884.0, + "step": 9239 + }, + { + "epoch": 1.1754229741763134, + "ewc_loss": 0.0075855678878724575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.585567800560966e-05, + "grad_norm": 3.7436211109161377, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8740835189819336, + "num_tokens": 352513448.0, + "step": 9240 + }, + { + "epoch": 1.175550184454904, + "ewc_loss": 0.007574908435344696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.574908522656187e-05, + "grad_norm": 3.6998260021209717, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8935867547988892, + "num_tokens": 352548420.0, + "step": 9241 + }, + { + "epoch": 1.1756773947334944, + "ewc_loss": 0.007562872022390366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.562871905975044e-05, + "grad_norm": 3.7194101810455322, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8774075508117676, + "num_tokens": 352592157.0, + "step": 9242 + }, + { + "epoch": 1.175804605012085, + "ewc_loss": 0.007599336095154285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.599336095154285e-05, + "grad_norm": 3.7896037101745605, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8653846979141235, + "num_tokens": 352626427.0, + "step": 9243 + }, + { + "epoch": 1.1759318152906755, + "ewc_loss": 0.007624408230185509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.624408317497e-05, + "grad_norm": 3.702230930328369, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8827416896820068, + "num_tokens": 352667075.0, + "step": 9244 + }, + { + "epoch": 1.176059025569266, + "ewc_loss": 0.007562767714262009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.562767859781161e-05, + "grad_norm": 3.7188353538513184, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8705514073371887, + "num_tokens": 352707114.0, + "step": 9245 + }, + { + "epoch": 1.1761862358478565, + "ewc_loss": 0.007615912239998579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.615912181790918e-05, + "grad_norm": 3.8478007316589355, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8783303499221802, + "num_tokens": 352738064.0, + "step": 9246 + }, + { + "epoch": 1.176313446126447, + "ewc_loss": 0.007679566740989685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.679566624574363e-05, + "grad_norm": 3.7695817947387695, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8582154512405396, + "num_tokens": 352779964.0, + "step": 9247 + }, + { + "epoch": 1.1764406564050376, + "ewc_loss": 0.007579118479043245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.579118391731754e-05, + "grad_norm": 3.7222049236297607, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8805168867111206, + "num_tokens": 352819574.0, + "step": 9248 + }, + { + "epoch": 1.1765678666836281, + "ewc_loss": 0.007599299773573875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.599299715366215e-05, + "grad_norm": 3.748444080352783, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8828014731407166, + "num_tokens": 352859151.0, + "step": 9249 + }, + { + "epoch": 1.1766950769622184, + "ewc_loss": 0.0076095908880233765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.609590829815716e-05, + "grad_norm": 3.7755661010742188, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.852637767791748, + "num_tokens": 352901119.0, + "step": 9250 + }, + { + "epoch": 1.176822287240809, + "ewc_loss": 0.007617940194904804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.617939991177991e-05, + "grad_norm": 3.692990779876709, + "learning_rate": 1e-06, + "loss": 0.3783, + 
"mean_token_accuracy": 0.8732966184616089, + "num_tokens": 352945878.0, + "step": 9251 + }, + { + "epoch": 1.1769494975193995, + "ewc_loss": 0.0075608366169035435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.560836820630357e-05, + "grad_norm": 3.7837984561920166, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.867576003074646, + "num_tokens": 352979819.0, + "step": 9252 + }, + { + "epoch": 1.17707670779799, + "ewc_loss": 0.00764362933114171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.643629214726388e-05, + "grad_norm": 3.7216975688934326, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8628460764884949, + "num_tokens": 353023552.0, + "step": 9253 + }, + { + "epoch": 1.1772039180765805, + "ewc_loss": 0.007564937695860863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.56493755034171e-05, + "grad_norm": 3.7890000343322754, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8735102415084839, + "num_tokens": 353058178.0, + "step": 9254 + }, + { + "epoch": 1.177331128355171, + "ewc_loss": 0.0076338076032698154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.633807399543002e-05, + "grad_norm": 3.7718608379364014, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8818966150283813, + "num_tokens": 353091428.0, + "step": 9255 + }, + { + "epoch": 1.1774583386337616, + "ewc_loss": 0.007601612713187933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.601612742291763e-05, + "grad_norm": 3.7414071559906006, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8754881620407104, + "num_tokens": 353131943.0, + "step": 9256 + }, + { + "epoch": 1.1775855489123521, + "ewc_loss": 0.007584186736494303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.584186823805794e-05, + "grad_norm": 3.749068021774292, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8663665056228638, + "num_tokens": 353171213.0, + "step": 9257 + }, + { + "epoch": 
1.1777127591909426, + "ewc_loss": 0.007604075130075216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.604074926348403e-05, + "grad_norm": 3.783689498901367, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8798004984855652, + "num_tokens": 353205057.0, + "step": 9258 + }, + { + "epoch": 1.1778399694695332, + "ewc_loss": 0.0076077221892774105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.607722363900393e-05, + "grad_norm": 3.7079806327819824, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8587936162948608, + "num_tokens": 353251164.0, + "step": 9259 + }, + { + "epoch": 1.1779671797481237, + "ewc_loss": 0.0075669181533157825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566918066004291e-05, + "grad_norm": 3.754931926727295, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8775188326835632, + "num_tokens": 353288202.0, + "step": 9260 + }, + { + "epoch": 1.1780943900267142, + "ewc_loss": 0.007630183827131987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.630183972651139e-05, + "grad_norm": 3.7638442516326904, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.873044490814209, + "num_tokens": 353327396.0, + "step": 9261 + }, + { + "epoch": 1.1782216003053048, + "ewc_loss": 0.007593879941850901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.593879854539409e-05, + "grad_norm": 3.755655288696289, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8619425892829895, + "num_tokens": 353366512.0, + "step": 9262 + }, + { + "epoch": 1.1783488105838953, + "ewc_loss": 0.0076002865098416805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.600286335218698e-05, + "grad_norm": 3.7234723567962646, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8656121492385864, + "num_tokens": 353408546.0, + "step": 9263 + }, + { + "epoch": 1.1784760208624856, + "ewc_loss": 0.0075894370675086975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.589437154820189e-05, + "grad_norm": 3.7606003284454346, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.879980206489563, + "num_tokens": 353444950.0, + "step": 9264 + }, + { + "epoch": 1.1786032311410761, + "ewc_loss": 0.007612558547407389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.612558692926541e-05, + "grad_norm": 3.7250263690948486, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8795701265335083, + "num_tokens": 353483306.0, + "step": 9265 + }, + { + "epoch": 1.1787304414196667, + "ewc_loss": 0.007582300808280706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.582300895592198e-05, + "grad_norm": 3.7350685596466064, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8714206218719482, + "num_tokens": 353523038.0, + "step": 9266 + }, + { + "epoch": 1.1788576516982572, + "ewc_loss": 0.0075979470275342464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.597947114845738e-05, + "grad_norm": 3.702981472015381, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8821675181388855, + "num_tokens": 353564266.0, + "step": 9267 + }, + { + "epoch": 1.1789848619768477, + "ewc_loss": 0.007572689559310675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.572689355583861e-05, + "grad_norm": 3.71545672416687, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8759339451789856, + "num_tokens": 353603558.0, + "step": 9268 + }, + { + "epoch": 1.1791120722554382, + "ewc_loss": 0.007572132628411055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.572132744826376e-05, + "grad_norm": 3.7208428382873535, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.879716157913208, + "num_tokens": 353648097.0, + "step": 9269 + }, + { + "epoch": 1.1792392825340288, + "ewc_loss": 0.007573836948722601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.573836774099618e-05, + "grad_norm": 3.73335599899292, + "learning_rate": 1e-06, + "loss": 0.423, + 
"mean_token_accuracy": 0.8583569526672363, + "num_tokens": 353688970.0, + "step": 9270 + }, + { + "epoch": 1.1793664928126193, + "ewc_loss": 0.007560758851468563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.560758967883885e-05, + "grad_norm": 3.7520010471343994, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8612741827964783, + "num_tokens": 353732505.0, + "step": 9271 + }, + { + "epoch": 1.1794937030912098, + "ewc_loss": 0.007567846681922674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.567846478195861e-05, + "grad_norm": 3.7373671531677246, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8816932439804077, + "num_tokens": 353767460.0, + "step": 9272 + }, + { + "epoch": 1.1796209133698004, + "ewc_loss": 0.007554598618298769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.554598414571956e-05, + "grad_norm": 3.780938148498535, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8826528191566467, + "num_tokens": 353801951.0, + "step": 9273 + }, + { + "epoch": 1.1797481236483909, + "ewc_loss": 0.007565676234662533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.56567606003955e-05, + "grad_norm": 3.7624118328094482, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8802015781402588, + "num_tokens": 353836815.0, + "step": 9274 + }, + { + "epoch": 1.1798753339269812, + "ewc_loss": 0.007555780932307243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.55578075768426e-05, + "grad_norm": 3.7546586990356445, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8652362823486328, + "num_tokens": 353876296.0, + "step": 9275 + }, + { + "epoch": 1.1800025442055717, + "ewc_loss": 0.007550416048616171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.550416194135323e-05, + "grad_norm": 3.7906911373138428, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8581625819206238, + "num_tokens": 353911792.0, + "step": 9276 + }, + { + "epoch": 
1.1801297544841622, + "ewc_loss": 0.007586718071252108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.586718129459769e-05, + "grad_norm": 3.7661025524139404, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8625646233558655, + "num_tokens": 353949532.0, + "step": 9277 + }, + { + "epoch": 1.1802569647627528, + "ewc_loss": 0.007545358035713434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.545357948401943e-05, + "grad_norm": 3.7614238262176514, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.860727071762085, + "num_tokens": 353988622.0, + "step": 9278 + }, + { + "epoch": 1.1803841750413433, + "ewc_loss": 0.007576967589557171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.576967618661001e-05, + "grad_norm": 3.722966194152832, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8770620822906494, + "num_tokens": 354028136.0, + "step": 9279 + }, + { + "epoch": 1.1805113853199338, + "ewc_loss": 0.007551864720880985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.551864837296307e-05, + "grad_norm": 3.7461414337158203, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8735635280609131, + "num_tokens": 354065029.0, + "step": 9280 + }, + { + "epoch": 1.1806385955985244, + "ewc_loss": 0.0075897048227488995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.589704910060391e-05, + "grad_norm": 3.763636827468872, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8824288249015808, + "num_tokens": 354099279.0, + "step": 9281 + }, + { + "epoch": 1.1807658058771149, + "ewc_loss": 0.0075749680399894714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.574968185508624e-05, + "grad_norm": 3.7423858642578125, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8678159117698669, + "num_tokens": 354138870.0, + "step": 9282 + }, + { + "epoch": 1.1808930161557054, + "ewc_loss": 0.007569689303636551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.569689478259534e-05, + "grad_norm": 3.7523858547210693, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8602185845375061, + "num_tokens": 354176498.0, + "step": 9283 + }, + { + "epoch": 1.181020226434296, + "ewc_loss": 0.007588176988065243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.588176958961412e-05, + "grad_norm": 3.8378920555114746, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8542412519454956, + "num_tokens": 354210063.0, + "step": 9284 + }, + { + "epoch": 1.1811474367128865, + "ewc_loss": 0.0076347533613443375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.634753274032846e-05, + "grad_norm": 3.7871580123901367, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8662617802619934, + "num_tokens": 354248095.0, + "step": 9285 + }, + { + "epoch": 1.181274646991477, + "ewc_loss": 0.007589565124362707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.589565211674199e-05, + "grad_norm": 3.7268946170806885, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8688336610794067, + "num_tokens": 354286897.0, + "step": 9286 + }, + { + "epoch": 1.1814018572700675, + "ewc_loss": 0.007575375027954578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.575374911539257e-05, + "grad_norm": 3.6999430656433105, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8811340928077698, + "num_tokens": 354330685.0, + "step": 9287 + }, + { + "epoch": 1.181529067548658, + "ewc_loss": 0.007585102692246437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58510286686942e-05, + "grad_norm": 3.7372806072235107, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8642404675483704, + "num_tokens": 354371120.0, + "step": 9288 + }, + { + "epoch": 1.1816562778272484, + "ewc_loss": 0.007614306174218655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.614306377945468e-05, + "grad_norm": 3.7866272926330566, + "learning_rate": 1e-06, + "loss": 0.3868, + 
"mean_token_accuracy": 0.8707290887832642, + "num_tokens": 354410825.0, + "step": 9289 + }, + { + "epoch": 1.1817834881058389, + "ewc_loss": 0.007626892998814583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.626893057022244e-05, + "grad_norm": 3.706900119781494, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8688217401504517, + "num_tokens": 354453623.0, + "step": 9290 + }, + { + "epoch": 1.1819106983844294, + "ewc_loss": 0.0075592417269945145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.559241930721328e-05, + "grad_norm": 3.7848403453826904, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8731130361557007, + "num_tokens": 354491854.0, + "step": 9291 + }, + { + "epoch": 1.18203790866302, + "ewc_loss": 0.007641161326318979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.641161209903657e-05, + "grad_norm": 3.7571051120758057, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8597807884216309, + "num_tokens": 354530788.0, + "step": 9292 + }, + { + "epoch": 1.1821651189416105, + "ewc_loss": 0.007579314988106489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.579314842587337e-05, + "grad_norm": 3.739434242248535, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8914459943771362, + "num_tokens": 354568576.0, + "step": 9293 + }, + { + "epoch": 1.182292329220201, + "ewc_loss": 0.007587061729282141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.587061554659158e-05, + "grad_norm": 3.721637010574341, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8732403516769409, + "num_tokens": 354611307.0, + "step": 9294 + }, + { + "epoch": 1.1824195394987915, + "ewc_loss": 0.007593723479658365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.593723421450704e-05, + "grad_norm": 3.8044052124023438, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8799751996994019, + "num_tokens": 354648810.0, + "step": 9295 + }, + { + "epoch": 
1.182546749777382, + "ewc_loss": 0.007626577280461788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.626577280461788e-05, + "grad_norm": 3.7561755180358887, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8744246959686279, + "num_tokens": 354688411.0, + "step": 9296 + }, + { + "epoch": 1.1826739600559726, + "ewc_loss": 0.007560909725725651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.560909580206499e-05, + "grad_norm": 3.733241558074951, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8711231350898743, + "num_tokens": 354734700.0, + "step": 9297 + }, + { + "epoch": 1.1828011703345631, + "ewc_loss": 0.007573156617581844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.573156472062692e-05, + "grad_norm": 3.826153516769409, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8716827034950256, + "num_tokens": 354765517.0, + "step": 9298 + }, + { + "epoch": 1.1829283806131534, + "ewc_loss": 0.007637951988726854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.637951785000041e-05, + "grad_norm": 3.7046422958374023, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8766152858734131, + "num_tokens": 354808563.0, + "step": 9299 + }, + { + "epoch": 1.183055590891744, + "ewc_loss": 0.007525274064391851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.525274122599512e-05, + "grad_norm": 3.782461166381836, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8558852076530457, + "num_tokens": 354846618.0, + "step": 9300 + }, + { + "epoch": 1.1831828011703345, + "ewc_loss": 0.007622280158102512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.622280099894851e-05, + "grad_norm": 3.802720546722412, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8745663166046143, + "num_tokens": 354881293.0, + "step": 9301 + }, + { + "epoch": 1.183310011448925, + "ewc_loss": 0.007589197717607021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.589197775814682e-05, + "grad_norm": 3.717925786972046, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8715823888778687, + "num_tokens": 354925239.0, + "step": 9302 + }, + { + "epoch": 1.1834372217275155, + "ewc_loss": 0.007518713362514973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.518713391618803e-05, + "grad_norm": 3.7015345096588135, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8597327470779419, + "num_tokens": 354967836.0, + "step": 9303 + }, + { + "epoch": 1.183564432006106, + "ewc_loss": 0.00755231361836195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.552313763881102e-05, + "grad_norm": 3.8733224868774414, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8583523631095886, + "num_tokens": 355003386.0, + "step": 9304 + }, + { + "epoch": 1.1836916422846966, + "ewc_loss": 0.007665085140615702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.665085286134854e-05, + "grad_norm": 3.781257390975952, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8520321846008301, + "num_tokens": 355043391.0, + "step": 9305 + }, + { + "epoch": 1.1838188525632871, + "ewc_loss": 0.007534884847700596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.534884935012087e-05, + "grad_norm": 3.724379539489746, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8741016387939453, + "num_tokens": 355082820.0, + "step": 9306 + }, + { + "epoch": 1.1839460628418776, + "ewc_loss": 0.007569428067654371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.569428271381184e-05, + "grad_norm": 3.7467360496520996, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8939083814620972, + "num_tokens": 355117531.0, + "step": 9307 + }, + { + "epoch": 1.1840732731204682, + "ewc_loss": 0.007582321763038635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.582321995869279e-05, + "grad_norm": 3.7367305755615234, + "learning_rate": 1e-06, + "loss": 0.4106, + 
"mean_token_accuracy": 0.8617329597473145, + "num_tokens": 355159485.0, + "step": 9308 + }, + { + "epoch": 1.1842004833990587, + "ewc_loss": 0.0075797224417328835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.579722296213731e-05, + "grad_norm": 3.7504568099975586, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8675935864448547, + "num_tokens": 355206051.0, + "step": 9309 + }, + { + "epoch": 1.1843276936776492, + "ewc_loss": 0.007578227203339338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.578227086924016e-05, + "grad_norm": 3.815399646759033, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8646392226219177, + "num_tokens": 355242933.0, + "step": 9310 + }, + { + "epoch": 1.1844549039562398, + "ewc_loss": 0.007623040582984686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.623040437465534e-05, + "grad_norm": 3.7469687461853027, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8856245279312134, + "num_tokens": 355286311.0, + "step": 9311 + }, + { + "epoch": 1.1845821142348303, + "ewc_loss": 0.007565830834209919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.565831037936732e-05, + "grad_norm": 3.743898868560791, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8739372491836548, + "num_tokens": 355327844.0, + "step": 9312 + }, + { + "epoch": 1.1847093245134206, + "ewc_loss": 0.007577230222523212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.577230280730873e-05, + "grad_norm": 3.729278087615967, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8803337812423706, + "num_tokens": 355368977.0, + "step": 9313 + }, + { + "epoch": 1.1848365347920111, + "ewc_loss": 0.007569096516817808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.569096487713978e-05, + "grad_norm": 3.7319388389587402, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8628877401351929, + "num_tokens": 355411479.0, + "step": 9314 + }, + { + "epoch": 
1.1849637450706016, + "ewc_loss": 0.007571614813059568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.571614696644247e-05, + "grad_norm": 3.7910728454589844, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8876357078552246, + "num_tokens": 355447370.0, + "step": 9315 + }, + { + "epoch": 1.1850909553491922, + "ewc_loss": 0.007578281685709953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.578281656606123e-05, + "grad_norm": 3.694766044616699, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8700523376464844, + "num_tokens": 355490335.0, + "step": 9316 + }, + { + "epoch": 1.1852181656277827, + "ewc_loss": 0.007523433305323124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.523433305323124e-05, + "grad_norm": 3.7872495651245117, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8726626038551331, + "num_tokens": 355527843.0, + "step": 9317 + }, + { + "epoch": 1.1853453759063732, + "ewc_loss": 0.00759193766862154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.591937901452184e-05, + "grad_norm": 3.7638890743255615, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8705897331237793, + "num_tokens": 355566094.0, + "step": 9318 + }, + { + "epoch": 1.1854725861849638, + "ewc_loss": 0.007556668482720852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.556668424513191e-05, + "grad_norm": 3.7173376083374023, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8800875544548035, + "num_tokens": 355609505.0, + "step": 9319 + }, + { + "epoch": 1.1855997964635543, + "ewc_loss": 0.007534859701991081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.534859469160438e-05, + "grad_norm": 3.767204999923706, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8503949046134949, + "num_tokens": 355650930.0, + "step": 9320 + }, + { + "epoch": 1.1857270067421448, + "ewc_loss": 0.007565181236714125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.565181294921786e-05, + "grad_norm": 3.8381104469299316, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8822646141052246, + "num_tokens": 355684075.0, + "step": 9321 + }, + { + "epoch": 1.1858542170207353, + "ewc_loss": 0.007583806291222572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.583806291222572e-05, + "grad_norm": 3.7564444541931152, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8769149780273438, + "num_tokens": 355723567.0, + "step": 9322 + }, + { + "epoch": 1.1859814272993259, + "ewc_loss": 0.0075212870724499226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.52128689782694e-05, + "grad_norm": 3.771027088165283, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8599114418029785, + "num_tokens": 355764915.0, + "step": 9323 + }, + { + "epoch": 1.1861086375779162, + "ewc_loss": 0.007563186343759298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.563186227343976e-05, + "grad_norm": 3.7310376167297363, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8740646839141846, + "num_tokens": 355804950.0, + "step": 9324 + }, + { + "epoch": 1.1862358478565067, + "ewc_loss": 0.007517769932746887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.517769699916244e-05, + "grad_norm": 3.7599222660064697, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8800775408744812, + "num_tokens": 355844155.0, + "step": 9325 + }, + { + "epoch": 1.1863630581350972, + "ewc_loss": 0.007541648577898741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.541648665210232e-05, + "grad_norm": 3.670161247253418, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8876000046730042, + "num_tokens": 355883971.0, + "step": 9326 + }, + { + "epoch": 1.1864902684136878, + "ewc_loss": 0.007487199734896421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.487199764000252e-05, + "grad_norm": 3.7444562911987305, + "learning_rate": 1e-06, + "loss": 0.3879, + 
"mean_token_accuracy": 0.869549036026001, + "num_tokens": 355922271.0, + "step": 9327 + }, + { + "epoch": 1.1866174786922783, + "ewc_loss": 0.007550789508968592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.550789450760931e-05, + "grad_norm": 3.7501862049102783, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.871648371219635, + "num_tokens": 355961087.0, + "step": 9328 + }, + { + "epoch": 1.1867446889708688, + "ewc_loss": 0.007532879710197449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.532879681093618e-05, + "grad_norm": 3.735330581665039, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8763771057128906, + "num_tokens": 355999553.0, + "step": 9329 + }, + { + "epoch": 1.1868718992494594, + "ewc_loss": 0.007537753786891699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.53775384509936e-05, + "grad_norm": 3.790203332901001, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8773338794708252, + "num_tokens": 356038503.0, + "step": 9330 + }, + { + "epoch": 1.1869991095280499, + "ewc_loss": 0.00755865965038538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.558659854112193e-05, + "grad_norm": 3.7345023155212402, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8827964663505554, + "num_tokens": 356078538.0, + "step": 9331 + }, + { + "epoch": 1.1871263198066404, + "ewc_loss": 0.007507358677685261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.507358532166108e-05, + "grad_norm": 3.7825257778167725, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8854479789733887, + "num_tokens": 356115031.0, + "step": 9332 + }, + { + "epoch": 1.187253530085231, + "ewc_loss": 0.007563984952867031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.563985127490014e-05, + "grad_norm": 3.75654935836792, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8860946893692017, + "num_tokens": 356154751.0, + "step": 9333 + }, + { + "epoch": 
1.1873807403638215, + "ewc_loss": 0.007519015111029148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.519015343859792e-05, + "grad_norm": 3.7820074558258057, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8568369150161743, + "num_tokens": 356192105.0, + "step": 9334 + }, + { + "epoch": 1.187507950642412, + "ewc_loss": 0.007569043897092342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.569044100819156e-05, + "grad_norm": 3.8285813331604004, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8636298179626465, + "num_tokens": 356224964.0, + "step": 9335 + }, + { + "epoch": 1.1876351609210025, + "ewc_loss": 0.007581030018627644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.581029785797e-05, + "grad_norm": 3.7948789596557617, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8774282932281494, + "num_tokens": 356261565.0, + "step": 9336 + }, + { + "epoch": 1.187762371199593, + "ewc_loss": 0.0075622121803462505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.562211976619437e-05, + "grad_norm": 3.7500646114349365, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8752152919769287, + "num_tokens": 356301886.0, + "step": 9337 + }, + { + "epoch": 1.1878895814781834, + "ewc_loss": 0.00755445659160614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.554456533398479e-05, + "grad_norm": 3.7282044887542725, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8819513916969299, + "num_tokens": 356339481.0, + "step": 9338 + }, + { + "epoch": 1.1880167917567739, + "ewc_loss": 0.007586544845253229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.586544961668551e-05, + "grad_norm": 3.762538433074951, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8492552042007446, + "num_tokens": 356387038.0, + "step": 9339 + }, + { + "epoch": 1.1881440020353644, + "ewc_loss": 0.00758373411372304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.583734259242192e-05, + "grad_norm": 3.735588550567627, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8703984022140503, + "num_tokens": 356433256.0, + "step": 9340 + }, + { + "epoch": 1.188271212313955, + "ewc_loss": 0.007557853125035763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.55785295041278e-05, + "grad_norm": 3.744009017944336, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.862967848777771, + "num_tokens": 356471282.0, + "step": 9341 + }, + { + "epoch": 1.1883984225925455, + "ewc_loss": 0.007595970295369625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.595970237161964e-05, + "grad_norm": 3.7653331756591797, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8776013255119324, + "num_tokens": 356508951.0, + "step": 9342 + }, + { + "epoch": 1.188525632871136, + "ewc_loss": 0.00758874136954546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.588741573272273e-05, + "grad_norm": 3.799518585205078, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8748090267181396, + "num_tokens": 356544487.0, + "step": 9343 + }, + { + "epoch": 1.1886528431497265, + "ewc_loss": 0.007606698665767908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.606698636664078e-05, + "grad_norm": 3.7678346633911133, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8780888319015503, + "num_tokens": 356580873.0, + "step": 9344 + }, + { + "epoch": 1.188780053428317, + "ewc_loss": 0.007582383695989847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.582383841509e-05, + "grad_norm": 3.8157846927642822, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.871807336807251, + "num_tokens": 356613622.0, + "step": 9345 + }, + { + "epoch": 1.1889072637069076, + "ewc_loss": 0.007629875559359789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.629875472048298e-05, + "grad_norm": 3.7502529621124268, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 
0.8507112860679626, + "num_tokens": 356660058.0, + "step": 9346 + }, + { + "epoch": 1.189034473985498, + "ewc_loss": 0.007570110261440277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.570110028609633e-05, + "grad_norm": 3.6916403770446777, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8719092607498169, + "num_tokens": 356701614.0, + "step": 9347 + }, + { + "epoch": 1.1891616842640884, + "ewc_loss": 0.007582673337310553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.582673424622044e-05, + "grad_norm": 3.728214740753174, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8854036331176758, + "num_tokens": 356737105.0, + "step": 9348 + }, + { + "epoch": 1.189288894542679, + "ewc_loss": 0.007607355713844299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.607355655636638e-05, + "grad_norm": 3.762660503387451, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8650103807449341, + "num_tokens": 356779382.0, + "step": 9349 + }, + { + "epoch": 1.1894161048212695, + "ewc_loss": 0.007608434185385704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.60843395255506e-05, + "grad_norm": 3.76878023147583, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8691856265068054, + "num_tokens": 356813836.0, + "step": 9350 + }, + { + "epoch": 1.18954331509986, + "ewc_loss": 0.007604979909956455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.604980055475608e-05, + "grad_norm": 3.7759861946105957, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8633453845977783, + "num_tokens": 356852591.0, + "step": 9351 + }, + { + "epoch": 1.1896705253784505, + "ewc_loss": 0.007590301334857941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.590301538584754e-05, + "grad_norm": 3.8016514778137207, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8713424205780029, + "num_tokens": 356889940.0, + "step": 9352 + }, + { + "epoch": 1.189797735657041, + "ewc_loss": 
0.007606279570609331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6062795415055e-05, + "grad_norm": 3.7143688201904297, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8751304149627686, + "num_tokens": 356927878.0, + "step": 9353 + }, + { + "epoch": 1.1899249459356316, + "ewc_loss": 0.007555277086794376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.555277261417359e-05, + "grad_norm": 3.740731716156006, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8832952976226807, + "num_tokens": 356963339.0, + "step": 9354 + }, + { + "epoch": 1.1900521562142221, + "ewc_loss": 0.007617413066327572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.617413211846724e-05, + "grad_norm": 3.7434113025665283, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.87638258934021, + "num_tokens": 357002999.0, + "step": 9355 + }, + { + "epoch": 1.1901793664928126, + "ewc_loss": 0.00757621880620718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.576218922622502e-05, + "grad_norm": 3.721320867538452, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8763388991355896, + "num_tokens": 357044305.0, + "step": 9356 + }, + { + "epoch": 1.1903065767714032, + "ewc_loss": 0.007592333480715752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.592333713546395e-05, + "grad_norm": 3.7486069202423096, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8752994537353516, + "num_tokens": 357082197.0, + "step": 9357 + }, + { + "epoch": 1.1904337870499937, + "ewc_loss": 0.007595260161906481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.59526010369882e-05, + "grad_norm": 3.825718879699707, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8560110926628113, + "num_tokens": 357119541.0, + "step": 9358 + }, + { + "epoch": 1.1905609973285842, + "ewc_loss": 0.00764471897855401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.644719153176993e-05, + "grad_norm": 
3.6965301036834717, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8942537307739258, + "num_tokens": 357159176.0, + "step": 9359 + }, + { + "epoch": 1.1906882076071748, + "ewc_loss": 0.0075284079648554325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.528407877543941e-05, + "grad_norm": 3.747269630432129, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8718633651733398, + "num_tokens": 357199204.0, + "step": 9360 + }, + { + "epoch": 1.1908154178857653, + "ewc_loss": 0.007619010284543037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.619010284543037e-05, + "grad_norm": 3.7806804180145264, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8866031169891357, + "num_tokens": 357236900.0, + "step": 9361 + }, + { + "epoch": 1.1909426281643556, + "ewc_loss": 0.0076009351760149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.600935350637883e-05, + "grad_norm": 3.7841978073120117, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8707424402236938, + "num_tokens": 357271622.0, + "step": 9362 + }, + { + "epoch": 1.1910698384429461, + "ewc_loss": 0.007583962753415108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.583962724311277e-05, + "grad_norm": 3.7073564529418945, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8831769824028015, + "num_tokens": 357311145.0, + "step": 9363 + }, + { + "epoch": 1.1911970487215366, + "ewc_loss": 0.007537929806858301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.537929923273623e-05, + "grad_norm": 3.809708595275879, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8895106911659241, + "num_tokens": 357344609.0, + "step": 9364 + }, + { + "epoch": 1.1913242590001272, + "ewc_loss": 0.007633563131093979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.633562927367166e-05, + "grad_norm": 3.7624387741088867, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.870792031288147, + 
"num_tokens": 357378896.0, + "step": 9365 + }, + { + "epoch": 1.1914514692787177, + "ewc_loss": 0.007572953589260578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.572953472845256e-05, + "grad_norm": 3.699110507965088, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8756510019302368, + "num_tokens": 357424730.0, + "step": 9366 + }, + { + "epoch": 1.1915786795573082, + "ewc_loss": 0.007564361207187176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564361294498667e-05, + "grad_norm": 3.8053500652313232, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8554881811141968, + "num_tokens": 357461152.0, + "step": 9367 + }, + { + "epoch": 1.1917058898358988, + "ewc_loss": 0.007644131314009428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.644131255801767e-05, + "grad_norm": 3.7517683506011963, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8666538596153259, + "num_tokens": 357502305.0, + "step": 9368 + }, + { + "epoch": 1.1918331001144893, + "ewc_loss": 0.0075641050934791565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564105180790648e-05, + "grad_norm": 3.7557132244110107, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8676704168319702, + "num_tokens": 357539187.0, + "step": 9369 + }, + { + "epoch": 1.1919603103930798, + "ewc_loss": 0.007603881880640984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.603882113471627e-05, + "grad_norm": 3.859997510910034, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8628801107406616, + "num_tokens": 357569520.0, + "step": 9370 + }, + { + "epoch": 1.1920875206716703, + "ewc_loss": 0.007676411885768175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.676411769352853e-05, + "grad_norm": 3.7797086238861084, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8718515634536743, + "num_tokens": 357605253.0, + "step": 9371 + }, + { + "epoch": 1.1922147309502609, + "ewc_loss": 0.00760231539607048, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.602315599797294e-05, + "grad_norm": 3.7647900581359863, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8824607133865356, + "num_tokens": 357639701.0, + "step": 9372 + }, + { + "epoch": 1.1923419412288512, + "ewc_loss": 0.007617428433150053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.617428491357714e-05, + "grad_norm": 3.754997730255127, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8795188665390015, + "num_tokens": 357675611.0, + "step": 9373 + }, + { + "epoch": 1.1924691515074417, + "ewc_loss": 0.0076405261643230915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.640526018803939e-05, + "grad_norm": 3.779618740081787, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8839781284332275, + "num_tokens": 357708521.0, + "step": 9374 + }, + { + "epoch": 1.1925963617860322, + "ewc_loss": 0.007642925716936588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.642925629625097e-05, + "grad_norm": 3.7349226474761963, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8647850751876831, + "num_tokens": 357749970.0, + "step": 9375 + }, + { + "epoch": 1.1927235720646228, + "ewc_loss": 0.007624199613928795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.624199497513473e-05, + "grad_norm": 3.801069736480713, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8832902908325195, + "num_tokens": 357783287.0, + "step": 9376 + }, + { + "epoch": 1.1928507823432133, + "ewc_loss": 0.007685518357902765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.685518357902765e-05, + "grad_norm": 3.773346185684204, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8735865354537964, + "num_tokens": 357818883.0, + "step": 9377 + }, + { + "epoch": 1.1929779926218038, + "ewc_loss": 0.007640821393579245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.640821422683075e-05, + "grad_norm": 3.697936534881592, + 
"learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.885197639465332, + "num_tokens": 357862862.0, + "step": 9378 + }, + { + "epoch": 1.1931052029003943, + "ewc_loss": 0.007600909098982811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.600909157190472e-05, + "grad_norm": 3.7788257598876953, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8618713617324829, + "num_tokens": 357901574.0, + "step": 9379 + }, + { + "epoch": 1.1932324131789849, + "ewc_loss": 0.007688743993639946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.688743789913133e-05, + "grad_norm": 3.732440233230591, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8720433712005615, + "num_tokens": 357941822.0, + "step": 9380 + }, + { + "epoch": 1.1933596234575754, + "ewc_loss": 0.007617697585374117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.617697701789439e-05, + "grad_norm": 3.7414538860321045, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8670727014541626, + "num_tokens": 357982820.0, + "step": 9381 + }, + { + "epoch": 1.193486833736166, + "ewc_loss": 0.007641850039362907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.64185024308972e-05, + "grad_norm": 3.7861902713775635, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8707113265991211, + "num_tokens": 358020491.0, + "step": 9382 + }, + { + "epoch": 1.1936140440147565, + "ewc_loss": 0.007657337002456188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.65733711887151e-05, + "grad_norm": 3.7342920303344727, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8749383091926575, + "num_tokens": 358059775.0, + "step": 9383 + }, + { + "epoch": 1.193741254293347, + "ewc_loss": 0.00761592760682106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.615927461301908e-05, + "grad_norm": 3.7464258670806885, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8803799748420715, + "num_tokens": 358098672.0, + 
"step": 9384 + }, + { + "epoch": 1.1938684645719375, + "ewc_loss": 0.0076347291469573975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.634729263372719e-05, + "grad_norm": 3.7736361026763916, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8656638860702515, + "num_tokens": 358140850.0, + "step": 9385 + }, + { + "epoch": 1.193995674850528, + "ewc_loss": 0.007628433406352997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.628433377249166e-05, + "grad_norm": 3.768942356109619, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8787314891815186, + "num_tokens": 358175503.0, + "step": 9386 + }, + { + "epoch": 1.1941228851291183, + "ewc_loss": 0.007627539336681366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.627539162058383e-05, + "grad_norm": 3.7873117923736572, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8776247501373291, + "num_tokens": 358210615.0, + "step": 9387 + }, + { + "epoch": 1.1942500954077089, + "ewc_loss": 0.007635117042809725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.635117071913555e-05, + "grad_norm": 3.7707812786102295, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8757194876670837, + "num_tokens": 358246605.0, + "step": 9388 + }, + { + "epoch": 1.1943773056862994, + "ewc_loss": 0.0076150731183588505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.615073263878003e-05, + "grad_norm": 3.7578961849212646, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8698467016220093, + "num_tokens": 358283136.0, + "step": 9389 + }, + { + "epoch": 1.19450451596489, + "ewc_loss": 0.007616203743964434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.616203947691247e-05, + "grad_norm": 3.6700236797332764, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8809043765068054, + "num_tokens": 358325437.0, + "step": 9390 + }, + { + "epoch": 1.1946317262434805, + "ewc_loss": 0.0075807832181453705, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.580783130833879e-05, + "grad_norm": 3.747002601623535, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8726332187652588, + "num_tokens": 358365360.0, + "step": 9391 + }, + { + "epoch": 1.194758936522071, + "ewc_loss": 0.007648932747542858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.648932660231367e-05, + "grad_norm": 3.707998514175415, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8897205591201782, + "num_tokens": 358406502.0, + "step": 9392 + }, + { + "epoch": 1.1948861468006615, + "ewc_loss": 0.007588911801576614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.588911830680445e-05, + "grad_norm": 3.806928873062134, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8686608672142029, + "num_tokens": 358437220.0, + "step": 9393 + }, + { + "epoch": 1.195013357079252, + "ewc_loss": 0.007667691446840763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.667691534152254e-05, + "grad_norm": 3.8011627197265625, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8629665374755859, + "num_tokens": 358475314.0, + "step": 9394 + }, + { + "epoch": 1.1951405673578426, + "ewc_loss": 0.007630328647792339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.630328764207661e-05, + "grad_norm": 3.6825344562530518, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8827722668647766, + "num_tokens": 358520777.0, + "step": 9395 + }, + { + "epoch": 1.195267777636433, + "ewc_loss": 0.00757130840793252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.57130837882869e-05, + "grad_norm": 3.696908712387085, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.892236590385437, + "num_tokens": 358557973.0, + "step": 9396 + }, + { + "epoch": 1.1953949879150234, + "ewc_loss": 0.007633262313902378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6332624303177e-05, + "grad_norm": 3.707805871963501, + "learning_rate": 1e-06, + "loss": 0.3521, + 
"mean_token_accuracy": 0.8805559277534485, + "num_tokens": 358596795.0, + "step": 9397 + }, + { + "epoch": 1.195522198193614, + "ewc_loss": 0.007608610671013594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.608610758325085e-05, + "grad_norm": 3.77461314201355, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8735005855560303, + "num_tokens": 358632146.0, + "step": 9398 + }, + { + "epoch": 1.1956494084722045, + "ewc_loss": 0.007633145432919264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.633145287400112e-05, + "grad_norm": 3.7260937690734863, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8814015984535217, + "num_tokens": 358672194.0, + "step": 9399 + }, + { + "epoch": 1.195776618750795, + "ewc_loss": 0.0075897034257650375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.589703454868868e-05, + "grad_norm": 3.7363524436950684, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8810028433799744, + "num_tokens": 358710192.0, + "step": 9400 + }, + { + "epoch": 1.1959038290293855, + "ewc_loss": 0.007607932668179274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.607932639075443e-05, + "grad_norm": 3.7934153079986572, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8598109483718872, + "num_tokens": 358749284.0, + "step": 9401 + }, + { + "epoch": 1.196031039307976, + "ewc_loss": 0.007631865330040455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.631865446455777e-05, + "grad_norm": 3.829082727432251, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8701426982879639, + "num_tokens": 358782830.0, + "step": 9402 + }, + { + "epoch": 1.1961582495865666, + "ewc_loss": 0.007631538901478052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6315387559589e-05, + "grad_norm": 3.7864203453063965, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8719228506088257, + "num_tokens": 358820696.0, + "step": 9403 + }, + { + "epoch": 
1.196285459865157, + "ewc_loss": 0.007573322393000126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.573322363896295e-05, + "grad_norm": 3.8675551414489746, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8804874420166016, + "num_tokens": 358853616.0, + "step": 9404 + }, + { + "epoch": 1.1964126701437476, + "ewc_loss": 0.0076620010659098625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.662001007702202e-05, + "grad_norm": 3.708467960357666, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8807386159896851, + "num_tokens": 358893420.0, + "step": 9405 + }, + { + "epoch": 1.1965398804223382, + "ewc_loss": 0.007517660036683083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.51765983295627e-05, + "grad_norm": 3.775270700454712, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8739749193191528, + "num_tokens": 358930704.0, + "step": 9406 + }, + { + "epoch": 1.1966670907009287, + "ewc_loss": 0.007628461811691523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.628461753483862e-05, + "grad_norm": 3.7228126525878906, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8857690691947937, + "num_tokens": 358967730.0, + "step": 9407 + }, + { + "epoch": 1.1967943009795192, + "ewc_loss": 0.007564393803477287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564394036307931e-05, + "grad_norm": 3.699063777923584, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.886144757270813, + "num_tokens": 359007241.0, + "step": 9408 + }, + { + "epoch": 1.1969215112581097, + "ewc_loss": 0.007564402651041746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564402767457068e-05, + "grad_norm": 3.797109365463257, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8610274195671082, + "num_tokens": 359045573.0, + "step": 9409 + }, + { + "epoch": 1.1970487215367003, + "ewc_loss": 0.007645968347787857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.645968435099348e-05, + "grad_norm": 3.7594470977783203, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.877594530582428, + "num_tokens": 359083420.0, + "step": 9410 + }, + { + "epoch": 1.1971759318152906, + "ewc_loss": 0.007586263585835695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.586263382108882e-05, + "grad_norm": 3.735656499862671, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8858299255371094, + "num_tokens": 359117555.0, + "step": 9411 + }, + { + "epoch": 1.1973031420938811, + "ewc_loss": 0.007584055885672569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.584055856568739e-05, + "grad_norm": 3.790433645248413, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8716529607772827, + "num_tokens": 359154597.0, + "step": 9412 + }, + { + "epoch": 1.1974303523724716, + "ewc_loss": 0.007605890277773142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.605890277773142e-05, + "grad_norm": 3.7748661041259766, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8680296540260315, + "num_tokens": 359190074.0, + "step": 9413 + }, + { + "epoch": 1.1975575626510622, + "ewc_loss": 0.007596040144562721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.59604008635506e-05, + "grad_norm": 3.722609043121338, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8624241352081299, + "num_tokens": 359232711.0, + "step": 9414 + }, + { + "epoch": 1.1976847729296527, + "ewc_loss": 0.007564117666333914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.564117549918592e-05, + "grad_norm": 3.847548723220825, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8737972974777222, + "num_tokens": 359265582.0, + "step": 9415 + }, + { + "epoch": 1.1978119832082432, + "ewc_loss": 0.007667500991374254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.667500904062763e-05, + "grad_norm": 3.7292428016662598, + "learning_rate": 1e-06, + "loss": 0.3487, + 
"mean_token_accuracy": 0.8807168006896973, + "num_tokens": 359303177.0, + "step": 9416 + }, + { + "epoch": 1.1979391934868338, + "ewc_loss": 0.0075432611629366875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.543261017417535e-05, + "grad_norm": 3.766188621520996, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8839563131332397, + "num_tokens": 359338712.0, + "step": 9417 + }, + { + "epoch": 1.1980664037654243, + "ewc_loss": 0.00761300977319479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.61300980229862e-05, + "grad_norm": 3.7947521209716797, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8825967311859131, + "num_tokens": 359372849.0, + "step": 9418 + }, + { + "epoch": 1.1981936140440148, + "ewc_loss": 0.0076148854568600655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.614885544171557e-05, + "grad_norm": 3.8032736778259277, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8785648345947266, + "num_tokens": 359407981.0, + "step": 9419 + }, + { + "epoch": 1.1983208243226053, + "ewc_loss": 0.007593852933496237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.593852933496237e-05, + "grad_norm": 3.7793774604797363, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8539387583732605, + "num_tokens": 359450180.0, + "step": 9420 + }, + { + "epoch": 1.1984480346011959, + "ewc_loss": 0.007593762595206499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.593762711621821e-05, + "grad_norm": 3.7539403438568115, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8723920583724976, + "num_tokens": 359487909.0, + "step": 9421 + }, + { + "epoch": 1.1985752448797862, + "ewc_loss": 0.007581949234008789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.581949466839433e-05, + "grad_norm": 3.8172249794006348, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8861793279647827, + "num_tokens": 359522609.0, + "step": 9422 + }, + { + "epoch": 
1.1987024551583767, + "ewc_loss": 0.007639527786523104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.639527757419273e-05, + "grad_norm": 3.7464981079101562, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8697428703308105, + "num_tokens": 359560735.0, + "step": 9423 + }, + { + "epoch": 1.1988296654369672, + "ewc_loss": 0.007574582472443581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.574582559755072e-05, + "grad_norm": 3.74812650680542, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8681546449661255, + "num_tokens": 359602994.0, + "step": 9424 + }, + { + "epoch": 1.1989568757155578, + "ewc_loss": 0.007618425879627466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.618426025146618e-05, + "grad_norm": 3.762256145477295, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8746007084846497, + "num_tokens": 359638864.0, + "step": 9425 + }, + { + "epoch": 1.1990840859941483, + "ewc_loss": 0.007630911655724049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.630911568412557e-05, + "grad_norm": 3.8093490600585938, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8814253807067871, + "num_tokens": 359674202.0, + "step": 9426 + }, + { + "epoch": 1.1992112962727388, + "ewc_loss": 0.0076445029117167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.644503057235852e-05, + "grad_norm": 3.8146555423736572, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8796194791793823, + "num_tokens": 359710408.0, + "step": 9427 + }, + { + "epoch": 1.1993385065513293, + "ewc_loss": 0.0076360562816262245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.636056398041546e-05, + "grad_norm": 3.7308928966522217, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8800788521766663, + "num_tokens": 359749068.0, + "step": 9428 + }, + { + "epoch": 1.1994657168299199, + "ewc_loss": 0.007597781252115965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.597781223012134e-05, + "grad_norm": 3.753636121749878, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8738922476768494, + "num_tokens": 359788449.0, + "step": 9429 + }, + { + "epoch": 1.1995929271085104, + "ewc_loss": 0.007624432444572449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.624432328157127e-05, + "grad_norm": 3.812385320663452, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8698220252990723, + "num_tokens": 359824167.0, + "step": 9430 + }, + { + "epoch": 1.199720137387101, + "ewc_loss": 0.007639083079993725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.639083196409047e-05, + "grad_norm": 3.765113353729248, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8700340390205383, + "num_tokens": 359861476.0, + "step": 9431 + }, + { + "epoch": 1.1998473476656915, + "ewc_loss": 0.007601452525705099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.601452671224251e-05, + "grad_norm": 3.719191551208496, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8714953660964966, + "num_tokens": 359906463.0, + "step": 9432 + }, + { + "epoch": 1.199974557944282, + "ewc_loss": 0.007603746838867664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.603746780660003e-05, + "grad_norm": 3.8152263164520264, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.877884030342102, + "num_tokens": 359940576.0, + "step": 9433 + }, + { + "epoch": 1.2001017682228725, + "ewc_loss": 0.00766740320250392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.667403406230733e-05, + "grad_norm": 3.798788070678711, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8852416276931763, + "num_tokens": 359973616.0, + "step": 9434 + }, + { + "epoch": 1.200228978501463, + "ewc_loss": 0.007627845276147127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62784547987394e-05, + "grad_norm": 3.765645742416382, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 
0.8863135576248169, + "num_tokens": 360011059.0, + "step": 9435 + }, + { + "epoch": 1.2003561887800533, + "ewc_loss": 0.007613640744239092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.61364062782377e-05, + "grad_norm": 3.75262713432312, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8826315402984619, + "num_tokens": 360048307.0, + "step": 9436 + }, + { + "epoch": 1.2004833990586439, + "ewc_loss": 0.007625875994563103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.625875878147781e-05, + "grad_norm": 3.7917439937591553, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8744242191314697, + "num_tokens": 360086946.0, + "step": 9437 + }, + { + "epoch": 1.2006106093372344, + "ewc_loss": 0.0076472400687634945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.647240272490308e-05, + "grad_norm": 3.748828411102295, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8776279091835022, + "num_tokens": 360121908.0, + "step": 9438 + }, + { + "epoch": 1.200737819615825, + "ewc_loss": 0.007604830898344517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.604830898344517e-05, + "grad_norm": 3.738588333129883, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8647927045822144, + "num_tokens": 360164349.0, + "step": 9439 + }, + { + "epoch": 1.2008650298944155, + "ewc_loss": 0.007630493957549334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.630493928445503e-05, + "grad_norm": 3.7530133724212646, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8882192373275757, + "num_tokens": 360197660.0, + "step": 9440 + }, + { + "epoch": 1.200992240173006, + "ewc_loss": 0.007649277336895466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.649277540622279e-05, + "grad_norm": 3.7853734493255615, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8515834808349609, + "num_tokens": 360240297.0, + "step": 9441 + }, + { + "epoch": 1.2011194504515965, + "ewc_loss": 
0.007658157031983137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.658157119294629e-05, + "grad_norm": 3.7490248680114746, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8642235398292542, + "num_tokens": 360281566.0, + "step": 9442 + }, + { + "epoch": 1.201246660730187, + "ewc_loss": 0.00762224243953824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.622242264915258e-05, + "grad_norm": 3.832380533218384, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8810645341873169, + "num_tokens": 360313927.0, + "step": 9443 + }, + { + "epoch": 1.2013738710087776, + "ewc_loss": 0.007679250091314316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.679250120418146e-05, + "grad_norm": 3.7596828937530518, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8776401281356812, + "num_tokens": 360349369.0, + "step": 9444 + }, + { + "epoch": 1.201501081287368, + "ewc_loss": 0.0075970920734107494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.597092189826071e-05, + "grad_norm": 3.754390001296997, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8723611235618591, + "num_tokens": 360386996.0, + "step": 9445 + }, + { + "epoch": 1.2016282915659584, + "ewc_loss": 0.007621771655976772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62177151045762e-05, + "grad_norm": 3.7173807621002197, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8774280548095703, + "num_tokens": 360424883.0, + "step": 9446 + }, + { + "epoch": 1.201755501844549, + "ewc_loss": 0.007600326556712389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.600326352985576e-05, + "grad_norm": 3.7904539108276367, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.880567193031311, + "num_tokens": 360457992.0, + "step": 9447 + }, + { + "epoch": 1.2018827121231395, + "ewc_loss": 0.007657655980437994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.657655805815011e-05, + "grad_norm": 
3.742130994796753, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8609355092048645, + "num_tokens": 360497934.0, + "step": 9448 + }, + { + "epoch": 1.20200992240173, + "ewc_loss": 0.007598897907882929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.598898082505912e-05, + "grad_norm": 3.791703462600708, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8712505102157593, + "num_tokens": 360533928.0, + "step": 9449 + }, + { + "epoch": 1.2021371326803205, + "ewc_loss": 0.007657344918698072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.657345122424886e-05, + "grad_norm": 3.775111436843872, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8764307498931885, + "num_tokens": 360571076.0, + "step": 9450 + }, + { + "epoch": 1.202264342958911, + "ewc_loss": 0.007631439249962568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.631439075339586e-05, + "grad_norm": 3.7235116958618164, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8672994375228882, + "num_tokens": 360611957.0, + "step": 9451 + }, + { + "epoch": 1.2023915532375016, + "ewc_loss": 0.00760649936273694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.606499275425449e-05, + "grad_norm": 3.8013386726379395, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8672094345092773, + "num_tokens": 360647823.0, + "step": 9452 + }, + { + "epoch": 1.202518763516092, + "ewc_loss": 0.007687285542488098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.68728568800725e-05, + "grad_norm": 3.7830164432525635, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8654451370239258, + "num_tokens": 360688341.0, + "step": 9453 + }, + { + "epoch": 1.2026459737946826, + "ewc_loss": 0.007639850955456495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.639850809937343e-05, + "grad_norm": 3.7686543464660645, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8661669492721558, + "num_tokens": 
360730417.0, + "step": 9454 + }, + { + "epoch": 1.2027731840732732, + "ewc_loss": 0.007635199464857578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.635199290234596e-05, + "grad_norm": 3.7640061378479004, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8543813824653625, + "num_tokens": 360771931.0, + "step": 9455 + }, + { + "epoch": 1.2029003943518637, + "ewc_loss": 0.007640006486326456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.640006515430287e-05, + "grad_norm": 3.753185510635376, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8714723587036133, + "num_tokens": 360811460.0, + "step": 9456 + }, + { + "epoch": 1.2030276046304542, + "ewc_loss": 0.007631534244865179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.631534390384331e-05, + "grad_norm": 3.7433671951293945, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8856144547462463, + "num_tokens": 360849476.0, + "step": 9457 + }, + { + "epoch": 1.2031548149090447, + "ewc_loss": 0.007636871188879013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.636871305294335e-05, + "grad_norm": 3.7506916522979736, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8624838590621948, + "num_tokens": 360891178.0, + "step": 9458 + }, + { + "epoch": 1.2032820251876353, + "ewc_loss": 0.007651001214981079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.651001214981079e-05, + "grad_norm": 3.786083459854126, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8702841997146606, + "num_tokens": 360930503.0, + "step": 9459 + }, + { + "epoch": 1.2034092354662256, + "ewc_loss": 0.007640139665454626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.640139665454626e-05, + "grad_norm": 3.7996327877044678, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8702583312988281, + "num_tokens": 360963730.0, + "step": 9460 + }, + { + "epoch": 1.203536445744816, + "ewc_loss": 0.007646805141121149, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.64680517022498e-05, + "grad_norm": 3.7350168228149414, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8606109023094177, + "num_tokens": 361003796.0, + "step": 9461 + }, + { + "epoch": 1.2036636560234066, + "ewc_loss": 0.007608685176819563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.60868497309275e-05, + "grad_norm": 3.72589111328125, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8740589618682861, + "num_tokens": 361044818.0, + "step": 9462 + }, + { + "epoch": 1.2037908663019972, + "ewc_loss": 0.0076329647563397884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.63296484365128e-05, + "grad_norm": 3.735692024230957, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8836706876754761, + "num_tokens": 361080692.0, + "step": 9463 + }, + { + "epoch": 1.2039180765805877, + "ewc_loss": 0.007632511667907238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.632511551491916e-05, + "grad_norm": 3.8361587524414062, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8646983504295349, + "num_tokens": 361116609.0, + "step": 9464 + }, + { + "epoch": 1.2040452868591782, + "ewc_loss": 0.007695522624999285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.695522799622267e-05, + "grad_norm": 3.7163407802581787, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8680673241615295, + "num_tokens": 361154559.0, + "step": 9465 + }, + { + "epoch": 1.2041724971377687, + "ewc_loss": 0.007608360610902309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.608360465383157e-05, + "grad_norm": 3.7627570629119873, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8836772441864014, + "num_tokens": 361191005.0, + "step": 9466 + }, + { + "epoch": 1.2042997074163593, + "ewc_loss": 0.007679317146539688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.679317059228197e-05, + "grad_norm": 3.730320692062378, + "learning_rate": 
1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8712066411972046, + "num_tokens": 361230191.0, + "step": 9467 + }, + { + "epoch": 1.2044269176949498, + "ewc_loss": 0.007635048124939203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.63504795031622e-05, + "grad_norm": 3.76725172996521, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.866281270980835, + "num_tokens": 361269132.0, + "step": 9468 + }, + { + "epoch": 1.2045541279735403, + "ewc_loss": 0.007667504716664553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.66750454204157e-05, + "grad_norm": 3.7574942111968994, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8659744262695312, + "num_tokens": 361306674.0, + "step": 9469 + }, + { + "epoch": 1.2046813382521309, + "ewc_loss": 0.00765581801533699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.655817898921669e-05, + "grad_norm": 3.7194881439208984, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8762876987457275, + "num_tokens": 361348309.0, + "step": 9470 + }, + { + "epoch": 1.2048085485307212, + "ewc_loss": 0.0076367612928152084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.636761438334361e-05, + "grad_norm": 3.7779691219329834, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8679903149604797, + "num_tokens": 361386824.0, + "step": 9471 + }, + { + "epoch": 1.2049357588093117, + "ewc_loss": 0.007695651613175869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.695651584072039e-05, + "grad_norm": 3.7992870807647705, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8561617136001587, + "num_tokens": 361426590.0, + "step": 9472 + }, + { + "epoch": 1.2050629690879022, + "ewc_loss": 0.007663060445338488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.663060387130827e-05, + "grad_norm": 3.800506114959717, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8622525930404663, + "num_tokens": 361463259.0, + "step": 9473 + }, + { 
+ "epoch": 1.2051901793664928, + "ewc_loss": 0.007673684507608414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.673684740439057e-05, + "grad_norm": 3.775738477706909, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8816438913345337, + "num_tokens": 361498750.0, + "step": 9474 + }, + { + "epoch": 1.2053173896450833, + "ewc_loss": 0.007639135699719191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.639135583303869e-05, + "grad_norm": 3.775960683822632, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8629055023193359, + "num_tokens": 361537101.0, + "step": 9475 + }, + { + "epoch": 1.2054445999236738, + "ewc_loss": 0.007657740730792284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.657740934519097e-05, + "grad_norm": 3.8292806148529053, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8768025636672974, + "num_tokens": 361572130.0, + "step": 9476 + }, + { + "epoch": 1.2055718102022643, + "ewc_loss": 0.0076951757073402405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.695175736444071e-05, + "grad_norm": 3.7898826599121094, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8874949216842651, + "num_tokens": 361608484.0, + "step": 9477 + }, + { + "epoch": 1.2056990204808549, + "ewc_loss": 0.007658146787434816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.658146932953969e-05, + "grad_norm": 3.7186174392700195, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8894457221031189, + "num_tokens": 361650361.0, + "step": 9478 + }, + { + "epoch": 1.2058262307594454, + "ewc_loss": 0.007617687340825796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.617687515448779e-05, + "grad_norm": 3.784701347351074, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8816592693328857, + "num_tokens": 361680670.0, + "step": 9479 + }, + { + "epoch": 1.205953441038036, + "ewc_loss": 0.007694091647863388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.694091618759558e-05, + "grad_norm": 3.702589750289917, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8791106939315796, + "num_tokens": 361723504.0, + "step": 9480 + }, + { + "epoch": 1.2060806513166265, + "ewc_loss": 0.007610548287630081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.610548345837742e-05, + "grad_norm": 3.738255500793457, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8791664838790894, + "num_tokens": 361765850.0, + "step": 9481 + }, + { + "epoch": 1.206207861595217, + "ewc_loss": 0.007668645586818457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.668645412195474e-05, + "grad_norm": 3.7972872257232666, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.865248441696167, + "num_tokens": 361804375.0, + "step": 9482 + }, + { + "epoch": 1.2063350718738075, + "ewc_loss": 0.0076910690404474735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.691069185966626e-05, + "grad_norm": 3.834047317504883, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8703038096427917, + "num_tokens": 361834714.0, + "step": 9483 + }, + { + "epoch": 1.206462282152398, + "ewc_loss": 0.007689760997891426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.689760968787596e-05, + "grad_norm": 3.736956834793091, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8741052150726318, + "num_tokens": 361877568.0, + "step": 9484 + }, + { + "epoch": 1.2065894924309883, + "ewc_loss": 0.007641316391527653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.64131618780084e-05, + "grad_norm": 3.7746520042419434, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.869236409664154, + "num_tokens": 361914785.0, + "step": 9485 + }, + { + "epoch": 1.2067167027095789, + "ewc_loss": 0.007695693988353014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.695693784626201e-05, + "grad_norm": 3.7508299350738525, + "learning_rate": 1e-06, + "loss": 0.3483, + 
"mean_token_accuracy": 0.8810145854949951, + "num_tokens": 361956781.0, + "step": 9486 + }, + { + "epoch": 1.2068439129881694, + "ewc_loss": 0.0076665012165904045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.666501187486574e-05, + "grad_norm": 3.740039348602295, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8822746276855469, + "num_tokens": 361997408.0, + "step": 9487 + }, + { + "epoch": 1.20697112326676, + "ewc_loss": 0.007672871463000774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.672871288377792e-05, + "grad_norm": 3.8027844429016113, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8766579627990723, + "num_tokens": 362030226.0, + "step": 9488 + }, + { + "epoch": 1.2070983335453505, + "ewc_loss": 0.007714997045695782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.71499690017663e-05, + "grad_norm": 3.720355749130249, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8834547996520996, + "num_tokens": 362068370.0, + "step": 9489 + }, + { + "epoch": 1.207225543823941, + "ewc_loss": 0.007633804343640804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.633804489159957e-05, + "grad_norm": 3.728745460510254, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8807095289230347, + "num_tokens": 362108569.0, + "step": 9490 + }, + { + "epoch": 1.2073527541025315, + "ewc_loss": 0.007648960221558809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6489603088703e-05, + "grad_norm": 3.845452070236206, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8666479587554932, + "num_tokens": 362140476.0, + "step": 9491 + }, + { + "epoch": 1.207479964381122, + "ewc_loss": 0.007716809865087271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716810068814084e-05, + "grad_norm": 3.826213836669922, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8833121061325073, + "num_tokens": 362171132.0, + "step": 9492 + }, + { + "epoch": 1.2076071746597126, 
+ "ewc_loss": 0.007651070132851601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.651070336578414e-05, + "grad_norm": 3.7675044536590576, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8720026016235352, + "num_tokens": 362204791.0, + "step": 9493 + }, + { + "epoch": 1.207734384938303, + "ewc_loss": 0.007637920323759317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6379204983823e-05, + "grad_norm": 3.760500431060791, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8777788877487183, + "num_tokens": 362242360.0, + "step": 9494 + }, + { + "epoch": 1.2078615952168934, + "ewc_loss": 0.007663261145353317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.663261203560978e-05, + "grad_norm": 3.798780918121338, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8772426843643188, + "num_tokens": 362276558.0, + "step": 9495 + }, + { + "epoch": 1.207988805495484, + "ewc_loss": 0.007686546538025141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.68654645071365e-05, + "grad_norm": 3.894524097442627, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.861464262008667, + "num_tokens": 362307055.0, + "step": 9496 + }, + { + "epoch": 1.2081160157740745, + "ewc_loss": 0.0077340309508144855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.734030805295333e-05, + "grad_norm": 3.744880437850952, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8836691379547119, + "num_tokens": 362342433.0, + "step": 9497 + }, + { + "epoch": 1.208243226052665, + "ewc_loss": 0.0076148551888763905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.614854985149577e-05, + "grad_norm": 3.7763991355895996, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8946465253829956, + "num_tokens": 362377008.0, + "step": 9498 + }, + { + "epoch": 1.2083704363312555, + "ewc_loss": 0.007700961548835039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70096157793887e-05, + "grad_norm": 
3.793816566467285, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8873742818832397, + "num_tokens": 362412089.0, + "step": 9499 + }, + { + "epoch": 1.208497646609846, + "ewc_loss": 0.0076985531486570835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.698553235968575e-05, + "grad_norm": 3.748619794845581, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8663668632507324, + "num_tokens": 362455817.0, + "step": 9500 + }, + { + "epoch": 1.2086248568884366, + "ewc_loss": 0.0076500289142131805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.650029147043824e-05, + "grad_norm": 3.7077584266662598, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8733871579170227, + "num_tokens": 362498859.0, + "step": 9501 + }, + { + "epoch": 1.208752067167027, + "ewc_loss": 0.007663034368306398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.663034193683416e-05, + "grad_norm": 3.764164686203003, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8724642992019653, + "num_tokens": 362542346.0, + "step": 9502 + }, + { + "epoch": 1.2088792774456176, + "ewc_loss": 0.007708842866122723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.708842895226553e-05, + "grad_norm": 3.7448089122772217, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.8932437896728516, + "num_tokens": 362575360.0, + "step": 9503 + }, + { + "epoch": 1.2090064877242082, + "ewc_loss": 0.007652158848941326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.652158819837496e-05, + "grad_norm": 3.751432418823242, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8754928708076477, + "num_tokens": 362614737.0, + "step": 9504 + }, + { + "epoch": 1.2091336980027987, + "ewc_loss": 0.007652400527149439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.652400381630287e-05, + "grad_norm": 3.8045926094055176, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8692800402641296, + 
"num_tokens": 362653756.0, + "step": 9505 + }, + { + "epoch": 1.2092609082813892, + "ewc_loss": 0.007671517785638571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.671517960261554e-05, + "grad_norm": 3.779409885406494, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8761011958122253, + "num_tokens": 362690745.0, + "step": 9506 + }, + { + "epoch": 1.2093881185599797, + "ewc_loss": 0.007648559287190437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.648559403605759e-05, + "grad_norm": 3.755509853363037, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8814252614974976, + "num_tokens": 362726328.0, + "step": 9507 + }, + { + "epoch": 1.2095153288385703, + "ewc_loss": 0.0076391445472836494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.639144314453006e-05, + "grad_norm": 3.7446985244750977, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8779125809669495, + "num_tokens": 362764285.0, + "step": 9508 + }, + { + "epoch": 1.2096425391171606, + "ewc_loss": 0.0076437960378825665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.643795834155753e-05, + "grad_norm": 3.8658838272094727, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8560128211975098, + "num_tokens": 362794697.0, + "step": 9509 + }, + { + "epoch": 1.209769749395751, + "ewc_loss": 0.007725023664534092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.725023897364736e-05, + "grad_norm": 3.798795461654663, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8722901940345764, + "num_tokens": 362831618.0, + "step": 9510 + }, + { + "epoch": 1.2098969596743416, + "ewc_loss": 0.007631579414010048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.631579501321539e-05, + "grad_norm": 3.8427395820617676, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8815005421638489, + "num_tokens": 362866404.0, + "step": 9511 + }, + { + "epoch": 1.2100241699529322, + "ewc_loss": 0.0076773217879235744, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.677321991650388e-05, + "grad_norm": 3.752397060394287, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8898420333862305, + "num_tokens": 362902554.0, + "step": 9512 + }, + { + "epoch": 1.2101513802315227, + "ewc_loss": 0.007622171193361282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.622170960530639e-05, + "grad_norm": 3.739753484725952, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8670738935470581, + "num_tokens": 362944169.0, + "step": 9513 + }, + { + "epoch": 1.2102785905101132, + "ewc_loss": 0.007642659358680248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.642659329576418e-05, + "grad_norm": 3.749445676803589, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8537986278533936, + "num_tokens": 362983789.0, + "step": 9514 + }, + { + "epoch": 1.2104058007887037, + "ewc_loss": 0.007663680240511894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.663680298719555e-05, + "grad_norm": 3.774768114089966, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8794140815734863, + "num_tokens": 363021551.0, + "step": 9515 + }, + { + "epoch": 1.2105330110672943, + "ewc_loss": 0.007669265381991863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.669265323784202e-05, + "grad_norm": 3.728282928466797, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8587571978569031, + "num_tokens": 363064951.0, + "step": 9516 + }, + { + "epoch": 1.2106602213458848, + "ewc_loss": 0.0076496414840221405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.649641338502988e-05, + "grad_norm": 3.7269062995910645, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8739415407180786, + "num_tokens": 363110876.0, + "step": 9517 + }, + { + "epoch": 1.2107874316244753, + "ewc_loss": 0.007673730608075857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.673730578972027e-05, + "grad_norm": 3.737234354019165, + 
"learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8717010617256165, + "num_tokens": 363155310.0, + "step": 9518 + }, + { + "epoch": 1.2109146419030659, + "ewc_loss": 0.007676014676690102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.676014502067119e-05, + "grad_norm": 3.746244192123413, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8839552402496338, + "num_tokens": 363193166.0, + "step": 9519 + }, + { + "epoch": 1.2110418521816562, + "ewc_loss": 0.0076735299080610275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.673529762541875e-05, + "grad_norm": 3.854804277420044, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8576441407203674, + "num_tokens": 363225163.0, + "step": 9520 + }, + { + "epoch": 1.2111690624602467, + "ewc_loss": 0.0077299694530665874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729969365755096e-05, + "grad_norm": 3.7626781463623047, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8736121654510498, + "num_tokens": 363264720.0, + "step": 9521 + }, + { + "epoch": 1.2112962727388372, + "ewc_loss": 0.0076384698040783405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.638469833182171e-05, + "grad_norm": 3.75343656539917, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8600170612335205, + "num_tokens": 363305029.0, + "step": 9522 + }, + { + "epoch": 1.2114234830174277, + "ewc_loss": 0.00766098964959383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.66098964959383e-05, + "grad_norm": 3.7297701835632324, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8799575567245483, + "num_tokens": 363344128.0, + "step": 9523 + }, + { + "epoch": 1.2115506932960183, + "ewc_loss": 0.00764621514827013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.646215090062469e-05, + "grad_norm": 3.745337724685669, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8729700446128845, + "num_tokens": 363385903.0, + 
"step": 9524 + }, + { + "epoch": 1.2116779035746088, + "ewc_loss": 0.007652184460312128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.652184285689145e-05, + "grad_norm": 3.767303228378296, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.875925600528717, + "num_tokens": 363425216.0, + "step": 9525 + }, + { + "epoch": 1.2118051138531993, + "ewc_loss": 0.007657771464437246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.657771493541077e-05, + "grad_norm": 3.7417759895324707, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8855533599853516, + "num_tokens": 363462691.0, + "step": 9526 + }, + { + "epoch": 1.2119323241317899, + "ewc_loss": 0.007634320296347141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.634320354554802e-05, + "grad_norm": 3.853728771209717, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8586393594741821, + "num_tokens": 363495849.0, + "step": 9527 + }, + { + "epoch": 1.2120595344103804, + "ewc_loss": 0.007711923681199551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.711923535680398e-05, + "grad_norm": 3.753385066986084, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8731837272644043, + "num_tokens": 363535268.0, + "step": 9528 + }, + { + "epoch": 1.212186744688971, + "ewc_loss": 0.007607522886246443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.607523002661765e-05, + "grad_norm": 3.783062219619751, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.858155369758606, + "num_tokens": 363573523.0, + "step": 9529 + }, + { + "epoch": 1.2123139549675614, + "ewc_loss": 0.007667019031941891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.667019235668704e-05, + "grad_norm": 3.7451019287109375, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8736926317214966, + "num_tokens": 363607545.0, + "step": 9530 + }, + { + "epoch": 1.212441165246152, + "ewc_loss": 0.0076447054743766785, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.644705328857526e-05, + "grad_norm": 3.7454099655151367, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8753007650375366, + "num_tokens": 363648456.0, + "step": 9531 + }, + { + "epoch": 1.2125683755247425, + "ewc_loss": 0.007629077881574631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.629078027093783e-05, + "grad_norm": 3.6977407932281494, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8828301429748535, + "num_tokens": 363688726.0, + "step": 9532 + }, + { + "epoch": 1.212695585803333, + "ewc_loss": 0.007620977703481913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.620977703481913e-05, + "grad_norm": 3.778583288192749, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8673973083496094, + "num_tokens": 363728557.0, + "step": 9533 + }, + { + "epoch": 1.2128227960819233, + "ewc_loss": 0.007675512693822384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.67551246099174e-05, + "grad_norm": 3.8022067546844482, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8736962080001831, + "num_tokens": 363764191.0, + "step": 9534 + }, + { + "epoch": 1.2129500063605139, + "ewc_loss": 0.007657888811081648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.657888636458665e-05, + "grad_norm": 3.829885244369507, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8639286756515503, + "num_tokens": 363798126.0, + "step": 9535 + }, + { + "epoch": 1.2130772166391044, + "ewc_loss": 0.007669234182685614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.669234037166461e-05, + "grad_norm": 3.779134750366211, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8582569360733032, + "num_tokens": 363836037.0, + "step": 9536 + }, + { + "epoch": 1.213204426917695, + "ewc_loss": 0.007625322323292494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.625322177773342e-05, + "grad_norm": 3.73152232170105, + "learning_rate": 1e-06, + "loss": 0.3122, + 
"mean_token_accuracy": 0.8937724828720093, + "num_tokens": 363872359.0, + "step": 9537 + }, + { + "epoch": 1.2133316371962855, + "ewc_loss": 0.007633409462869167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.633409404661506e-05, + "grad_norm": 3.751288414001465, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8677820563316345, + "num_tokens": 363914206.0, + "step": 9538 + }, + { + "epoch": 1.213458847474876, + "ewc_loss": 0.007649904116988182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.64990400057286e-05, + "grad_norm": 3.7391185760498047, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.874272346496582, + "num_tokens": 363956147.0, + "step": 9539 + }, + { + "epoch": 1.2135860577534665, + "ewc_loss": 0.00762173580005765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62173585826531e-05, + "grad_norm": 3.8249671459198, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8906759023666382, + "num_tokens": 363986328.0, + "step": 9540 + }, + { + "epoch": 1.213713268032057, + "ewc_loss": 0.007686267141252756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.686267053941265e-05, + "grad_norm": 3.763115644454956, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8707355260848999, + "num_tokens": 364024905.0, + "step": 9541 + }, + { + "epoch": 1.2138404783106476, + "ewc_loss": 0.007630037143826485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.630036998307332e-05, + "grad_norm": 3.742037296295166, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8730500936508179, + "num_tokens": 364068628.0, + "step": 9542 + }, + { + "epoch": 1.213967688589238, + "ewc_loss": 0.0076494114473462105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.64941141824238e-05, + "grad_norm": 3.781165838241577, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8717645406723022, + "num_tokens": 364104261.0, + "step": 9543 + }, + { + "epoch": 1.2140948988678284, + 
"ewc_loss": 0.007659361232072115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.659361290279776e-05, + "grad_norm": 3.7318034172058105, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8717098236083984, + "num_tokens": 364147524.0, + "step": 9544 + }, + { + "epoch": 1.214222109146419, + "ewc_loss": 0.0076208277605473995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62082781875506e-05, + "grad_norm": 3.719625473022461, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8761234879493713, + "num_tokens": 364187715.0, + "step": 9545 + }, + { + "epoch": 1.2143493194250095, + "ewc_loss": 0.007641117554157972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.641117554157972e-05, + "grad_norm": 3.7739765644073486, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8850177526473999, + "num_tokens": 364226081.0, + "step": 9546 + }, + { + "epoch": 1.2144765297036, + "ewc_loss": 0.00766746886074543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.66746888984926e-05, + "grad_norm": 3.80047869682312, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8742151856422424, + "num_tokens": 364268390.0, + "step": 9547 + }, + { + "epoch": 1.2146037399821905, + "ewc_loss": 0.007642979267984629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.642979471711442e-05, + "grad_norm": 3.724731922149658, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8762328028678894, + "num_tokens": 364310176.0, + "step": 9548 + }, + { + "epoch": 1.214730950260781, + "ewc_loss": 0.00759259844198823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.592598558403552e-05, + "grad_norm": 3.767092704772949, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8849459886550903, + "num_tokens": 364345652.0, + "step": 9549 + }, + { + "epoch": 1.2148581605393716, + "ewc_loss": 0.007627978455275297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62797862989828e-05, + "grad_norm": 
3.794001817703247, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8700007200241089, + "num_tokens": 364383810.0, + "step": 9550 + }, + { + "epoch": 1.214985370817962, + "ewc_loss": 0.007614110596477985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.614110654685646e-05, + "grad_norm": 3.8049938678741455, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8777896761894226, + "num_tokens": 364421169.0, + "step": 9551 + }, + { + "epoch": 1.2151125810965526, + "ewc_loss": 0.00759729091078043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.597290823468938e-05, + "grad_norm": 3.8107056617736816, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8673408031463623, + "num_tokens": 364452552.0, + "step": 9552 + }, + { + "epoch": 1.2152397913751432, + "ewc_loss": 0.0076079959981143475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.607995939906687e-05, + "grad_norm": 3.8303585052490234, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8724635243415833, + "num_tokens": 364486779.0, + "step": 9553 + }, + { + "epoch": 1.2153670016537337, + "ewc_loss": 0.007619788870215416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.619788812007755e-05, + "grad_norm": 3.735158920288086, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8833062648773193, + "num_tokens": 364526237.0, + "step": 9554 + }, + { + "epoch": 1.2154942119323242, + "ewc_loss": 0.007566562853753567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.566562999272719e-05, + "grad_norm": 3.716378688812256, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8796969652175903, + "num_tokens": 364568446.0, + "step": 9555 + }, + { + "epoch": 1.2156214222109147, + "ewc_loss": 0.007590323220938444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.590323366457596e-05, + "grad_norm": 3.7940452098846436, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.882252037525177, + 
"num_tokens": 364604448.0, + "step": 9556 + }, + { + "epoch": 1.2157486324895053, + "ewc_loss": 0.007632254157215357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.632253982592374e-05, + "grad_norm": 3.7658157348632812, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8725866079330444, + "num_tokens": 364642707.0, + "step": 9557 + }, + { + "epoch": 1.2158758427680956, + "ewc_loss": 0.007581657730042934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.581657700939104e-05, + "grad_norm": 3.7692787647247314, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8686540722846985, + "num_tokens": 364678428.0, + "step": 9558 + }, + { + "epoch": 1.216003053046686, + "ewc_loss": 0.007611053995788097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.611054024891928e-05, + "grad_norm": 3.7611374855041504, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8867754936218262, + "num_tokens": 364718915.0, + "step": 9559 + }, + { + "epoch": 1.2161302633252766, + "ewc_loss": 0.007609459105879068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.609459134982899e-05, + "grad_norm": 3.7587997913360596, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8900876641273499, + "num_tokens": 364756030.0, + "step": 9560 + }, + { + "epoch": 1.2162574736038672, + "ewc_loss": 0.0076020872220396996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.602087134728208e-05, + "grad_norm": 3.781543731689453, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8718587756156921, + "num_tokens": 364796158.0, + "step": 9561 + }, + { + "epoch": 1.2163846838824577, + "ewc_loss": 0.007616520393639803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.616520451847464e-05, + "grad_norm": 3.817523241043091, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8833761811256409, + "num_tokens": 364832722.0, + "step": 9562 + }, + { + "epoch": 1.2165118941610482, + "ewc_loss": 0.00762449624016881, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.624496356584132e-05, + "grad_norm": 3.768495559692383, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8724024295806885, + "num_tokens": 364874120.0, + "step": 9563 + }, + { + "epoch": 1.2166391044396387, + "ewc_loss": 0.0075734276324510574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.573427865281701e-05, + "grad_norm": 3.744152069091797, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8753425478935242, + "num_tokens": 364913166.0, + "step": 9564 + }, + { + "epoch": 1.2167663147182293, + "ewc_loss": 0.007583293132483959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.583293336210772e-05, + "grad_norm": 3.8215994834899902, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.855076789855957, + "num_tokens": 364949586.0, + "step": 9565 + }, + { + "epoch": 1.2168935249968198, + "ewc_loss": 0.007617918308824301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.617918163305148e-05, + "grad_norm": 3.782581090927124, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8571083545684814, + "num_tokens": 364989070.0, + "step": 9566 + }, + { + "epoch": 1.2170207352754103, + "ewc_loss": 0.007581691723316908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58169189793989e-05, + "grad_norm": 3.7699391841888428, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8775554895401001, + "num_tokens": 365027050.0, + "step": 9567 + }, + { + "epoch": 1.2171479455540009, + "ewc_loss": 0.007591082248836756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.591082248836756e-05, + "grad_norm": 3.8061532974243164, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.874623715877533, + "num_tokens": 365060050.0, + "step": 9568 + }, + { + "epoch": 1.2172751558325912, + "ewc_loss": 0.007626160979270935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.626161095686257e-05, + "grad_norm": 3.8184757232666016, + "learning_rate": 
1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8429053425788879, + "num_tokens": 365098030.0, + "step": 9569 + }, + { + "epoch": 1.2174023661111817, + "ewc_loss": 0.007611779496073723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.611779437866062e-05, + "grad_norm": 3.688002824783325, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8754891157150269, + "num_tokens": 365141143.0, + "step": 9570 + }, + { + "epoch": 1.2175295763897722, + "ewc_loss": 0.007549581583589315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.549581641796976e-05, + "grad_norm": 3.875051736831665, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.852540135383606, + "num_tokens": 365174204.0, + "step": 9571 + }, + { + "epoch": 1.2176567866683627, + "ewc_loss": 0.0076978327706456184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.697832916164771e-05, + "grad_norm": 3.8273987770080566, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8735756278038025, + "num_tokens": 365207170.0, + "step": 9572 + }, + { + "epoch": 1.2177839969469533, + "ewc_loss": 0.007613389752805233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.613389607286081e-05, + "grad_norm": 3.6983957290649414, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8767290115356445, + "num_tokens": 365248926.0, + "step": 9573 + }, + { + "epoch": 1.2179112072255438, + "ewc_loss": 0.007591327652335167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.591327448608354e-05, + "grad_norm": 3.7699649333953857, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8773534297943115, + "num_tokens": 365283588.0, + "step": 9574 + }, + { + "epoch": 1.2180384175041343, + "ewc_loss": 0.007669895421713591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.66989542171359e-05, + "grad_norm": 3.763478994369507, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8707786798477173, + "num_tokens": 365324869.0, + "step": 9575 + }, + 
{ + "epoch": 1.2181656277827249, + "ewc_loss": 0.0076403748244047165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.640374678885564e-05, + "grad_norm": 3.7231178283691406, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8704297542572021, + "num_tokens": 365366832.0, + "step": 9576 + }, + { + "epoch": 1.2182928380613154, + "ewc_loss": 0.007620955817401409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62095587560907e-05, + "grad_norm": 3.812978744506836, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8723375201225281, + "num_tokens": 365401148.0, + "step": 9577 + }, + { + "epoch": 1.218420048339906, + "ewc_loss": 0.007703595329076052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.703595474595204e-05, + "grad_norm": 3.7390053272247314, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8765910863876343, + "num_tokens": 365442303.0, + "step": 9578 + }, + { + "epoch": 1.2185472586184964, + "ewc_loss": 0.007612702902406454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.612702756887302e-05, + "grad_norm": 3.7638683319091797, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8866194486618042, + "num_tokens": 365477877.0, + "step": 9579 + }, + { + "epoch": 1.218674468897087, + "ewc_loss": 0.007651172112673521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.651172199985012e-05, + "grad_norm": 3.7515006065368652, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8634737730026245, + "num_tokens": 365520648.0, + "step": 9580 + }, + { + "epoch": 1.2188016791756775, + "ewc_loss": 0.007624542806297541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.624542922712862e-05, + "grad_norm": 3.8369126319885254, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8565393686294556, + "num_tokens": 365555129.0, + "step": 9581 + }, + { + "epoch": 1.218928889454268, + "ewc_loss": 0.00770466448739171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.704664312768728e-05, + "grad_norm": 3.7878854274749756, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8801826238632202, + "num_tokens": 365594368.0, + "step": 9582 + }, + { + "epoch": 1.2190560997328583, + "ewc_loss": 0.007632237859070301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.632237975485623e-05, + "grad_norm": 3.7743911743164062, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8742323517799377, + "num_tokens": 365634763.0, + "step": 9583 + }, + { + "epoch": 1.2191833100114489, + "ewc_loss": 0.0076560936868190765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.656093657715246e-05, + "grad_norm": 3.730875015258789, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8892240524291992, + "num_tokens": 365675951.0, + "step": 9584 + }, + { + "epoch": 1.2193105202900394, + "ewc_loss": 0.007603427395224571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.603427366120741e-05, + "grad_norm": 3.7599992752075195, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8819702863693237, + "num_tokens": 365710348.0, + "step": 9585 + }, + { + "epoch": 1.21943773056863, + "ewc_loss": 0.00764557346701622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.645573350600898e-05, + "grad_norm": 3.825737714767456, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8760758638381958, + "num_tokens": 365745166.0, + "step": 9586 + }, + { + "epoch": 1.2195649408472204, + "ewc_loss": 0.007660306990146637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.66030716476962e-05, + "grad_norm": 3.772538661956787, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8872244358062744, + "num_tokens": 365780122.0, + "step": 9587 + }, + { + "epoch": 1.219692151125811, + "ewc_loss": 0.007602988742291927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.602988625876606e-05, + "grad_norm": 3.7911908626556396, + "learning_rate": 1e-06, + "loss": 0.3632, + 
"mean_token_accuracy": 0.8737590312957764, + "num_tokens": 365816326.0, + "step": 9588 + }, + { + "epoch": 1.2198193614044015, + "ewc_loss": 0.007621561177074909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62156123528257e-05, + "grad_norm": 3.744098424911499, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8473000526428223, + "num_tokens": 365858938.0, + "step": 9589 + }, + { + "epoch": 1.219946571682992, + "ewc_loss": 0.007584044244140387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.584044215036556e-05, + "grad_norm": 3.8311095237731934, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.863298773765564, + "num_tokens": 365893892.0, + "step": 9590 + }, + { + "epoch": 1.2200737819615826, + "ewc_loss": 0.007683327421545982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.683327567065135e-05, + "grad_norm": 3.774097204208374, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8651137351989746, + "num_tokens": 365938787.0, + "step": 9591 + }, + { + "epoch": 1.220200992240173, + "ewc_loss": 0.007586997468024492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.586997526232153e-05, + "grad_norm": 3.8275091648101807, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8731628060340881, + "num_tokens": 365972664.0, + "step": 9592 + }, + { + "epoch": 1.2203282025187634, + "ewc_loss": 0.0076465904712677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.646590529475361e-05, + "grad_norm": 3.763108253479004, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8796572089195251, + "num_tokens": 366013367.0, + "step": 9593 + }, + { + "epoch": 1.220455412797354, + "ewc_loss": 0.007580596022307873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.580596138723195e-05, + "grad_norm": 3.7761073112487793, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8787786960601807, + "num_tokens": 366051637.0, + "step": 9594 + }, + { + "epoch": 
1.2205826230759445, + "ewc_loss": 0.0076125431805849075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.612543413415551e-05, + "grad_norm": 3.8161308765411377, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8698022961616516, + "num_tokens": 366092713.0, + "step": 9595 + }, + { + "epoch": 1.220709833354535, + "ewc_loss": 0.007621928583830595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.621928671142086e-05, + "grad_norm": 3.793450117111206, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.875077486038208, + "num_tokens": 366131594.0, + "step": 9596 + }, + { + "epoch": 1.2208370436331255, + "ewc_loss": 0.007586379069834948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.586379069834948e-05, + "grad_norm": 3.783320665359497, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8690016269683838, + "num_tokens": 366170871.0, + "step": 9597 + }, + { + "epoch": 1.220964253911716, + "ewc_loss": 0.007586304098367691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.586304127471521e-05, + "grad_norm": 3.7406558990478516, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.86415034532547, + "num_tokens": 366212974.0, + "step": 9598 + }, + { + "epoch": 1.2210914641903066, + "ewc_loss": 0.00754581717774272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.545817061327398e-05, + "grad_norm": 3.747631549835205, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8667362928390503, + "num_tokens": 366255904.0, + "step": 9599 + }, + { + "epoch": 1.221218674468897, + "ewc_loss": 0.007565444801002741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.565444684587419e-05, + "grad_norm": 3.7711386680603027, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8787571787834167, + "num_tokens": 366288887.0, + "step": 9600 + }, + { + "epoch": 1.2213458847474876, + "ewc_loss": 0.007579391356557608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.579391240142286e-05, + "grad_norm": 3.72318172454834, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8857616186141968, + "num_tokens": 366328796.0, + "step": 9601 + }, + { + "epoch": 1.2214730950260781, + "ewc_loss": 0.007534924894571304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.534924952778965e-05, + "grad_norm": 3.7254791259765625, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8713305592536926, + "num_tokens": 366373093.0, + "step": 9602 + }, + { + "epoch": 1.2216003053046687, + "ewc_loss": 0.007555539254099131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.55553919589147e-05, + "grad_norm": 3.751523971557617, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8826767206192017, + "num_tokens": 366414353.0, + "step": 9603 + }, + { + "epoch": 1.2217275155832592, + "ewc_loss": 0.007557497825473547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.557497883681208e-05, + "grad_norm": 3.7633087635040283, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8856775760650635, + "num_tokens": 366450683.0, + "step": 9604 + }, + { + "epoch": 1.2218547258618497, + "ewc_loss": 0.007545063272118568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.545063272118568e-05, + "grad_norm": 3.7898664474487305, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8784558773040771, + "num_tokens": 366487158.0, + "step": 9605 + }, + { + "epoch": 1.2219819361404403, + "ewc_loss": 0.007583983708173037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.583983824588358e-05, + "grad_norm": 3.766770362854004, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.876276969909668, + "num_tokens": 366529984.0, + "step": 9606 + }, + { + "epoch": 1.2221091464190306, + "ewc_loss": 0.007536876481026411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.536876364611089e-05, + "grad_norm": 3.8226091861724854, + "learning_rate": 1e-06, + "loss": 0.3681, + 
"mean_token_accuracy": 0.8757971525192261, + "num_tokens": 366564100.0, + "step": 9607 + }, + { + "epoch": 1.222236356697621, + "ewc_loss": 0.007579637225717306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.579637167509645e-05, + "grad_norm": 3.7478160858154297, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8800821304321289, + "num_tokens": 366603752.0, + "step": 9608 + }, + { + "epoch": 1.2223635669762116, + "ewc_loss": 0.007532420568168163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.532420568168163e-05, + "grad_norm": 3.736438751220703, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8755962252616882, + "num_tokens": 366642170.0, + "step": 9609 + }, + { + "epoch": 1.2224907772548022, + "ewc_loss": 0.007550197187811136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.550197187811136e-05, + "grad_norm": 3.7861366271972656, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8604758381843567, + "num_tokens": 366684038.0, + "step": 9610 + }, + { + "epoch": 1.2226179875333927, + "ewc_loss": 0.0076017677783966064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.601767720188946e-05, + "grad_norm": 3.805103063583374, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8727691173553467, + "num_tokens": 366717832.0, + "step": 9611 + }, + { + "epoch": 1.2227451978119832, + "ewc_loss": 0.007578195072710514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.578195072710514e-05, + "grad_norm": 3.745863676071167, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8682808876037598, + "num_tokens": 366757999.0, + "step": 9612 + }, + { + "epoch": 1.2228724080905737, + "ewc_loss": 0.007549215108156204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.549214933533221e-05, + "grad_norm": 3.766836404800415, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8830546140670776, + "num_tokens": 366798533.0, + "step": 9613 + }, + { + "epoch": 
1.2229996183691643, + "ewc_loss": 0.0075866044498980045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.586604624520987e-05, + "grad_norm": 3.7348384857177734, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8724513053894043, + "num_tokens": 366843066.0, + "step": 9614 + }, + { + "epoch": 1.2231268286477548, + "ewc_loss": 0.0075654033571481705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.565403211629018e-05, + "grad_norm": 3.7591562271118164, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8889399766921997, + "num_tokens": 366883190.0, + "step": 9615 + }, + { + "epoch": 1.2232540389263453, + "ewc_loss": 0.007597698830068111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.597699004691094e-05, + "grad_norm": 3.7931087017059326, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8790091276168823, + "num_tokens": 366920642.0, + "step": 9616 + }, + { + "epoch": 1.2233812492049359, + "ewc_loss": 0.007601113524287939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.60111361159943e-05, + "grad_norm": 3.8169708251953125, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8757496476173401, + "num_tokens": 366954468.0, + "step": 9617 + }, + { + "epoch": 1.2235084594835262, + "ewc_loss": 0.0076064178720116615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.60641778470017e-05, + "grad_norm": 3.7992072105407715, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8585827946662903, + "num_tokens": 366991170.0, + "step": 9618 + }, + { + "epoch": 1.2236356697621167, + "ewc_loss": 0.007604831829667091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.604831625940278e-05, + "grad_norm": 3.759315252304077, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8783347606658936, + "num_tokens": 367031833.0, + "step": 9619 + }, + { + "epoch": 1.2237628800407072, + "ewc_loss": 0.007579942233860493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.57994203013368e-05, + "grad_norm": 3.741671562194824, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8697843551635742, + "num_tokens": 367074709.0, + "step": 9620 + }, + { + "epoch": 1.2238900903192977, + "ewc_loss": 0.007589483633637428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58948372094892e-05, + "grad_norm": 3.8630404472351074, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8698586225509644, + "num_tokens": 367106820.0, + "step": 9621 + }, + { + "epoch": 1.2240173005978883, + "ewc_loss": 0.00766999926418066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.669999467907473e-05, + "grad_norm": 3.759305000305176, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8878484964370728, + "num_tokens": 367142298.0, + "step": 9622 + }, + { + "epoch": 1.2241445108764788, + "ewc_loss": 0.0075825052335858345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.582505350001156e-05, + "grad_norm": 3.7863821983337402, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8665564060211182, + "num_tokens": 367179957.0, + "step": 9623 + }, + { + "epoch": 1.2242717211550693, + "ewc_loss": 0.007625069003552198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.625068974448368e-05, + "grad_norm": 3.6977627277374268, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8754734992980957, + "num_tokens": 367226800.0, + "step": 9624 + }, + { + "epoch": 1.2243989314336599, + "ewc_loss": 0.007568034343421459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.568034197902307e-05, + "grad_norm": 3.7968201637268066, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8770333528518677, + "num_tokens": 367257168.0, + "step": 9625 + }, + { + "epoch": 1.2245261417122504, + "ewc_loss": 0.0076636360026896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.663635915378109e-05, + "grad_norm": 3.7461376190185547, + "learning_rate": 1e-06, + "loss": 0.3592, + 
"mean_token_accuracy": 0.8754030466079712, + "num_tokens": 367295254.0, + "step": 9626 + }, + { + "epoch": 1.224653351990841, + "ewc_loss": 0.007591702044010162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.591702160425484e-05, + "grad_norm": 3.7355570793151855, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8616872429847717, + "num_tokens": 367335814.0, + "step": 9627 + }, + { + "epoch": 1.2247805622694314, + "ewc_loss": 0.007617861498147249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.617861410835758e-05, + "grad_norm": 3.7472755908966064, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8709611296653748, + "num_tokens": 367377592.0, + "step": 9628 + }, + { + "epoch": 1.224907772548022, + "ewc_loss": 0.007622879464179277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62287963880226e-05, + "grad_norm": 3.8730220794677734, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8658252954483032, + "num_tokens": 367408069.0, + "step": 9629 + }, + { + "epoch": 1.2250349828266125, + "ewc_loss": 0.007697885390371084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.697885303059593e-05, + "grad_norm": 3.783325672149658, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8739111423492432, + "num_tokens": 367444787.0, + "step": 9630 + }, + { + "epoch": 1.225162193105203, + "ewc_loss": 0.00759917963296175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.59917966206558e-05, + "grad_norm": 3.739891767501831, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8620242476463318, + "num_tokens": 367486848.0, + "step": 9631 + }, + { + "epoch": 1.2252894033837933, + "ewc_loss": 0.00760182598605752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.601825927849859e-05, + "grad_norm": 3.6965885162353516, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8844066858291626, + "num_tokens": 367530456.0, + "step": 9632 + }, + { + "epoch": 
1.2254166136623839, + "ewc_loss": 0.007600228302180767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.600228127557784e-05, + "grad_norm": 3.785717010498047, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.875456690788269, + "num_tokens": 367566513.0, + "step": 9633 + }, + { + "epoch": 1.2255438239409744, + "ewc_loss": 0.007671817671507597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.671817729715258e-05, + "grad_norm": 3.746440887451172, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8532016277313232, + "num_tokens": 367607757.0, + "step": 9634 + }, + { + "epoch": 1.225671034219565, + "ewc_loss": 0.007616234011948109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.616233779117465e-05, + "grad_norm": 3.770280122756958, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8755269050598145, + "num_tokens": 367645525.0, + "step": 9635 + }, + { + "epoch": 1.2257982444981554, + "ewc_loss": 0.007659056223928928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.659056427655742e-05, + "grad_norm": 3.7920615673065186, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8822425007820129, + "num_tokens": 367680840.0, + "step": 9636 + }, + { + "epoch": 1.225925454776746, + "ewc_loss": 0.007638124283403158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.638124225195497e-05, + "grad_norm": 3.7646853923797607, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8801978230476379, + "num_tokens": 367717444.0, + "step": 9637 + }, + { + "epoch": 1.2260526650553365, + "ewc_loss": 0.007636585272848606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.636585360160097e-05, + "grad_norm": 3.810300588607788, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8531304597854614, + "num_tokens": 367756548.0, + "step": 9638 + }, + { + "epoch": 1.226179875333927, + "ewc_loss": 0.007670846302062273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.670846389373764e-05, + "grad_norm": 3.7491402626037598, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8677142858505249, + "num_tokens": 367798341.0, + "step": 9639 + }, + { + "epoch": 1.2263070856125176, + "ewc_loss": 0.007611741777509451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.611741602886468e-05, + "grad_norm": 3.7262210845947266, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8803584575653076, + "num_tokens": 367835087.0, + "step": 9640 + }, + { + "epoch": 1.226434295891108, + "ewc_loss": 0.0076241763308644295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.624176214449108e-05, + "grad_norm": 3.7641313076019287, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8767213821411133, + "num_tokens": 367873816.0, + "step": 9641 + }, + { + "epoch": 1.2265615061696984, + "ewc_loss": 0.007660286035388708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.660286064492539e-05, + "grad_norm": 3.771320343017578, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.869786262512207, + "num_tokens": 367913621.0, + "step": 9642 + }, + { + "epoch": 1.226688716448289, + "ewc_loss": 0.007626701146364212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.626700971741229e-05, + "grad_norm": 3.854247570037842, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8657581806182861, + "num_tokens": 367946881.0, + "step": 9643 + }, + { + "epoch": 1.2268159267268794, + "ewc_loss": 0.007675000932067633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.675000961171463e-05, + "grad_norm": 3.7804489135742188, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.878028154373169, + "num_tokens": 367980806.0, + "step": 9644 + }, + { + "epoch": 1.22694313700547, + "ewc_loss": 0.007619342766702175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.619342795806006e-05, + "grad_norm": 3.75924015045166, + "learning_rate": 1e-06, + "loss": 0.3411, + 
"mean_token_accuracy": 0.8814352750778198, + "num_tokens": 368016041.0, + "step": 9645 + }, + { + "epoch": 1.2270703472840605, + "ewc_loss": 0.0076294285245239735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.629428728250787e-05, + "grad_norm": 3.7938735485076904, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8711482286453247, + "num_tokens": 368052331.0, + "step": 9646 + }, + { + "epoch": 1.227197557562651, + "ewc_loss": 0.007661021780222654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.661021663807333e-05, + "grad_norm": 3.720715284347534, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8825855255126953, + "num_tokens": 368091458.0, + "step": 9647 + }, + { + "epoch": 1.2273247678412416, + "ewc_loss": 0.007617629133164883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.617629307787865e-05, + "grad_norm": 3.738440752029419, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.862497091293335, + "num_tokens": 368132433.0, + "step": 9648 + }, + { + "epoch": 1.227451978119832, + "ewc_loss": 0.007659380789846182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.659380935365334e-05, + "grad_norm": 3.7556934356689453, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8749858140945435, + "num_tokens": 368171030.0, + "step": 9649 + }, + { + "epoch": 1.2275791883984226, + "ewc_loss": 0.007661376614123583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.661376730538905e-05, + "grad_norm": 3.769944667816162, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8626280426979065, + "num_tokens": 368211555.0, + "step": 9650 + }, + { + "epoch": 1.2277063986770131, + "ewc_loss": 0.007644668221473694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.644668221473694e-05, + "grad_norm": 3.772449016571045, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8802224397659302, + "num_tokens": 368252076.0, + "step": 9651 + }, + { + "epoch": 
1.2278336089556037, + "ewc_loss": 0.007646332960575819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.646332960575819e-05, + "grad_norm": 3.8222873210906982, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8663073182106018, + "num_tokens": 368291265.0, + "step": 9652 + }, + { + "epoch": 1.2279608192341942, + "ewc_loss": 0.0076787699945271015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.67876990721561e-05, + "grad_norm": 3.761788845062256, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8667912483215332, + "num_tokens": 368332178.0, + "step": 9653 + }, + { + "epoch": 1.2280880295127847, + "ewc_loss": 0.007637946866452694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.637946691829711e-05, + "grad_norm": 3.763432025909424, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8714989423751831, + "num_tokens": 368374103.0, + "step": 9654 + }, + { + "epoch": 1.2282152397913753, + "ewc_loss": 0.007645678240805864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.645678124390543e-05, + "grad_norm": 3.783784866333008, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8752747178077698, + "num_tokens": 368411928.0, + "step": 9655 + }, + { + "epoch": 1.2283424500699656, + "ewc_loss": 0.007648506201803684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.648506289115176e-05, + "grad_norm": 3.7991855144500732, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8514329791069031, + "num_tokens": 368449200.0, + "step": 9656 + }, + { + "epoch": 1.228469660348556, + "ewc_loss": 0.007655113469809294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.655113586224616e-05, + "grad_norm": 3.7985951900482178, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8569869995117188, + "num_tokens": 368489935.0, + "step": 9657 + }, + { + "epoch": 1.2285968706271466, + "ewc_loss": 0.007644550409168005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.644550350960344e-05, + "grad_norm": 3.77940034866333, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8885602951049805, + "num_tokens": 368522227.0, + "step": 9658 + }, + { + "epoch": 1.2287240809057371, + "ewc_loss": 0.007639428600668907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.63942880439572e-05, + "grad_norm": 3.824979543685913, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.871925413608551, + "num_tokens": 368555084.0, + "step": 9659 + }, + { + "epoch": 1.2288512911843277, + "ewc_loss": 0.007678473833948374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.678473775740713e-05, + "grad_norm": 3.760521650314331, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8707034587860107, + "num_tokens": 368591845.0, + "step": 9660 + }, + { + "epoch": 1.2289785014629182, + "ewc_loss": 0.007628274615854025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.628274761373177e-05, + "grad_norm": 3.7279627323150635, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8627152442932129, + "num_tokens": 368637841.0, + "step": 9661 + }, + { + "epoch": 1.2291057117415087, + "ewc_loss": 0.00763704814016819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.63704811106436e-05, + "grad_norm": 3.700136661529541, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8867841958999634, + "num_tokens": 368677470.0, + "step": 9662 + }, + { + "epoch": 1.2292329220200993, + "ewc_loss": 0.007639371324330568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.639371324330568e-05, + "grad_norm": 3.7582101821899414, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8880594372749329, + "num_tokens": 368718412.0, + "step": 9663 + }, + { + "epoch": 1.2293601322986898, + "ewc_loss": 0.007677930407226086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.677930261706933e-05, + "grad_norm": 3.8411340713500977, + "learning_rate": 1e-06, + "loss": 0.4027, + 
"mean_token_accuracy": 0.8627558350563049, + "num_tokens": 368759267.0, + "step": 9664 + }, + { + "epoch": 1.2294873425772803, + "ewc_loss": 0.007701342925429344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.701342838117853e-05, + "grad_norm": 3.789018392562866, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8658549785614014, + "num_tokens": 368797586.0, + "step": 9665 + }, + { + "epoch": 1.2296145528558708, + "ewc_loss": 0.007626863196492195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.626863225596026e-05, + "grad_norm": 3.790313243865967, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8658543825149536, + "num_tokens": 368833801.0, + "step": 9666 + }, + { + "epoch": 1.2297417631344612, + "ewc_loss": 0.007656225468963385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.656225352548063e-05, + "grad_norm": 3.733107328414917, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8581568598747253, + "num_tokens": 368878513.0, + "step": 9667 + }, + { + "epoch": 1.2298689734130517, + "ewc_loss": 0.007628644350916147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.628644380019978e-05, + "grad_norm": 3.750394821166992, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8673258423805237, + "num_tokens": 368917815.0, + "step": 9668 + }, + { + "epoch": 1.2299961836916422, + "ewc_loss": 0.007654310204088688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.65431032050401e-05, + "grad_norm": 3.773987293243408, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8634827136993408, + "num_tokens": 368954971.0, + "step": 9669 + }, + { + "epoch": 1.2301233939702327, + "ewc_loss": 0.007664392702281475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.664392614969984e-05, + "grad_norm": 3.791210174560547, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8789421319961548, + "num_tokens": 368996212.0, + "step": 9670 + }, + { + "epoch": 
1.2302506042488233, + "ewc_loss": 0.0076641300693154335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.664129952900112e-05, + "grad_norm": 3.733922004699707, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8853327035903931, + "num_tokens": 369032186.0, + "step": 9671 + }, + { + "epoch": 1.2303778145274138, + "ewc_loss": 0.0076275784522295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6275784522295e-05, + "grad_norm": 3.7180516719818115, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8758103251457214, + "num_tokens": 369077565.0, + "step": 9672 + }, + { + "epoch": 1.2305050248060043, + "ewc_loss": 0.007643189746886492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.643189746886492e-05, + "grad_norm": 3.833357572555542, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8806130290031433, + "num_tokens": 369111935.0, + "step": 9673 + }, + { + "epoch": 1.2306322350845948, + "ewc_loss": 0.007704451680183411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.704451854806393e-05, + "grad_norm": 3.725116729736328, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8711686730384827, + "num_tokens": 369154374.0, + "step": 9674 + }, + { + "epoch": 1.2307594453631854, + "ewc_loss": 0.007599557284265757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.599557284265757e-05, + "grad_norm": 3.72723650932312, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8656272888183594, + "num_tokens": 369198557.0, + "step": 9675 + }, + { + "epoch": 1.230886655641776, + "ewc_loss": 0.007648568134754896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.648568134754896e-05, + "grad_norm": 3.8253304958343506, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8561282753944397, + "num_tokens": 369237756.0, + "step": 9676 + }, + { + "epoch": 1.2310138659203664, + "ewc_loss": 0.007699170149862766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.699170237174258e-05, + "grad_norm": 3.862710475921631, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8594541549682617, + "num_tokens": 369270155.0, + "step": 9677 + }, + { + "epoch": 1.231141076198957, + "ewc_loss": 0.007678987458348274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.678987458348274e-05, + "grad_norm": 3.746968984603882, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8648184537887573, + "num_tokens": 369310870.0, + "step": 9678 + }, + { + "epoch": 1.2312682864775475, + "ewc_loss": 0.007609087973833084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.609088061144575e-05, + "grad_norm": 3.835442304611206, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8553557395935059, + "num_tokens": 369344679.0, + "step": 9679 + }, + { + "epoch": 1.231395496756138, + "ewc_loss": 0.0077093010768294334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.709301280556247e-05, + "grad_norm": 3.782748222351074, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8727458715438843, + "num_tokens": 369380388.0, + "step": 9680 + }, + { + "epoch": 1.2315227070347283, + "ewc_loss": 0.007643045857548714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.643045682925731e-05, + "grad_norm": 3.681793212890625, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8856218457221985, + "num_tokens": 369422327.0, + "step": 9681 + }, + { + "epoch": 1.2316499173133189, + "ewc_loss": 0.007613008376210928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.613008347107098e-05, + "grad_norm": 3.848594903945923, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.879837691783905, + "num_tokens": 369457160.0, + "step": 9682 + }, + { + "epoch": 1.2317771275919094, + "ewc_loss": 0.007757114712148905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.757114508422092e-05, + "grad_norm": 3.755335807800293, + "learning_rate": 1e-06, + "loss": 0.3709, + 
"mean_token_accuracy": 0.873515248298645, + "num_tokens": 369497883.0, + "step": 9683 + }, + { + "epoch": 1.2319043378705, + "ewc_loss": 0.007614050060510635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.614050264237449e-05, + "grad_norm": 3.690269947052002, + "learning_rate": 1e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.901282548904419, + "num_tokens": 369540014.0, + "step": 9684 + }, + { + "epoch": 1.2320315481490904, + "ewc_loss": 0.007627504877746105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.627504965057597e-05, + "grad_norm": 3.7591562271118164, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8723889589309692, + "num_tokens": 369580424.0, + "step": 9685 + }, + { + "epoch": 1.232158758427681, + "ewc_loss": 0.007673422805964947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.673422805964947e-05, + "grad_norm": 3.782883405685425, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8683397769927979, + "num_tokens": 369623615.0, + "step": 9686 + }, + { + "epoch": 1.2322859687062715, + "ewc_loss": 0.007656311150640249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.65631120884791e-05, + "grad_norm": 3.8286163806915283, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8616424798965454, + "num_tokens": 369658408.0, + "step": 9687 + }, + { + "epoch": 1.232413178984862, + "ewc_loss": 0.007672375999391079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.672375795664266e-05, + "grad_norm": 3.7811527252197266, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8707805871963501, + "num_tokens": 369696311.0, + "step": 9688 + }, + { + "epoch": 1.2325403892634526, + "ewc_loss": 0.007620211690664291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.620211545145139e-05, + "grad_norm": 3.7904114723205566, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.860950231552124, + "num_tokens": 369737473.0, + "step": 9689 + }, + { + "epoch": 1.232667599542043, 
+ "ewc_loss": 0.0076621538028120995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6621538028121e-05, + "grad_norm": 3.8271217346191406, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8711522817611694, + "num_tokens": 369770151.0, + "step": 9690 + }, + { + "epoch": 1.2327948098206334, + "ewc_loss": 0.007662602700293064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.662602729396895e-05, + "grad_norm": 3.6676812171936035, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8913320302963257, + "num_tokens": 369813503.0, + "step": 9691 + }, + { + "epoch": 1.232922020099224, + "ewc_loss": 0.007574767339974642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.574767369078472e-05, + "grad_norm": 3.8102245330810547, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8742491602897644, + "num_tokens": 369848768.0, + "step": 9692 + }, + { + "epoch": 1.2330492303778144, + "ewc_loss": 0.00769636407494545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696363900322467e-05, + "grad_norm": 3.7721362113952637, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.874906063079834, + "num_tokens": 369892510.0, + "step": 9693 + }, + { + "epoch": 1.233176440656405, + "ewc_loss": 0.007620903663337231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.620903488714248e-05, + "grad_norm": 3.8006436824798584, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8803980946540833, + "num_tokens": 369929638.0, + "step": 9694 + }, + { + "epoch": 1.2333036509349955, + "ewc_loss": 0.007642205338925123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.642205309821293e-05, + "grad_norm": 3.7309789657592773, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8928192853927612, + "num_tokens": 369971201.0, + "step": 9695 + }, + { + "epoch": 1.233430861213586, + "ewc_loss": 0.007590855471789837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.590855238959193e-05, + "grad_norm": 
3.774336814880371, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8672933578491211, + "num_tokens": 370011713.0, + "step": 9696 + }, + { + "epoch": 1.2335580714921766, + "ewc_loss": 0.007635184098035097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.635184010723606e-05, + "grad_norm": 3.8611159324645996, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8541995286941528, + "num_tokens": 370042647.0, + "step": 9697 + }, + { + "epoch": 1.233685281770767, + "ewc_loss": 0.007675572764128447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.675572851439938e-05, + "grad_norm": 3.7569055557250977, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8711957931518555, + "num_tokens": 370083976.0, + "step": 9698 + }, + { + "epoch": 1.2338124920493576, + "ewc_loss": 0.007570542395114899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.570542220491916e-05, + "grad_norm": 3.7497715950012207, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8697715401649475, + "num_tokens": 370126435.0, + "step": 9699 + }, + { + "epoch": 1.2339397023279481, + "ewc_loss": 0.007623092737048864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.623092824360356e-05, + "grad_norm": 3.755084753036499, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8814055323600769, + "num_tokens": 370165038.0, + "step": 9700 + }, + { + "epoch": 1.2340669126065387, + "ewc_loss": 0.007609867490828037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.609867316205055e-05, + "grad_norm": 3.781658411026001, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8664094805717468, + "num_tokens": 370203438.0, + "step": 9701 + }, + { + "epoch": 1.2341941228851292, + "ewc_loss": 0.007623272482305765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.623272540513426e-05, + "grad_norm": 3.8409292697906494, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8878213167190552, + 
"num_tokens": 370235144.0, + "step": 9702 + }, + { + "epoch": 1.2343213331637197, + "ewc_loss": 0.00765770161524415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.65770164434798e-05, + "grad_norm": 3.773920774459839, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8670313358306885, + "num_tokens": 370278936.0, + "step": 9703 + }, + { + "epoch": 1.2344485434423103, + "ewc_loss": 0.007605182938277721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.605183054693043e-05, + "grad_norm": 3.7971203327178955, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8792274594306946, + "num_tokens": 370312595.0, + "step": 9704 + }, + { + "epoch": 1.2345757537209006, + "ewc_loss": 0.007649597711861134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.649597682757303e-05, + "grad_norm": 3.7667481899261475, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8701121211051941, + "num_tokens": 370351697.0, + "step": 9705 + }, + { + "epoch": 1.234702963999491, + "ewc_loss": 0.007630538195371628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.63053831178695e-05, + "grad_norm": 3.7476038932800293, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.885513186454773, + "num_tokens": 370391264.0, + "step": 9706 + }, + { + "epoch": 1.2348301742780816, + "ewc_loss": 0.007628330960869789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.628330786246806e-05, + "grad_norm": 3.8238213062286377, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8803175687789917, + "num_tokens": 370425248.0, + "step": 9707 + }, + { + "epoch": 1.2349573845566721, + "ewc_loss": 0.007665400393307209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.665400335099548e-05, + "grad_norm": 3.843419313430786, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8530499935150146, + "num_tokens": 370462670.0, + "step": 9708 + }, + { + "epoch": 1.2350845948352627, + "ewc_loss": 0.0076604620553553104, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.660462142666802e-05, + "grad_norm": 3.801780939102173, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.853778600692749, + "num_tokens": 370506126.0, + "step": 9709 + }, + { + "epoch": 1.2352118051138532, + "ewc_loss": 0.007612466346472502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.612466288264841e-05, + "grad_norm": 3.722561836242676, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8834803104400635, + "num_tokens": 370546895.0, + "step": 9710 + }, + { + "epoch": 1.2353390153924437, + "ewc_loss": 0.0075920457020401955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.592045585624874e-05, + "grad_norm": 3.7591912746429443, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8771947026252747, + "num_tokens": 370590645.0, + "step": 9711 + }, + { + "epoch": 1.2354662256710343, + "ewc_loss": 0.00764080323278904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.64080323278904e-05, + "grad_norm": 3.784773111343384, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8750075101852417, + "num_tokens": 370628614.0, + "step": 9712 + }, + { + "epoch": 1.2355934359496248, + "ewc_loss": 0.007633293513208628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.633293716935441e-05, + "grad_norm": 3.7989251613616943, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8761279582977295, + "num_tokens": 370666082.0, + "step": 9713 + }, + { + "epoch": 1.2357206462282153, + "ewc_loss": 0.007616546470671892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.616546645294875e-05, + "grad_norm": 3.8066771030426025, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8594744205474854, + "num_tokens": 370705780.0, + "step": 9714 + }, + { + "epoch": 1.2358478565068058, + "ewc_loss": 0.00762086920440197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.620869291713461e-05, + "grad_norm": 3.8430275917053223, + "learning_rate": 
1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8784381747245789, + "num_tokens": 370742550.0, + "step": 9715 + }, + { + "epoch": 1.2359750667853961, + "ewc_loss": 0.007648143917322159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.648143946425989e-05, + "grad_norm": 3.74088716506958, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8765969276428223, + "num_tokens": 370784406.0, + "step": 9716 + }, + { + "epoch": 1.2361022770639867, + "ewc_loss": 0.007570310961455107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.570310845039785e-05, + "grad_norm": 3.8141589164733887, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.868461549282074, + "num_tokens": 370820911.0, + "step": 9717 + }, + { + "epoch": 1.2362294873425772, + "ewc_loss": 0.007650600280612707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.650600309716538e-05, + "grad_norm": 3.8014016151428223, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8837620615959167, + "num_tokens": 370857427.0, + "step": 9718 + }, + { + "epoch": 1.2363566976211677, + "ewc_loss": 0.007612017914652824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.612018089275807e-05, + "grad_norm": 3.7291903495788574, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8644921779632568, + "num_tokens": 370897989.0, + "step": 9719 + }, + { + "epoch": 1.2364839078997583, + "ewc_loss": 0.007584808859974146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.584808918181807e-05, + "grad_norm": 3.8628554344177246, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8603299856185913, + "num_tokens": 370932672.0, + "step": 9720 + }, + { + "epoch": 1.2366111181783488, + "ewc_loss": 0.00770935881882906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.709358760621399e-05, + "grad_norm": 3.820368766784668, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8644543886184692, + "num_tokens": 370971027.0, + "step": 9721 + }, + 
{ + "epoch": 1.2367383284569393, + "ewc_loss": 0.007617186289280653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.617186201969162e-05, + "grad_norm": 3.694589853286743, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8825758099555969, + "num_tokens": 371014742.0, + "step": 9722 + }, + { + "epoch": 1.2368655387355298, + "ewc_loss": 0.007571364287286997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.571364403702319e-05, + "grad_norm": 3.828770160675049, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8574475049972534, + "num_tokens": 371052065.0, + "step": 9723 + }, + { + "epoch": 1.2369927490141204, + "ewc_loss": 0.007703104522079229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.703104347456247e-05, + "grad_norm": 3.794257879257202, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8521656394004822, + "num_tokens": 371093897.0, + "step": 9724 + }, + { + "epoch": 1.237119959292711, + "ewc_loss": 0.0076261041685938835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.626104343216866e-05, + "grad_norm": 3.781191110610962, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.873349130153656, + "num_tokens": 371132390.0, + "step": 9725 + }, + { + "epoch": 1.2372471695713014, + "ewc_loss": 0.007628685794770718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.628685852978379e-05, + "grad_norm": 3.7292304039001465, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8848198652267456, + "num_tokens": 371170468.0, + "step": 9726 + }, + { + "epoch": 1.237374379849892, + "ewc_loss": 0.0076294345781207085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.629434549016878e-05, + "grad_norm": 3.774779796600342, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8707913160324097, + "num_tokens": 371209323.0, + "step": 9727 + }, + { + "epoch": 1.2375015901284825, + "ewc_loss": 0.007662991061806679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.662991265533492e-05, + "grad_norm": 3.7958009243011475, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8732593059539795, + "num_tokens": 371246526.0, + "step": 9728 + }, + { + "epoch": 1.237628800407073, + "ewc_loss": 0.007640213239938021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.64021315262653e-05, + "grad_norm": 3.7797274589538574, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8776563405990601, + "num_tokens": 371282850.0, + "step": 9729 + }, + { + "epoch": 1.2377560106856633, + "ewc_loss": 0.007632669527083635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.632669439772144e-05, + "grad_norm": 3.816251516342163, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8809613585472107, + "num_tokens": 371312614.0, + "step": 9730 + }, + { + "epoch": 1.2378832209642538, + "ewc_loss": 0.0076752048917114735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.67520468798466e-05, + "grad_norm": 3.8065378665924072, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8748958110809326, + "num_tokens": 371350170.0, + "step": 9731 + }, + { + "epoch": 1.2380104312428444, + "ewc_loss": 0.0076647112146019936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.664711301913485e-05, + "grad_norm": 3.746490478515625, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8793354034423828, + "num_tokens": 371389295.0, + "step": 9732 + }, + { + "epoch": 1.238137641521435, + "ewc_loss": 0.007638605311512947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.638605165993795e-05, + "grad_norm": 3.7759275436401367, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8728663921356201, + "num_tokens": 371430579.0, + "step": 9733 + }, + { + "epoch": 1.2382648518000254, + "ewc_loss": 0.007683937903493643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.683938019908965e-05, + "grad_norm": 3.857203722000122, + "learning_rate": 1e-06, + "loss": 0.4095, + 
"mean_token_accuracy": 0.8629412651062012, + "num_tokens": 371465087.0, + "step": 9734 + }, + { + "epoch": 1.238392062078616, + "ewc_loss": 0.0076982746832072735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.698274566791952e-05, + "grad_norm": 3.760627508163452, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8811551332473755, + "num_tokens": 371503549.0, + "step": 9735 + }, + { + "epoch": 1.2385192723572065, + "ewc_loss": 0.007639085408300161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.639085379196331e-05, + "grad_norm": 3.7635750770568848, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8915568590164185, + "num_tokens": 371542953.0, + "step": 9736 + }, + { + "epoch": 1.238646482635797, + "ewc_loss": 0.007673680316656828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.673680374864489e-05, + "grad_norm": 3.776467800140381, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.877554178237915, + "num_tokens": 371582541.0, + "step": 9737 + }, + { + "epoch": 1.2387736929143875, + "ewc_loss": 0.007660238072276115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.660238043172285e-05, + "grad_norm": 3.8066043853759766, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8834962844848633, + "num_tokens": 371617221.0, + "step": 9738 + }, + { + "epoch": 1.238900903192978, + "ewc_loss": 0.007669997401535511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.669997285120189e-05, + "grad_norm": 3.7477827072143555, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8756095170974731, + "num_tokens": 371660773.0, + "step": 9739 + }, + { + "epoch": 1.2390281134715684, + "ewc_loss": 0.007639356888830662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.63935677241534e-05, + "grad_norm": 3.8454558849334717, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8679471611976624, + "num_tokens": 371694763.0, + "step": 9740 + }, + { + "epoch": 
1.239155323750159, + "ewc_loss": 0.007700873538851738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.700873538851738e-05, + "grad_norm": 3.7824013233184814, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8733359575271606, + "num_tokens": 371734793.0, + "step": 9741 + }, + { + "epoch": 1.2392825340287494, + "ewc_loss": 0.007621141616255045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.621141412528232e-05, + "grad_norm": 3.754847288131714, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8667821884155273, + "num_tokens": 371775691.0, + "step": 9742 + }, + { + "epoch": 1.23940974430734, + "ewc_loss": 0.007633209228515625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.633209315827116e-05, + "grad_norm": 3.789262533187866, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.866844892501831, + "num_tokens": 371817319.0, + "step": 9743 + }, + { + "epoch": 1.2395369545859305, + "ewc_loss": 0.007661499083042145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.661498966626823e-05, + "grad_norm": 3.85961651802063, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8718088865280151, + "num_tokens": 371847431.0, + "step": 9744 + }, + { + "epoch": 1.239664164864521, + "ewc_loss": 0.007684347685426474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.684347656322643e-05, + "grad_norm": 3.7765657901763916, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8854763507843018, + "num_tokens": 371881788.0, + "step": 9745 + }, + { + "epoch": 1.2397913751431116, + "ewc_loss": 0.007613534573465586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.613534398842603e-05, + "grad_norm": 3.825568675994873, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8701339960098267, + "num_tokens": 371917388.0, + "step": 9746 + }, + { + "epoch": 1.239918585421702, + "ewc_loss": 0.0076805297285318375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.680529961362481e-05, 
+ "grad_norm": 3.7618627548217773, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8819544315338135, + "num_tokens": 371957561.0, + "step": 9747 + }, + { + "epoch": 1.2400457957002926, + "ewc_loss": 0.007613658905029297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.613658817717806e-05, + "grad_norm": 3.9159018993377686, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8669754266738892, + "num_tokens": 371987322.0, + "step": 9748 + }, + { + "epoch": 1.2401730059788831, + "ewc_loss": 0.007738884538412094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.738884596619755e-05, + "grad_norm": 3.7581424713134766, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8748089075088501, + "num_tokens": 372027263.0, + "step": 9749 + }, + { + "epoch": 1.2403002162574737, + "ewc_loss": 0.007591120898723602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.591120811412111e-05, + "grad_norm": 3.7959344387054443, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8767316937446594, + "num_tokens": 372061441.0, + "step": 9750 + }, + { + "epoch": 1.2404274265360642, + "ewc_loss": 0.007684461772441864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.684461888857186e-05, + "grad_norm": 3.754493474960327, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8793920278549194, + "num_tokens": 372099085.0, + "step": 9751 + }, + { + "epoch": 1.2405546368146547, + "ewc_loss": 0.007649755571037531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.649755571037531e-05, + "grad_norm": 3.7937190532684326, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8755219578742981, + "num_tokens": 372134106.0, + "step": 9752 + }, + { + "epoch": 1.2406818470932452, + "ewc_loss": 0.007681563962250948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.681563874939457e-05, + "grad_norm": 3.885161876678467, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 
0.88324373960495, + "num_tokens": 372164635.0, + "step": 9753 + }, + { + "epoch": 1.2408090573718356, + "ewc_loss": 0.0077295079827308655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729508070042357e-05, + "grad_norm": 3.7411534786224365, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8789316415786743, + "num_tokens": 372204924.0, + "step": 9754 + }, + { + "epoch": 1.240936267650426, + "ewc_loss": 0.007622545585036278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.622545672347769e-05, + "grad_norm": 3.8014543056488037, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8731934428215027, + "num_tokens": 372240939.0, + "step": 9755 + }, + { + "epoch": 1.2410634779290166, + "ewc_loss": 0.007716906722635031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716906839050353e-05, + "grad_norm": 3.869454860687256, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.869059145450592, + "num_tokens": 372272355.0, + "step": 9756 + }, + { + "epoch": 1.2411906882076071, + "ewc_loss": 0.007714888546615839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.714888488408178e-05, + "grad_norm": 3.79050612449646, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8721605539321899, + "num_tokens": 372309566.0, + "step": 9757 + }, + { + "epoch": 1.2413178984861977, + "ewc_loss": 0.0076784780248999596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.678478141315281e-05, + "grad_norm": 3.785747528076172, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8689185976982117, + "num_tokens": 372347768.0, + "step": 9758 + }, + { + "epoch": 1.2414451087647882, + "ewc_loss": 0.007712499238550663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.71249906392768e-05, + "grad_norm": 3.8222501277923584, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8812469244003296, + "num_tokens": 372381020.0, + "step": 9759 + }, + { + "epoch": 1.2415723190433787, + "ewc_loss": 
0.0077425641939044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.742564048385248e-05, + "grad_norm": 3.8166496753692627, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8632090091705322, + "num_tokens": 372422434.0, + "step": 9760 + }, + { + "epoch": 1.2416995293219693, + "ewc_loss": 0.007738129235804081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.738129352219403e-05, + "grad_norm": 3.7892189025878906, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8645555973052979, + "num_tokens": 372459342.0, + "step": 9761 + }, + { + "epoch": 1.2418267396005598, + "ewc_loss": 0.007716292515397072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716292748227715e-05, + "grad_norm": 3.7552804946899414, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8774192333221436, + "num_tokens": 372496637.0, + "step": 9762 + }, + { + "epoch": 1.2419539498791503, + "ewc_loss": 0.00769796734675765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.697967521380633e-05, + "grad_norm": 3.7286174297332764, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8856745958328247, + "num_tokens": 372532068.0, + "step": 9763 + }, + { + "epoch": 1.2420811601577408, + "ewc_loss": 0.007720366585999727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.720366556895897e-05, + "grad_norm": 3.8046419620513916, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8788439631462097, + "num_tokens": 372565460.0, + "step": 9764 + }, + { + "epoch": 1.2422083704363311, + "ewc_loss": 0.007766680791974068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76668093749322e-05, + "grad_norm": 3.751274585723877, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8781736493110657, + "num_tokens": 372605998.0, + "step": 9765 + }, + { + "epoch": 1.2423355807149217, + "ewc_loss": 0.0077027869410812855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.702787115704268e-05, + "grad_norm": 
3.815352201461792, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8641742467880249, + "num_tokens": 372642038.0, + "step": 9766 + }, + { + "epoch": 1.2424627909935122, + "ewc_loss": 0.00774743827059865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.747438212390989e-05, + "grad_norm": 3.760021686553955, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8703060150146484, + "num_tokens": 372678192.0, + "step": 9767 + }, + { + "epoch": 1.2425900012721027, + "ewc_loss": 0.007706526201218367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706526230322197e-05, + "grad_norm": 3.788361072540283, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8701660633087158, + "num_tokens": 372712867.0, + "step": 9768 + }, + { + "epoch": 1.2427172115506933, + "ewc_loss": 0.007744431961327791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.744431786704808e-05, + "grad_norm": 3.876110792160034, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8554359674453735, + "num_tokens": 372745790.0, + "step": 9769 + }, + { + "epoch": 1.2428444218292838, + "ewc_loss": 0.007772192358970642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.772192475385964e-05, + "grad_norm": 3.786722183227539, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8842689394950867, + "num_tokens": 372777634.0, + "step": 9770 + }, + { + "epoch": 1.2429716321078743, + "ewc_loss": 0.007712226826697588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.71222694311291e-05, + "grad_norm": 3.7471330165863037, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8577147126197815, + "num_tokens": 372820527.0, + "step": 9771 + }, + { + "epoch": 1.2430988423864648, + "ewc_loss": 0.007724516559392214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.724516763119027e-05, + "grad_norm": 3.7453062534332275, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8812839984893799, + 
"num_tokens": 372858479.0, + "step": 9772 + }, + { + "epoch": 1.2432260526650554, + "ewc_loss": 0.007734381593763828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.734381506452337e-05, + "grad_norm": 3.755767345428467, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8801184892654419, + "num_tokens": 372895219.0, + "step": 9773 + }, + { + "epoch": 1.243353262943646, + "ewc_loss": 0.007720329333096743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.720329449512064e-05, + "grad_norm": 3.729095697402954, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.88489830493927, + "num_tokens": 372934491.0, + "step": 9774 + }, + { + "epoch": 1.2434804732222364, + "ewc_loss": 0.007696721702814102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696721877437085e-05, + "grad_norm": 3.82011079788208, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8645620942115784, + "num_tokens": 372972892.0, + "step": 9775 + }, + { + "epoch": 1.243607683500827, + "ewc_loss": 0.007769800256937742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76980014052242e-05, + "grad_norm": 3.783738851547241, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8765243887901306, + "num_tokens": 373011564.0, + "step": 9776 + }, + { + "epoch": 1.2437348937794175, + "ewc_loss": 0.007708150893449783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.708150951657444e-05, + "grad_norm": 3.796586036682129, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8754528164863586, + "num_tokens": 373049444.0, + "step": 9777 + }, + { + "epoch": 1.243862104058008, + "ewc_loss": 0.007722790353000164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.722790178377181e-05, + "grad_norm": 3.7878763675689697, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8720770478248596, + "num_tokens": 373085466.0, + "step": 9778 + }, + { + "epoch": 1.2439893143365983, + "ewc_loss": 0.007699413225054741, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.699413254158571e-05, + "grad_norm": 3.7890405654907227, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8515824675559998, + "num_tokens": 373124712.0, + "step": 9779 + }, + { + "epoch": 1.2441165246151888, + "ewc_loss": 0.007716183550655842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716183608863503e-05, + "grad_norm": 3.786205291748047, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.87923264503479, + "num_tokens": 373160008.0, + "step": 9780 + }, + { + "epoch": 1.2442437348937794, + "ewc_loss": 0.007705169264227152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.705169264227152e-05, + "grad_norm": 3.848435640335083, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.874079167842865, + "num_tokens": 373192110.0, + "step": 9781 + }, + { + "epoch": 1.24437094517237, + "ewc_loss": 0.007727782242000103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.727782212896273e-05, + "grad_norm": 3.7393064498901367, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8750712871551514, + "num_tokens": 373234432.0, + "step": 9782 + }, + { + "epoch": 1.2444981554509604, + "ewc_loss": 0.007636608090251684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.636607915628701e-05, + "grad_norm": 3.802600622177124, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.864088773727417, + "num_tokens": 373270725.0, + "step": 9783 + }, + { + "epoch": 1.244625365729551, + "ewc_loss": 0.007712129503488541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.71212944528088e-05, + "grad_norm": 3.7991788387298584, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8511644601821899, + "num_tokens": 373307531.0, + "step": 9784 + }, + { + "epoch": 1.2447525760081415, + "ewc_loss": 0.007695017848163843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.695017848163843e-05, + "grad_norm": 3.7254393100738525, + "learning_rate": 
1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.877151370048523, + "num_tokens": 373351657.0, + "step": 9785 + }, + { + "epoch": 1.244879786286732, + "ewc_loss": 0.007648860570043325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.648860628250986e-05, + "grad_norm": 3.8196513652801514, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8790159821510315, + "num_tokens": 373388811.0, + "step": 9786 + }, + { + "epoch": 1.2450069965653225, + "ewc_loss": 0.007735163904726505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.735163671895862e-05, + "grad_norm": 3.7652270793914795, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8813917636871338, + "num_tokens": 373423378.0, + "step": 9787 + }, + { + "epoch": 1.245134206843913, + "ewc_loss": 0.007662471849471331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.662471762159839e-05, + "grad_norm": 3.7787346839904785, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8761001825332642, + "num_tokens": 373460982.0, + "step": 9788 + }, + { + "epoch": 1.2452614171225034, + "ewc_loss": 0.007704451680183411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.704451854806393e-05, + "grad_norm": 3.720227003097534, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8800015449523926, + "num_tokens": 373501474.0, + "step": 9789 + }, + { + "epoch": 1.245388627401094, + "ewc_loss": 0.007661705836653709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.661705603823066e-05, + "grad_norm": 3.812191963195801, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.875834047794342, + "num_tokens": 373537830.0, + "step": 9790 + }, + { + "epoch": 1.2455158376796844, + "ewc_loss": 0.007725882343947887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.725882460363209e-05, + "grad_norm": 3.786095380783081, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8663415908813477, + "num_tokens": 373573975.0, + "step": 9791 + }, + { 
+ "epoch": 1.245643047958275, + "ewc_loss": 0.007683803327381611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.683803414693102e-05, + "grad_norm": 3.825033187866211, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8541344404220581, + "num_tokens": 373612157.0, + "step": 9792 + }, + { + "epoch": 1.2457702582368655, + "ewc_loss": 0.007711272221058607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.711272337473929e-05, + "grad_norm": 3.788428783416748, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8924717903137207, + "num_tokens": 373645065.0, + "step": 9793 + }, + { + "epoch": 1.245897468515456, + "ewc_loss": 0.007684496697038412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.684496813453734e-05, + "grad_norm": 3.744828224182129, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8692280054092407, + "num_tokens": 373687658.0, + "step": 9794 + }, + { + "epoch": 1.2460246787940465, + "ewc_loss": 0.007665394339710474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.665394514333457e-05, + "grad_norm": 3.7335994243621826, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8754469156265259, + "num_tokens": 373732262.0, + "step": 9795 + }, + { + "epoch": 1.246151889072637, + "ewc_loss": 0.0076616001315414906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.66160010243766e-05, + "grad_norm": 3.799147844314575, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8542228937149048, + "num_tokens": 373772410.0, + "step": 9796 + }, + { + "epoch": 1.2462790993512276, + "ewc_loss": 0.00771339749917388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.713397644693032e-05, + "grad_norm": 3.7578091621398926, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.875064492225647, + "num_tokens": 373808773.0, + "step": 9797 + }, + { + "epoch": 1.2464063096298181, + "ewc_loss": 0.007667043246328831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.667043246328831e-05, + "grad_norm": 3.780506134033203, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8689486384391785, + "num_tokens": 373848945.0, + "step": 9798 + }, + { + "epoch": 1.2465335199084087, + "ewc_loss": 0.0076796445064246655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.679644477320835e-05, + "grad_norm": 3.753235340118408, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8699289560317993, + "num_tokens": 373892716.0, + "step": 9799 + }, + { + "epoch": 1.2466607301869992, + "ewc_loss": 0.0076470766216516495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.647076563443989e-05, + "grad_norm": 3.7602295875549316, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8888293504714966, + "num_tokens": 373929443.0, + "step": 9800 + }, + { + "epoch": 1.2467879404655897, + "ewc_loss": 0.007676610257476568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.67661040299572e-05, + "grad_norm": 3.7938666343688965, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8746904134750366, + "num_tokens": 373968882.0, + "step": 9801 + }, + { + "epoch": 1.2469151507441802, + "ewc_loss": 0.007671302650123835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.671302591916174e-05, + "grad_norm": 3.7920968532562256, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8857688903808594, + "num_tokens": 374009265.0, + "step": 9802 + }, + { + "epoch": 1.2470423610227706, + "ewc_loss": 0.007641306146979332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.64130600146018e-05, + "grad_norm": 3.7924678325653076, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8587460517883301, + "num_tokens": 374047152.0, + "step": 9803 + }, + { + "epoch": 1.247169571301361, + "ewc_loss": 0.007641492411494255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.641492265975103e-05, + "grad_norm": 3.798279285430908, + "learning_rate": 1e-06, + "loss": 0.3782, + 
"mean_token_accuracy": 0.8698626160621643, + "num_tokens": 374083127.0, + "step": 9804 + }, + { + "epoch": 1.2472967815799516, + "ewc_loss": 0.007636853028088808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6368531154003e-05, + "grad_norm": 3.7486207485198975, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8827350735664368, + "num_tokens": 374127086.0, + "step": 9805 + }, + { + "epoch": 1.2474239918585421, + "ewc_loss": 0.007608156185597181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.608156010974199e-05, + "grad_norm": 3.799726724624634, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8669863343238831, + "num_tokens": 374163879.0, + "step": 9806 + }, + { + "epoch": 1.2475512021371327, + "ewc_loss": 0.007660960778594017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.660960545763373e-05, + "grad_norm": 3.759033203125, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8785026669502258, + "num_tokens": 374206639.0, + "step": 9807 + }, + { + "epoch": 1.2476784124157232, + "ewc_loss": 0.007582121063023806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.582121179439127e-05, + "grad_norm": 3.790067434310913, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8704773187637329, + "num_tokens": 374244106.0, + "step": 9808 + }, + { + "epoch": 1.2478056226943137, + "ewc_loss": 0.007638644427061081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.638644456164911e-05, + "grad_norm": 3.7794442176818848, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8706830143928528, + "num_tokens": 374285776.0, + "step": 9809 + }, + { + "epoch": 1.2479328329729042, + "ewc_loss": 0.007602762896567583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.602763071190566e-05, + "grad_norm": 3.776319742202759, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8836076259613037, + "num_tokens": 374323036.0, + "step": 9810 + }, + { + "epoch": 
1.2480600432514948, + "ewc_loss": 0.007608404848724604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.608404848724604e-05, + "grad_norm": 3.7782504558563232, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8800530433654785, + "num_tokens": 374360204.0, + "step": 9811 + }, + { + "epoch": 1.2481872535300853, + "ewc_loss": 0.007622392848134041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.622392877237871e-05, + "grad_norm": 3.7940852642059326, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8842582702636719, + "num_tokens": 374394961.0, + "step": 9812 + }, + { + "epoch": 1.2483144638086758, + "ewc_loss": 0.007625758182257414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.625758007634431e-05, + "grad_norm": 3.7723190784454346, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8750425577163696, + "num_tokens": 374436142.0, + "step": 9813 + }, + { + "epoch": 1.2484416740872661, + "ewc_loss": 0.007605749182403088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.605749124195427e-05, + "grad_norm": 3.7681798934936523, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8704891204833984, + "num_tokens": 374474263.0, + "step": 9814 + }, + { + "epoch": 1.2485688843658567, + "ewc_loss": 0.007616125512868166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.616125367349014e-05, + "grad_norm": 3.811866044998169, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8691936731338501, + "num_tokens": 374510250.0, + "step": 9815 + }, + { + "epoch": 1.2486960946444472, + "ewc_loss": 0.007631199900060892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.631199696334079e-05, + "grad_norm": 3.7204368114471436, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8913940787315369, + "num_tokens": 374549579.0, + "step": 9816 + }, + { + "epoch": 1.2488233049230377, + "ewc_loss": 0.007574490271508694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.574490155093372e-05, + "grad_norm": 3.827958106994629, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8722416758537292, + "num_tokens": 374588631.0, + "step": 9817 + }, + { + "epoch": 1.2489505152016283, + "ewc_loss": 0.007660462986677885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.660462870262563e-05, + "grad_norm": 3.761399745941162, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8694779872894287, + "num_tokens": 374627092.0, + "step": 9818 + }, + { + "epoch": 1.2490777254802188, + "ewc_loss": 0.007569100242108107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.569100125692785e-05, + "grad_norm": 3.8585355281829834, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8803223371505737, + "num_tokens": 374657797.0, + "step": 9819 + }, + { + "epoch": 1.2492049357588093, + "ewc_loss": 0.007675509434193373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.675509550608695e-05, + "grad_norm": 3.7952616214752197, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8777464032173157, + "num_tokens": 374694011.0, + "step": 9820 + }, + { + "epoch": 1.2493321460373998, + "ewc_loss": 0.007600685581564903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.600685785291716e-05, + "grad_norm": 3.7229835987091064, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8757774829864502, + "num_tokens": 374735127.0, + "step": 9821 + }, + { + "epoch": 1.2494593563159904, + "ewc_loss": 0.007605987600982189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.605987775605172e-05, + "grad_norm": 3.7856831550598145, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8710169792175293, + "num_tokens": 374774170.0, + "step": 9822 + }, + { + "epoch": 1.249586566594581, + "ewc_loss": 0.007651063613593578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.651063788216561e-05, + "grad_norm": 3.7977092266082764, + "learning_rate": 1e-06, + "loss": 0.3744, + 
"mean_token_accuracy": 0.8768341541290283, + "num_tokens": 374809162.0, + "step": 9823 + }, + { + "epoch": 1.2497137768731714, + "ewc_loss": 0.007646964397281408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.64696451369673e-05, + "grad_norm": 3.831329345703125, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8656484484672546, + "num_tokens": 374847974.0, + "step": 9824 + }, + { + "epoch": 1.249840987151762, + "ewc_loss": 0.007668859325349331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.668859325349331e-05, + "grad_norm": 3.6826043128967285, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8892916440963745, + "num_tokens": 374894911.0, + "step": 9825 + }, + { + "epoch": 1.2499681974303525, + "ewc_loss": 0.007573266513645649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.573266339022666e-05, + "grad_norm": 3.7992584705352783, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8691869974136353, + "num_tokens": 374933534.0, + "step": 9826 + }, + { + "epoch": 1.250095407708943, + "ewc_loss": 0.007696365937590599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696366083109751e-05, + "grad_norm": 3.862189292907715, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8629544973373413, + "num_tokens": 374968183.0, + "step": 9827 + }, + { + "epoch": 1.2502226179875333, + "ewc_loss": 0.0076764291152358055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.676429231651127e-05, + "grad_norm": 3.7766945362091064, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8671606779098511, + "num_tokens": 375005973.0, + "step": 9828 + }, + { + "epoch": 1.2503498282661238, + "ewc_loss": 0.007628895342350006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.628895400557667e-05, + "grad_norm": 3.7909488677978516, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8674554824829102, + "num_tokens": 375043446.0, + "step": 9829 + }, + { + "epoch": 
1.2504770385447144, + "ewc_loss": 0.007677176967263222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.677177200093865e-05, + "grad_norm": 3.8267784118652344, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8756417036056519, + "num_tokens": 375081518.0, + "step": 9830 + }, + { + "epoch": 1.250604248823305, + "ewc_loss": 0.007692552637308836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.692552753724158e-05, + "grad_norm": 3.8000354766845703, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8668087720870972, + "num_tokens": 375117767.0, + "step": 9831 + }, + { + "epoch": 1.2507314591018954, + "ewc_loss": 0.007659222465008497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.659222319489345e-05, + "grad_norm": 3.8705294132232666, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8506181836128235, + "num_tokens": 375152061.0, + "step": 9832 + }, + { + "epoch": 1.250858669380486, + "ewc_loss": 0.007741763722151518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.741763693047687e-05, + "grad_norm": 3.7337841987609863, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8686209917068481, + "num_tokens": 375195677.0, + "step": 9833 + }, + { + "epoch": 1.2509858796590765, + "ewc_loss": 0.007638926152139902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.63892603572458e-05, + "grad_norm": 3.8339738845825195, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8663278818130493, + "num_tokens": 375230319.0, + "step": 9834 + }, + { + "epoch": 1.251113089937667, + "ewc_loss": 0.0077638751827180386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.763875328237191e-05, + "grad_norm": 3.782205104827881, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8771191239356995, + "num_tokens": 375268970.0, + "step": 9835 + }, + { + "epoch": 1.2512403002162575, + "ewc_loss": 0.00769983371719718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.699833804508671e-05, + "grad_norm": 3.8278491497039795, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8809409737586975, + "num_tokens": 375303661.0, + "step": 9836 + }, + { + "epoch": 1.2513675104948478, + "ewc_loss": 0.0077481744810938835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.748174539301544e-05, + "grad_norm": 3.736549139022827, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8812202215194702, + "num_tokens": 375343278.0, + "step": 9837 + }, + { + "epoch": 1.2514947207734384, + "ewc_loss": 0.0076638078317046165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.663807627977803e-05, + "grad_norm": 3.7798244953155518, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8790724277496338, + "num_tokens": 375378574.0, + "step": 9838 + }, + { + "epoch": 1.251621931052029, + "ewc_loss": 0.0077294884249567986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729488424956799e-05, + "grad_norm": 3.832209587097168, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8687222003936768, + "num_tokens": 375412781.0, + "step": 9839 + }, + { + "epoch": 1.2517491413306194, + "ewc_loss": 0.007740729954093695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.740729779470712e-05, + "grad_norm": 3.801440954208374, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8854314088821411, + "num_tokens": 375450431.0, + "step": 9840 + }, + { + "epoch": 1.25187635160921, + "ewc_loss": 0.007701599504798651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.701599679421633e-05, + "grad_norm": 3.7399396896362305, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.867868185043335, + "num_tokens": 375490692.0, + "step": 9841 + }, + { + "epoch": 1.2520035618878005, + "ewc_loss": 0.007681659888476133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.681659917579964e-05, + "grad_norm": 3.7524831295013428, + "learning_rate": 1e-06, + "loss": 0.365, + 
"mean_token_accuracy": 0.8777551651000977, + "num_tokens": 375535278.0, + "step": 9842 + }, + { + "epoch": 1.252130772166391, + "ewc_loss": 0.007699630223214626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.699630077695474e-05, + "grad_norm": 3.8026437759399414, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8671918511390686, + "num_tokens": 375575859.0, + "step": 9843 + }, + { + "epoch": 1.2522579824449815, + "ewc_loss": 0.007721632719039917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.721632573520765e-05, + "grad_norm": 3.829843044281006, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8668049573898315, + "num_tokens": 375611084.0, + "step": 9844 + }, + { + "epoch": 1.252385192723572, + "ewc_loss": 0.007707646116614342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70764600019902e-05, + "grad_norm": 3.8054122924804688, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8833106160163879, + "num_tokens": 375644926.0, + "step": 9845 + }, + { + "epoch": 1.2525124030021626, + "ewc_loss": 0.007681013084948063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.681013084948063e-05, + "grad_norm": 3.753932237625122, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8825592994689941, + "num_tokens": 375686970.0, + "step": 9846 + }, + { + "epoch": 1.2526396132807531, + "ewc_loss": 0.007679368834942579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.679368718527257e-05, + "grad_norm": 3.740797519683838, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8855090141296387, + "num_tokens": 375727372.0, + "step": 9847 + }, + { + "epoch": 1.2527668235593437, + "ewc_loss": 0.007665644865483046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.665644807275385e-05, + "grad_norm": 3.816863775253296, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8632298707962036, + "num_tokens": 375763568.0, + "step": 9848 + }, + { + "epoch": 
1.2528940338379342, + "ewc_loss": 0.007706147152930498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706147152930498e-05, + "grad_norm": 3.858900547027588, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8563296794891357, + "num_tokens": 375797954.0, + "step": 9849 + }, + { + "epoch": 1.2530212441165247, + "ewc_loss": 0.0077131749130785465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.713175000390038e-05, + "grad_norm": 3.7801315784454346, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8723175525665283, + "num_tokens": 375836362.0, + "step": 9850 + }, + { + "epoch": 1.2531484543951152, + "ewc_loss": 0.007658700458705425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.658700633328408e-05, + "grad_norm": 3.7378921508789062, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8683513402938843, + "num_tokens": 375879403.0, + "step": 9851 + }, + { + "epoch": 1.2532756646737058, + "ewc_loss": 0.007679901085793972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.679901318624616e-05, + "grad_norm": 3.8177003860473633, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8749514818191528, + "num_tokens": 375916728.0, + "step": 9852 + }, + { + "epoch": 1.253402874952296, + "ewc_loss": 0.007726899813860655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.726899639237672e-05, + "grad_norm": 3.827974557876587, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8678207397460938, + "num_tokens": 375951743.0, + "step": 9853 + }, + { + "epoch": 1.2535300852308866, + "ewc_loss": 0.007721235975623131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.721236033830792e-05, + "grad_norm": 3.8228843212127686, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8495396375656128, + "num_tokens": 375988872.0, + "step": 9854 + }, + { + "epoch": 1.2536572955094771, + "ewc_loss": 0.007712314371019602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.71231425460428e-05, + "grad_norm": 3.7298998832702637, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8637118339538574, + "num_tokens": 376030388.0, + "step": 9855 + }, + { + "epoch": 1.2537845057880677, + "ewc_loss": 0.00766562856733799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.665628800168633e-05, + "grad_norm": 3.814588785171509, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8812611699104309, + "num_tokens": 376062975.0, + "step": 9856 + }, + { + "epoch": 1.2539117160666582, + "ewc_loss": 0.007757736369967461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.757736602798104e-05, + "grad_norm": 3.7689878940582275, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8653233051300049, + "num_tokens": 376101873.0, + "step": 9857 + }, + { + "epoch": 1.2540389263452487, + "ewc_loss": 0.007699612062424421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.699611887801439e-05, + "grad_norm": 3.8711485862731934, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8541397452354431, + "num_tokens": 376138774.0, + "step": 9858 + }, + { + "epoch": 1.2541661366238392, + "ewc_loss": 0.007781361695379019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.781361637171358e-05, + "grad_norm": 3.846078395843506, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8663561940193176, + "num_tokens": 376172357.0, + "step": 9859 + }, + { + "epoch": 1.2542933469024298, + "ewc_loss": 0.007744300179183483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.744300091871992e-05, + "grad_norm": 3.7654685974121094, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8642295002937317, + "num_tokens": 376212977.0, + "step": 9860 + }, + { + "epoch": 1.2544205571810203, + "ewc_loss": 0.007701371796429157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70137194194831e-05, + "grad_norm": 3.7052860260009766, + "learning_rate": 1e-06, + "loss": 0.3979, + 
"mean_token_accuracy": 0.8647096157073975, + "num_tokens": 376256989.0, + "step": 9861 + }, + { + "epoch": 1.2545477674596106, + "ewc_loss": 0.007706352509558201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706352334935218e-05, + "grad_norm": 3.736726999282837, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8773249387741089, + "num_tokens": 376300130.0, + "step": 9862 + }, + { + "epoch": 1.2546749777382011, + "ewc_loss": 0.007729268167167902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729267963441089e-05, + "grad_norm": 3.7768611907958984, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8773020505905151, + "num_tokens": 376337475.0, + "step": 9863 + }, + { + "epoch": 1.2548021880167917, + "ewc_loss": 0.007714119274169207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.714119419688359e-05, + "grad_norm": 3.792203903198242, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8647115230560303, + "num_tokens": 376376485.0, + "step": 9864 + }, + { + "epoch": 1.2549293982953822, + "ewc_loss": 0.007732574827969074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.732574886176735e-05, + "grad_norm": 3.787705421447754, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8800193071365356, + "num_tokens": 376413393.0, + "step": 9865 + }, + { + "epoch": 1.2550566085739727, + "ewc_loss": 0.0077273668721318245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.727366755716503e-05, + "grad_norm": 3.835707187652588, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8632144331932068, + "num_tokens": 376448192.0, + "step": 9866 + }, + { + "epoch": 1.2551838188525632, + "ewc_loss": 0.007743332535028458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.743332389509305e-05, + "grad_norm": 3.7507805824279785, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8757942318916321, + "num_tokens": 376489382.0, + "step": 9867 + }, + { + "epoch": 
1.2553110291311538, + "ewc_loss": 0.007676891982555389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.67689198255539e-05, + "grad_norm": 3.7804505825042725, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8762775659561157, + "num_tokens": 376527221.0, + "step": 9868 + }, + { + "epoch": 1.2554382394097443, + "ewc_loss": 0.007735217455774546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.735217513982207e-05, + "grad_norm": 3.7723166942596436, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.868024468421936, + "num_tokens": 376567045.0, + "step": 9869 + }, + { + "epoch": 1.2555654496883348, + "ewc_loss": 0.007693804334849119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.693804218433797e-05, + "grad_norm": 3.727142810821533, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8786382675170898, + "num_tokens": 376607039.0, + "step": 9870 + }, + { + "epoch": 1.2556926599669254, + "ewc_loss": 0.007686523254960775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.686523167649284e-05, + "grad_norm": 3.777334213256836, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8779333829879761, + "num_tokens": 376643220.0, + "step": 9871 + }, + { + "epoch": 1.255819870245516, + "ewc_loss": 0.00772151118144393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.721511065028608e-05, + "grad_norm": 3.755679130554199, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8924386501312256, + "num_tokens": 376681417.0, + "step": 9872 + }, + { + "epoch": 1.2559470805241064, + "ewc_loss": 0.007695542648434639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.695542444707826e-05, + "grad_norm": 3.755783796310425, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8902121782302856, + "num_tokens": 376722836.0, + "step": 9873 + }, + { + "epoch": 1.256074290802697, + "ewc_loss": 0.00768758961930871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.687589823035523e-05, + 
"grad_norm": 3.8002583980560303, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8660720586776733, + "num_tokens": 376763194.0, + "step": 9874 + }, + { + "epoch": 1.2562015010812875, + "ewc_loss": 0.0076963710598647594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696371176280081e-05, + "grad_norm": 3.7915310859680176, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8674290180206299, + "num_tokens": 376803701.0, + "step": 9875 + }, + { + "epoch": 1.256328711359878, + "ewc_loss": 0.0076700234785676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6700234785676e-05, + "grad_norm": 3.802272319793701, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8824912905693054, + "num_tokens": 376835296.0, + "step": 9876 + }, + { + "epoch": 1.2564559216384683, + "ewc_loss": 0.007675266359001398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.675266533624381e-05, + "grad_norm": 3.7532219886779785, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8763425350189209, + "num_tokens": 376876317.0, + "step": 9877 + }, + { + "epoch": 1.2565831319170588, + "ewc_loss": 0.007638921495527029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.638921670150012e-05, + "grad_norm": 3.7514660358428955, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.870965838432312, + "num_tokens": 376916403.0, + "step": 9878 + }, + { + "epoch": 1.2567103421956494, + "ewc_loss": 0.007655267138034105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.655267108930275e-05, + "grad_norm": 3.782151937484741, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8819793462753296, + "num_tokens": 376956079.0, + "step": 9879 + }, + { + "epoch": 1.25683755247424, + "ewc_loss": 0.0076682730577886105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.668272883165628e-05, + "grad_norm": 3.9107677936553955, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8629648089408875, + 
"num_tokens": 376991618.0, + "step": 9880 + }, + { + "epoch": 1.2569647627528304, + "ewc_loss": 0.007721590343862772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.721590372966602e-05, + "grad_norm": 3.765979051589966, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8795437812805176, + "num_tokens": 377033964.0, + "step": 9881 + }, + { + "epoch": 1.257091973031421, + "ewc_loss": 0.007590155117213726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.590155291836709e-05, + "grad_norm": 3.791579246520996, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8790199160575867, + "num_tokens": 377070091.0, + "step": 9882 + }, + { + "epoch": 1.2572191833100115, + "ewc_loss": 0.00765047874301672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.650478801224381e-05, + "grad_norm": 3.734609842300415, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8823869228363037, + "num_tokens": 377111185.0, + "step": 9883 + }, + { + "epoch": 1.257346393588602, + "ewc_loss": 0.0075782532803714275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.578253280371428e-05, + "grad_norm": 3.773224115371704, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8785676956176758, + "num_tokens": 377147637.0, + "step": 9884 + }, + { + "epoch": 1.2574736038671925, + "ewc_loss": 0.007644219323992729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.644219294888899e-05, + "grad_norm": 3.772780179977417, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8721255660057068, + "num_tokens": 377181918.0, + "step": 9885 + }, + { + "epoch": 1.2576008141457828, + "ewc_loss": 0.007622569799423218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.622569683007896e-05, + "grad_norm": 3.7844059467315674, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8679365515708923, + "num_tokens": 377222900.0, + "step": 9886 + }, + { + "epoch": 1.2577280244243734, + "ewc_loss": 0.007629139348864555, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.629139145137742e-05, + "grad_norm": 3.7323930263519287, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8866773247718811, + "num_tokens": 377263107.0, + "step": 9887 + }, + { + "epoch": 1.257855234702964, + "ewc_loss": 0.007609859108924866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.609859312651679e-05, + "grad_norm": 3.755340814590454, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8881864547729492, + "num_tokens": 377301384.0, + "step": 9888 + }, + { + "epoch": 1.2579824449815544, + "ewc_loss": 0.007627003826200962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62700365157798e-05, + "grad_norm": 3.8273770809173584, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.867581307888031, + "num_tokens": 377339865.0, + "step": 9889 + }, + { + "epoch": 1.258109655260145, + "ewc_loss": 0.007648827042430639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.648827158845961e-05, + "grad_norm": 3.7531254291534424, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8818055987358093, + "num_tokens": 377381582.0, + "step": 9890 + }, + { + "epoch": 1.2582368655387355, + "ewc_loss": 0.007571707013994455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.571707101305947e-05, + "grad_norm": 3.76016902923584, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8512086868286133, + "num_tokens": 377425548.0, + "step": 9891 + }, + { + "epoch": 1.258364075817326, + "ewc_loss": 0.00759799825027585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.597998046549037e-05, + "grad_norm": 3.823155164718628, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8726468086242676, + "num_tokens": 377462298.0, + "step": 9892 + }, + { + "epoch": 1.2584912860959165, + "ewc_loss": 0.007627676706761122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.627676677657291e-05, + "grad_norm": 3.7490437030792236, + "learning_rate": 
1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8654807806015015, + "num_tokens": 377503873.0, + "step": 9893 + }, + { + "epoch": 1.258618496374507, + "ewc_loss": 0.0075546070002019405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.554607145721093e-05, + "grad_norm": 3.7671637535095215, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8891425728797913, + "num_tokens": 377540497.0, + "step": 9894 + }, + { + "epoch": 1.2587457066530976, + "ewc_loss": 0.007601034361869097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.601034303661436e-05, + "grad_norm": 3.7970640659332275, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8741227388381958, + "num_tokens": 377582242.0, + "step": 9895 + }, + { + "epoch": 1.2588729169316881, + "ewc_loss": 0.007589427754282951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.58942769607529e-05, + "grad_norm": 3.785118341445923, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8886985182762146, + "num_tokens": 377614972.0, + "step": 9896 + }, + { + "epoch": 1.2590001272102787, + "ewc_loss": 0.007590286433696747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.590286259073764e-05, + "grad_norm": 3.918229341506958, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8709731101989746, + "num_tokens": 377645392.0, + "step": 9897 + }, + { + "epoch": 1.2591273374888692, + "ewc_loss": 0.007662453688681126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.662453572265804e-05, + "grad_norm": 3.783142328262329, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8812333941459656, + "num_tokens": 377680317.0, + "step": 9898 + }, + { + "epoch": 1.2592545477674597, + "ewc_loss": 0.007560839876532555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.560839731013402e-05, + "grad_norm": 3.828855514526367, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.872559130191803, + "num_tokens": 377714879.0, + "step": 9899 + }, + { 
+ "epoch": 1.2593817580460502, + "ewc_loss": 0.007652022875845432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.65202275943011e-05, + "grad_norm": 3.8068807125091553, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8597513437271118, + "num_tokens": 377751404.0, + "step": 9900 + }, + { + "epoch": 1.2595089683246408, + "ewc_loss": 0.00761454226449132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.614542118972167e-05, + "grad_norm": 3.769683599472046, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8780941367149353, + "num_tokens": 377788354.0, + "step": 9901 + }, + { + "epoch": 1.259636178603231, + "ewc_loss": 0.0076178889721632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.617889059474692e-05, + "grad_norm": 3.796675205230713, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.864374577999115, + "num_tokens": 377827259.0, + "step": 9902 + }, + { + "epoch": 1.2597633888818216, + "ewc_loss": 0.007670773658901453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.670773629797623e-05, + "grad_norm": 3.758255958557129, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8646007776260376, + "num_tokens": 377872477.0, + "step": 9903 + }, + { + "epoch": 1.2598905991604121, + "ewc_loss": 0.007625068072229624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.625068246852607e-05, + "grad_norm": 3.8101813793182373, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8654422760009766, + "num_tokens": 377911159.0, + "step": 9904 + }, + { + "epoch": 1.2600178094390027, + "ewc_loss": 0.007693103514611721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.693103543715551e-05, + "grad_norm": 3.7580323219299316, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8777189254760742, + "num_tokens": 377951919.0, + "step": 9905 + }, + { + "epoch": 1.2601450197175932, + "ewc_loss": 0.007639062590897083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.639062823727727e-05, + "grad_norm": 3.7775018215179443, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8553248047828674, + "num_tokens": 377997567.0, + "step": 9906 + }, + { + "epoch": 1.2602722299961837, + "ewc_loss": 0.007682616822421551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.682616706006229e-05, + "grad_norm": 3.796818256378174, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8718894720077515, + "num_tokens": 378033811.0, + "step": 9907 + }, + { + "epoch": 1.2603994402747742, + "ewc_loss": 0.007679754868149757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.67975507187657e-05, + "grad_norm": 3.8034555912017822, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8614704012870789, + "num_tokens": 378070136.0, + "step": 9908 + }, + { + "epoch": 1.2605266505533648, + "ewc_loss": 0.007701147347688675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.701147114858031e-05, + "grad_norm": 3.876507043838501, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8698236346244812, + "num_tokens": 378106902.0, + "step": 9909 + }, + { + "epoch": 1.2606538608319553, + "ewc_loss": 0.007740334141999483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.7403339673765e-05, + "grad_norm": 3.7649879455566406, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8761123418807983, + "num_tokens": 378144638.0, + "step": 9910 + }, + { + "epoch": 1.2607810711105456, + "ewc_loss": 0.007649210747331381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.649210601812229e-05, + "grad_norm": 3.754779815673828, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8834877014160156, + "num_tokens": 378179059.0, + "step": 9911 + }, + { + "epoch": 1.2609082813891361, + "ewc_loss": 0.0076820142567157745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.682014256715775e-05, + "grad_norm": 3.902644395828247, + "learning_rate": 1e-06, + "loss": 0.409, + 
"mean_token_accuracy": 0.8592963218688965, + "num_tokens": 378210346.0, + "step": 9912 + }, + { + "epoch": 1.2610354916677267, + "ewc_loss": 0.007760907988995314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.760908192722127e-05, + "grad_norm": 3.798729181289673, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8788287043571472, + "num_tokens": 378242687.0, + "step": 9913 + }, + { + "epoch": 1.2611627019463172, + "ewc_loss": 0.007663080934435129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.663080759812146e-05, + "grad_norm": 3.753505229949951, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8782181739807129, + "num_tokens": 378279421.0, + "step": 9914 + }, + { + "epoch": 1.2612899122249077, + "ewc_loss": 0.0076897223480045795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.68972240621224e-05, + "grad_norm": 3.784029245376587, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8696776628494263, + "num_tokens": 378318133.0, + "step": 9915 + }, + { + "epoch": 1.2614171225034982, + "ewc_loss": 0.007720199413597584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.72019920987077e-05, + "grad_norm": 3.7573585510253906, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8562839031219482, + "num_tokens": 378362987.0, + "step": 9916 + }, + { + "epoch": 1.2615443327820888, + "ewc_loss": 0.007694695610553026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.694695523241535e-05, + "grad_norm": 3.7723968029022217, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8896011114120483, + "num_tokens": 378396197.0, + "step": 9917 + }, + { + "epoch": 1.2616715430606793, + "ewc_loss": 0.007732070051133633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.732069934718311e-05, + "grad_norm": 3.807520866394043, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8818545341491699, + "num_tokens": 378433691.0, + "step": 9918 + }, + { + "epoch": 
1.2617987533392698, + "ewc_loss": 0.0077478596940636635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.74785949033685e-05, + "grad_norm": 3.7641854286193848, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8760915398597717, + "num_tokens": 378470277.0, + "step": 9919 + }, + { + "epoch": 1.2619259636178604, + "ewc_loss": 0.00769124086946249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.69124089856632e-05, + "grad_norm": 3.7543423175811768, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8681225180625916, + "num_tokens": 378510500.0, + "step": 9920 + }, + { + "epoch": 1.2620531738964509, + "ewc_loss": 0.007722192909568548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.722192822257057e-05, + "grad_norm": 3.809253454208374, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8720928430557251, + "num_tokens": 378552893.0, + "step": 9921 + }, + { + "epoch": 1.2621803841750414, + "ewc_loss": 0.007747786119580269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.747786003164947e-05, + "grad_norm": 3.758270025253296, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8812774419784546, + "num_tokens": 378590917.0, + "step": 9922 + }, + { + "epoch": 1.262307594453632, + "ewc_loss": 0.0076935975812375546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.693597581237555e-05, + "grad_norm": 3.746309280395508, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8821778297424316, + "num_tokens": 378637764.0, + "step": 9923 + }, + { + "epoch": 1.2624348047322225, + "ewc_loss": 0.007701889146119356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.701889262534678e-05, + "grad_norm": 3.7997190952301025, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.871737539768219, + "num_tokens": 378675497.0, + "step": 9924 + }, + { + "epoch": 1.262562015010813, + "ewc_loss": 0.007733235601335764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.733235543128103e-05, + "grad_norm": 3.8706231117248535, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8647116422653198, + "num_tokens": 378710470.0, + "step": 9925 + }, + { + "epoch": 1.2626892252894033, + "ewc_loss": 0.007745548151433468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.745547918602824e-05, + "grad_norm": 3.7308509349823, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8688403964042664, + "num_tokens": 378758116.0, + "step": 9926 + }, + { + "epoch": 1.2628164355679938, + "ewc_loss": 0.007632466498762369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.632466440554708e-05, + "grad_norm": 3.7498767375946045, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.870847225189209, + "num_tokens": 378798166.0, + "step": 9927 + }, + { + "epoch": 1.2629436458465844, + "ewc_loss": 0.0077151148580014706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.715114770689979e-05, + "grad_norm": 3.7171266078948975, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8883089423179626, + "num_tokens": 378839899.0, + "step": 9928 + }, + { + "epoch": 1.263070856125175, + "ewc_loss": 0.007646884303539991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.646884478162974e-05, + "grad_norm": 3.7130863666534424, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8743025064468384, + "num_tokens": 378886022.0, + "step": 9929 + }, + { + "epoch": 1.2631980664037654, + "ewc_loss": 0.007662479765713215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.662479765713215e-05, + "grad_norm": 3.772923469543457, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.862816333770752, + "num_tokens": 378925097.0, + "step": 9930 + }, + { + "epoch": 1.263325276682356, + "ewc_loss": 0.007698361296206713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.69836115068756e-05, + "grad_norm": 3.757669448852539, + "learning_rate": 1e-06, + "loss": 0.3411, + 
"mean_token_accuracy": 0.885697066783905, + "num_tokens": 378965242.0, + "step": 9931 + }, + { + "epoch": 1.2634524869609465, + "ewc_loss": 0.007653713691979647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.653713691979647e-05, + "grad_norm": 3.8182530403137207, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8545064330101013, + "num_tokens": 379005932.0, + "step": 9932 + }, + { + "epoch": 1.263579697239537, + "ewc_loss": 0.007696126122027636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696125976508483e-05, + "grad_norm": 3.7651021480560303, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8709858655929565, + "num_tokens": 379045817.0, + "step": 9933 + }, + { + "epoch": 1.2637069075181275, + "ewc_loss": 0.007626618258655071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.626618025824428e-05, + "grad_norm": 3.749149799346924, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8671945929527283, + "num_tokens": 379086481.0, + "step": 9934 + }, + { + "epoch": 1.2638341177967178, + "ewc_loss": 0.007650050334632397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.650050247320905e-05, + "grad_norm": 3.76576566696167, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8575655817985535, + "num_tokens": 379129782.0, + "step": 9935 + }, + { + "epoch": 1.2639613280753084, + "ewc_loss": 0.007669116836041212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.669116894248873e-05, + "grad_norm": 3.7786967754364014, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8716457486152649, + "num_tokens": 379171525.0, + "step": 9936 + }, + { + "epoch": 1.264088538353899, + "ewc_loss": 0.007645822130143642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.645822188351303e-05, + "grad_norm": 3.7571206092834473, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8675719499588013, + "num_tokens": 379213887.0, + "step": 9937 + }, + { + "epoch": 
1.2642157486324894, + "ewc_loss": 0.007637877482920885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.637877570232376e-05, + "grad_norm": 3.7641098499298096, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8728134036064148, + "num_tokens": 379252634.0, + "step": 9938 + }, + { + "epoch": 1.26434295891108, + "ewc_loss": 0.007644052151590586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.644051947863773e-05, + "grad_norm": 3.839615821838379, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8715794086456299, + "num_tokens": 379287402.0, + "step": 9939 + }, + { + "epoch": 1.2644701691896705, + "ewc_loss": 0.007690146565437317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.690146594541147e-05, + "grad_norm": 3.7677347660064697, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8788861036300659, + "num_tokens": 379325592.0, + "step": 9940 + }, + { + "epoch": 1.264597379468261, + "ewc_loss": 0.007619617972522974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.619617827003822e-05, + "grad_norm": 3.890890598297119, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8606795072555542, + "num_tokens": 379359010.0, + "step": 9941 + }, + { + "epoch": 1.2647245897468515, + "ewc_loss": 0.007725986652076244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.725986506557092e-05, + "grad_norm": 3.8017406463623047, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8660681843757629, + "num_tokens": 379399783.0, + "step": 9942 + }, + { + "epoch": 1.264851800025442, + "ewc_loss": 0.007630976382642984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.630976324435323e-05, + "grad_norm": 3.7921178340911865, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8768733739852905, + "num_tokens": 379442207.0, + "step": 9943 + }, + { + "epoch": 1.2649790103040326, + "ewc_loss": 0.007635167799890041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.635168003616855e-05, + "grad_norm": 3.776393413543701, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8712601661682129, + "num_tokens": 379479561.0, + "step": 9944 + }, + { + "epoch": 1.2651062205826231, + "ewc_loss": 0.0076471478678286076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.647147867828608e-05, + "grad_norm": 3.795849323272705, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8788434863090515, + "num_tokens": 379516413.0, + "step": 9945 + }, + { + "epoch": 1.2652334308612136, + "ewc_loss": 0.007662330754101276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.662330608582124e-05, + "grad_norm": 3.7621536254882812, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8644466400146484, + "num_tokens": 379557993.0, + "step": 9946 + }, + { + "epoch": 1.2653606411398042, + "ewc_loss": 0.007634682115167379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.634681969648227e-05, + "grad_norm": 3.8265514373779297, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8555540442466736, + "num_tokens": 379594028.0, + "step": 9947 + }, + { + "epoch": 1.2654878514183947, + "ewc_loss": 0.0076961275190114975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696127431700006e-05, + "grad_norm": 3.8057119846343994, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8684046268463135, + "num_tokens": 379631235.0, + "step": 9948 + }, + { + "epoch": 1.2656150616969852, + "ewc_loss": 0.00765898497775197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.658985123271123e-05, + "grad_norm": 3.796694040298462, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8620948791503906, + "num_tokens": 379668714.0, + "step": 9949 + }, + { + "epoch": 1.2657422719755758, + "ewc_loss": 0.007683778181672096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.683777948841453e-05, + "grad_norm": 3.746227741241455, + "learning_rate": 1e-06, + "loss": 0.353, + 
"mean_token_accuracy": 0.8778289556503296, + "num_tokens": 379710233.0, + "step": 9950 + }, + { + "epoch": 1.265869482254166, + "ewc_loss": 0.007670385763049126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.670385821256787e-05, + "grad_norm": 3.832545518875122, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8704404830932617, + "num_tokens": 379746208.0, + "step": 9951 + }, + { + "epoch": 1.2659966925327566, + "ewc_loss": 0.007740913890302181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.740913861198351e-05, + "grad_norm": 3.7163941860198975, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8818300366401672, + "num_tokens": 379796420.0, + "step": 9952 + }, + { + "epoch": 1.2661239028113471, + "ewc_loss": 0.007632994093000889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.632993947481737e-05, + "grad_norm": 3.8455679416656494, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8747760653495789, + "num_tokens": 379830027.0, + "step": 9953 + }, + { + "epoch": 1.2662511130899377, + "ewc_loss": 0.007749863434582949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.749863289063796e-05, + "grad_norm": 3.758348226547241, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8721370697021484, + "num_tokens": 379871751.0, + "step": 9954 + }, + { + "epoch": 1.2663783233685282, + "ewc_loss": 0.0076597388833761215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.659738912479952e-05, + "grad_norm": 3.795654296875, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8652998208999634, + "num_tokens": 379908255.0, + "step": 9955 + }, + { + "epoch": 1.2665055336471187, + "ewc_loss": 0.0077010393142700195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.701039430685341e-05, + "grad_norm": 3.79160213470459, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8745085597038269, + "num_tokens": 379944502.0, + "step": 9956 + }, + { + "epoch": 
1.2666327439257092, + "ewc_loss": 0.007681047078222036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.68104728194885e-05, + "grad_norm": 3.77944016456604, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8696653842926025, + "num_tokens": 379981929.0, + "step": 9957 + }, + { + "epoch": 1.2667599542042998, + "ewc_loss": 0.007672154810279608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.672154606552795e-05, + "grad_norm": 3.7788331508636475, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8667292594909668, + "num_tokens": 380018876.0, + "step": 9958 + }, + { + "epoch": 1.2668871644828903, + "ewc_loss": 0.007678863126784563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.678863039473072e-05, + "grad_norm": 3.7927563190460205, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8669511079788208, + "num_tokens": 380054153.0, + "step": 9959 + }, + { + "epoch": 1.2670143747614806, + "ewc_loss": 0.007685358170419931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.685358286835253e-05, + "grad_norm": 3.8723318576812744, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.864669919013977, + "num_tokens": 380086550.0, + "step": 9960 + }, + { + "epoch": 1.2671415850400711, + "ewc_loss": 0.007739790249615908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.739790453342721e-05, + "grad_norm": 3.8417418003082275, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8671759366989136, + "num_tokens": 380120308.0, + "step": 9961 + }, + { + "epoch": 1.2672687953186617, + "ewc_loss": 0.007696045096963644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696045213378966e-05, + "grad_norm": 3.7689895629882812, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8570200204849243, + "num_tokens": 380160204.0, + "step": 9962 + }, + { + "epoch": 1.2673960055972522, + "ewc_loss": 0.007668572477996349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.668572652619332e-05, + "grad_norm": 3.7886343002319336, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8754026889801025, + "num_tokens": 380196687.0, + "step": 9963 + }, + { + "epoch": 1.2675232158758427, + "ewc_loss": 0.00771403918042779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.714039384154603e-05, + "grad_norm": 3.7305376529693604, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.888058602809906, + "num_tokens": 380237451.0, + "step": 9964 + }, + { + "epoch": 1.2676504261544332, + "ewc_loss": 0.007676857989281416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.676857785554603e-05, + "grad_norm": 3.793337821960449, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8676902055740356, + "num_tokens": 380276631.0, + "step": 9965 + }, + { + "epoch": 1.2677776364330238, + "ewc_loss": 0.0077218785881996155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.721878500888124e-05, + "grad_norm": 3.768575668334961, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8767841458320618, + "num_tokens": 380319432.0, + "step": 9966 + }, + { + "epoch": 1.2679048467116143, + "ewc_loss": 0.007677913643419743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.677913527004421e-05, + "grad_norm": 3.7685387134552, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8791897296905518, + "num_tokens": 380356666.0, + "step": 9967 + }, + { + "epoch": 1.2680320569902048, + "ewc_loss": 0.00767603050917387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.67603050917387e-05, + "grad_norm": 3.8141865730285645, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8797175884246826, + "num_tokens": 380390347.0, + "step": 9968 + }, + { + "epoch": 1.2681592672687954, + "ewc_loss": 0.007717021740972996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.717021799180657e-05, + "grad_norm": 3.8104746341705322, + "learning_rate": 1e-06, + "loss": 0.3617, + 
"mean_token_accuracy": 0.8775261044502258, + "num_tokens": 380424083.0, + "step": 9969 + }, + { + "epoch": 1.2682864775473859, + "ewc_loss": 0.007704504765570164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.704504969296977e-05, + "grad_norm": 3.7522215843200684, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8672502636909485, + "num_tokens": 380471179.0, + "step": 9970 + }, + { + "epoch": 1.2684136878259764, + "ewc_loss": 0.007678622379899025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.678622205276042e-05, + "grad_norm": 3.8152668476104736, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8740604519844055, + "num_tokens": 380510291.0, + "step": 9971 + }, + { + "epoch": 1.268540898104567, + "ewc_loss": 0.007709810510277748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70981059758924e-05, + "grad_norm": 3.7304024696350098, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8775066137313843, + "num_tokens": 380556297.0, + "step": 9972 + }, + { + "epoch": 1.2686681083831575, + "ewc_loss": 0.007658780552446842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.658780668862164e-05, + "grad_norm": 3.8140642642974854, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8824101686477661, + "num_tokens": 380592113.0, + "step": 9973 + }, + { + "epoch": 1.268795318661748, + "ewc_loss": 0.007725833915174007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.725833711447194e-05, + "grad_norm": 3.8244478702545166, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.884904682636261, + "num_tokens": 380623942.0, + "step": 9974 + }, + { + "epoch": 1.2689225289403383, + "ewc_loss": 0.007700268179178238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.700268179178238e-05, + "grad_norm": 3.773510456085205, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8665243983268738, + "num_tokens": 380662300.0, + "step": 9975 + }, + { + "epoch": 
1.2690497392189288, + "ewc_loss": 0.007657080423086882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.65708027756773e-05, + "grad_norm": 3.779874801635742, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8745517730712891, + "num_tokens": 380701452.0, + "step": 9976 + }, + { + "epoch": 1.2691769494975194, + "ewc_loss": 0.007702391128987074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.702391303610057e-05, + "grad_norm": 3.771937370300293, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.883285641670227, + "num_tokens": 380741248.0, + "step": 9977 + }, + { + "epoch": 1.2693041597761099, + "ewc_loss": 0.007674997206777334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.674997323192656e-05, + "grad_norm": 3.8080196380615234, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8666054010391235, + "num_tokens": 380781455.0, + "step": 9978 + }, + { + "epoch": 1.2694313700547004, + "ewc_loss": 0.007704624906182289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.704625022597611e-05, + "grad_norm": 3.8349268436431885, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8655387163162231, + "num_tokens": 380818139.0, + "step": 9979 + }, + { + "epoch": 1.269558580333291, + "ewc_loss": 0.007690484169870615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.690484198974445e-05, + "grad_norm": 3.7286248207092285, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8782246112823486, + "num_tokens": 380860530.0, + "step": 9980 + }, + { + "epoch": 1.2696857906118815, + "ewc_loss": 0.007634594105184078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.634593930561095e-05, + "grad_norm": 3.759140968322754, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.884016215801239, + "num_tokens": 380900825.0, + "step": 9981 + }, + { + "epoch": 1.269813000890472, + "ewc_loss": 0.007683364674448967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.683364674448967e-05, + "grad_norm": 3.7816390991210938, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8703128695487976, + "num_tokens": 380939270.0, + "step": 9982 + }, + { + "epoch": 1.2699402111690625, + "ewc_loss": 0.007672139443457127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.672139327041805e-05, + "grad_norm": 3.8139212131500244, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8500182628631592, + "num_tokens": 380981708.0, + "step": 9983 + }, + { + "epoch": 1.2700674214476528, + "ewc_loss": 0.007680073846131563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.680073758820072e-05, + "grad_norm": 3.844395637512207, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8775948286056519, + "num_tokens": 381015186.0, + "step": 9984 + }, + { + "epoch": 1.2701946317262434, + "ewc_loss": 0.007682614494115114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.682614523218945e-05, + "grad_norm": 3.7641496658325195, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8828756213188171, + "num_tokens": 381055503.0, + "step": 9985 + }, + { + "epoch": 1.270321842004834, + "ewc_loss": 0.007627084385603666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.627084414707497e-05, + "grad_norm": 3.7636687755584717, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8845527172088623, + "num_tokens": 381091935.0, + "step": 9986 + }, + { + "epoch": 1.2704490522834244, + "ewc_loss": 0.007658602204173803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.658602407900617e-05, + "grad_norm": 3.8184940814971924, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8776542544364929, + "num_tokens": 381127951.0, + "step": 9987 + }, + { + "epoch": 1.270576262562015, + "ewc_loss": 0.007679159287363291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.679159170947969e-05, + "grad_norm": 3.8186564445495605, + "learning_rate": 1e-06, + "loss": 0.3751, + 
"mean_token_accuracy": 0.8688105344772339, + "num_tokens": 381161918.0, + "step": 9988 + }, + { + "epoch": 1.2707034728406055, + "ewc_loss": 0.007664395496249199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.66439552535303e-05, + "grad_norm": 3.807291030883789, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8627506494522095, + "num_tokens": 381198747.0, + "step": 9989 + }, + { + "epoch": 1.270830683119196, + "ewc_loss": 0.0076591577380895615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.659157563466579e-05, + "grad_norm": 3.742978811264038, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8732744455337524, + "num_tokens": 381242714.0, + "step": 9990 + }, + { + "epoch": 1.2709578933977865, + "ewc_loss": 0.007636853959411383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.636853842996061e-05, + "grad_norm": 3.8212430477142334, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8851161003112793, + "num_tokens": 381274667.0, + "step": 9991 + }, + { + "epoch": 1.271085103676377, + "ewc_loss": 0.007694287225604057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.694287342019379e-05, + "grad_norm": 3.702162504196167, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8816332817077637, + "num_tokens": 381318110.0, + "step": 9992 + }, + { + "epoch": 1.2712123139549676, + "ewc_loss": 0.00760242622345686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.602426194353029e-05, + "grad_norm": 3.8015856742858887, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8746354579925537, + "num_tokens": 381356694.0, + "step": 9993 + }, + { + "epoch": 1.2713395242335581, + "ewc_loss": 0.007709925528615713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.709925557719544e-05, + "grad_norm": 3.8230443000793457, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8652949333190918, + "num_tokens": 381395218.0, + "step": 9994 + }, + { + "epoch": 
1.2714667345121486, + "ewc_loss": 0.007670641411095858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.670641207369044e-05, + "grad_norm": 3.776972770690918, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8857086300849915, + "num_tokens": 381429195.0, + "step": 9995 + }, + { + "epoch": 1.2715939447907392, + "ewc_loss": 0.007639121264219284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.63912103138864e-05, + "grad_norm": 3.8524651527404785, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8673630356788635, + "num_tokens": 381468276.0, + "step": 9996 + }, + { + "epoch": 1.2717211550693297, + "ewc_loss": 0.007709410972893238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.709411147516221e-05, + "grad_norm": 3.7870895862579346, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8800350427627563, + "num_tokens": 381503400.0, + "step": 9997 + }, + { + "epoch": 1.2718483653479202, + "ewc_loss": 0.007648348342627287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.648348400834948e-05, + "grad_norm": 3.8382039070129395, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8788270950317383, + "num_tokens": 381542126.0, + "step": 9998 + }, + { + "epoch": 1.2719755756265108, + "ewc_loss": 0.007698449306190014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.698449189774692e-05, + "grad_norm": 3.7628355026245117, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8812305331230164, + "num_tokens": 381579169.0, + "step": 9999 + }, + { + "epoch": 1.272102785905101, + "ewc_loss": 0.007648657541722059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.64865762903355e-05, + "grad_norm": 3.865297794342041, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8607558608055115, + "num_tokens": 381615501.0, + "step": 10000 + }, + { + "epoch": 1.2722299961836916, + "ewc_loss": 0.007728926837444305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.728926721028984e-05, + "grad_norm": 3.8390650749206543, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8680362701416016, + "num_tokens": 381649737.0, + "step": 10001 + }, + { + "epoch": 1.2723572064622821, + "ewc_loss": 0.007689157035201788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.689157064305618e-05, + "grad_norm": 3.7932534217834473, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8760053515434265, + "num_tokens": 381693776.0, + "step": 10002 + }, + { + "epoch": 1.2724844167408726, + "ewc_loss": 0.007659900933504105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.659901166334748e-05, + "grad_norm": 3.7872514724731445, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8820006251335144, + "num_tokens": 381733324.0, + "step": 10003 + }, + { + "epoch": 1.2726116270194632, + "ewc_loss": 0.007699683308601379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.699683192186058e-05, + "grad_norm": 3.8603713512420654, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8708845376968384, + "num_tokens": 381765893.0, + "step": 10004 + }, + { + "epoch": 1.2727388372980537, + "ewc_loss": 0.007730865851044655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.730865763733163e-05, + "grad_norm": 3.800614595413208, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8709624409675598, + "num_tokens": 381802782.0, + "step": 10005 + }, + { + "epoch": 1.2728660475766442, + "ewc_loss": 0.007671787403523922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.671787170693278e-05, + "grad_norm": 3.7937519550323486, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8759441375732422, + "num_tokens": 381843448.0, + "step": 10006 + }, + { + "epoch": 1.2729932578552348, + "ewc_loss": 0.007695185951888561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.695185922784731e-05, + "grad_norm": 3.846151351928711, + "learning_rate": 1e-06, + "loss": 0.4081, + 
"mean_token_accuracy": 0.8629757165908813, + "num_tokens": 381878850.0, + "step": 10007 + }, + { + "epoch": 1.2731204681338253, + "ewc_loss": 0.007729599252343178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729599019512534e-05, + "grad_norm": 3.867415189743042, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8525230884552002, + "num_tokens": 381915508.0, + "step": 10008 + }, + { + "epoch": 1.2732476784124156, + "ewc_loss": 0.007702214643359184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.702214497840032e-05, + "grad_norm": 3.8192338943481445, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.863650381565094, + "num_tokens": 381952699.0, + "step": 10009 + }, + { + "epoch": 1.2733748886910061, + "ewc_loss": 0.0076920450665056705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.692044891882688e-05, + "grad_norm": 3.784010410308838, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8673679828643799, + "num_tokens": 381989947.0, + "step": 10010 + }, + { + "epoch": 1.2735020989695967, + "ewc_loss": 0.007689688354730606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.689688209211454e-05, + "grad_norm": 3.782658815383911, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8738926649093628, + "num_tokens": 382027701.0, + "step": 10011 + }, + { + "epoch": 1.2736293092481872, + "ewc_loss": 0.007698689121752977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.69868929637596e-05, + "grad_norm": 3.818749189376831, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8709810376167297, + "num_tokens": 382066415.0, + "step": 10012 + }, + { + "epoch": 1.2737565195267777, + "ewc_loss": 0.007730104029178619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.730103970970958e-05, + "grad_norm": 3.762556791305542, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8674671649932861, + "num_tokens": 382108402.0, + "step": 10013 + }, + { + "epoch": 
1.2738837298053682, + "ewc_loss": 0.007672777399420738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.672777428524569e-05, + "grad_norm": 3.8021373748779297, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8724392652511597, + "num_tokens": 382146865.0, + "step": 10014 + }, + { + "epoch": 1.2740109400839588, + "ewc_loss": 0.007716139778494835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716139953117818e-05, + "grad_norm": 3.7731783390045166, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8696779608726501, + "num_tokens": 382186746.0, + "step": 10015 + }, + { + "epoch": 1.2741381503625493, + "ewc_loss": 0.007707173936069012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70717379054986e-05, + "grad_norm": 3.805467367172241, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8851085901260376, + "num_tokens": 382220693.0, + "step": 10016 + }, + { + "epoch": 1.2742653606411398, + "ewc_loss": 0.007735165301710367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.735165127087384e-05, + "grad_norm": 3.7679288387298584, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8665096759796143, + "num_tokens": 382259549.0, + "step": 10017 + }, + { + "epoch": 1.2743925709197303, + "ewc_loss": 0.0076909372583031654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.690937491133809e-05, + "grad_norm": 3.753807783126831, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8824499845504761, + "num_tokens": 382299164.0, + "step": 10018 + }, + { + "epoch": 1.2745197811983209, + "ewc_loss": 0.007709946017712355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.709945930400863e-05, + "grad_norm": 3.772372245788574, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8864181041717529, + "num_tokens": 382333596.0, + "step": 10019 + }, + { + "epoch": 1.2746469914769114, + "ewc_loss": 0.007704557850956917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.70455808378756e-05, + "grad_norm": 3.7920215129852295, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8741263151168823, + "num_tokens": 382372321.0, + "step": 10020 + }, + { + "epoch": 1.274774201755502, + "ewc_loss": 0.007725403644144535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.725403702352196e-05, + "grad_norm": 3.791677474975586, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8751429319381714, + "num_tokens": 382411125.0, + "step": 10021 + }, + { + "epoch": 1.2749014120340925, + "ewc_loss": 0.007726904936134815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.726904732408002e-05, + "grad_norm": 3.813459873199463, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8633513450622559, + "num_tokens": 382450935.0, + "step": 10022 + }, + { + "epoch": 1.275028622312683, + "ewc_loss": 0.0077293021604418755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729302160441875e-05, + "grad_norm": 3.7881665229797363, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8713247179985046, + "num_tokens": 382488232.0, + "step": 10023 + }, + { + "epoch": 1.2751558325912733, + "ewc_loss": 0.007710423786193132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.710423960816115e-05, + "grad_norm": 3.860530138015747, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8697836399078369, + "num_tokens": 382522280.0, + "step": 10024 + }, + { + "epoch": 1.2752830428698638, + "ewc_loss": 0.007744120433926582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.744120375718921e-05, + "grad_norm": 3.747281312942505, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8816506862640381, + "num_tokens": 382562788.0, + "step": 10025 + }, + { + "epoch": 1.2754102531484544, + "ewc_loss": 0.0076582045294344425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.658204413019121e-05, + "grad_norm": 3.8337745666503906, + "learning_rate": 1e-06, + "loss": 0.391, + 
"mean_token_accuracy": 0.8659369349479675, + "num_tokens": 382598534.0, + "step": 10026 + }, + { + "epoch": 1.2755374634270449, + "ewc_loss": 0.007731291465461254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.731291407253593e-05, + "grad_norm": 3.783085823059082, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8673624992370605, + "num_tokens": 382641803.0, + "step": 10027 + }, + { + "epoch": 1.2756646737056354, + "ewc_loss": 0.007677440531551838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.677440589759499e-05, + "grad_norm": 3.787140130996704, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8834344148635864, + "num_tokens": 382677871.0, + "step": 10028 + }, + { + "epoch": 1.275791883984226, + "ewc_loss": 0.007700175978243351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.700175774516538e-05, + "grad_norm": 3.7445428371429443, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8820033669471741, + "num_tokens": 382714989.0, + "step": 10029 + }, + { + "epoch": 1.2759190942628165, + "ewc_loss": 0.0076788319274783134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.678831752855331e-05, + "grad_norm": 3.7965314388275146, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8743254542350769, + "num_tokens": 382751919.0, + "step": 10030 + }, + { + "epoch": 1.276046304541407, + "ewc_loss": 0.007722875103354454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.722875307081267e-05, + "grad_norm": 3.731154441833496, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8837469220161438, + "num_tokens": 382793550.0, + "step": 10031 + }, + { + "epoch": 1.2761735148199975, + "ewc_loss": 0.007655021268874407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.655021181562915e-05, + "grad_norm": 3.740877628326416, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8808646202087402, + "num_tokens": 382833142.0, + "step": 10032 + }, + { + "epoch": 
1.2763007250985878, + "ewc_loss": 0.0076680900529026985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.668090256629512e-05, + "grad_norm": 3.763521194458008, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8806004524230957, + "num_tokens": 382869934.0, + "step": 10033 + }, + { + "epoch": 1.2764279353771784, + "ewc_loss": 0.007683830335736275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.683830335736275e-05, + "grad_norm": 3.8192267417907715, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8607282638549805, + "num_tokens": 382909533.0, + "step": 10034 + }, + { + "epoch": 1.2765551456557689, + "ewc_loss": 0.0076927077025175095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.69270773162134e-05, + "grad_norm": 3.759915351867676, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8781000971794128, + "num_tokens": 382947462.0, + "step": 10035 + }, + { + "epoch": 1.2766823559343594, + "ewc_loss": 0.007650915998965502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.650916086276993e-05, + "grad_norm": 3.750692367553711, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8753384947776794, + "num_tokens": 382989973.0, + "step": 10036 + }, + { + "epoch": 1.27680956621295, + "ewc_loss": 0.007660052739083767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.660052506253123e-05, + "grad_norm": 3.7712693214416504, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8702200055122375, + "num_tokens": 383033052.0, + "step": 10037 + }, + { + "epoch": 1.2769367764915405, + "ewc_loss": 0.007657567039132118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.657567039132118e-05, + "grad_norm": 3.7519872188568115, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8724156618118286, + "num_tokens": 383075981.0, + "step": 10038 + }, + { + "epoch": 1.277063986770131, + "ewc_loss": 0.007630569394677877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.63056959840469e-05, + "grad_norm": 3.7760443687438965, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8892861008644104, + "num_tokens": 383111984.0, + "step": 10039 + }, + { + "epoch": 1.2771911970487215, + "ewc_loss": 0.007638736627995968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.638736860826612e-05, + "grad_norm": 3.7496089935302734, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8739255666732788, + "num_tokens": 383155323.0, + "step": 10040 + }, + { + "epoch": 1.277318407327312, + "ewc_loss": 0.007613507565110922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.61350747779943e-05, + "grad_norm": 3.8458733558654785, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8728805184364319, + "num_tokens": 383190069.0, + "step": 10041 + }, + { + "epoch": 1.2774456176059026, + "ewc_loss": 0.0076598371379077435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.659837137907743e-05, + "grad_norm": 3.7815146446228027, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.863940417766571, + "num_tokens": 383224999.0, + "step": 10042 + }, + { + "epoch": 1.2775728278844931, + "ewc_loss": 0.007605134975165129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.60513503337279e-05, + "grad_norm": 3.7803711891174316, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.866051435470581, + "num_tokens": 383267764.0, + "step": 10043 + }, + { + "epoch": 1.2777000381630836, + "ewc_loss": 0.007620617747306824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62061754358001e-05, + "grad_norm": 3.774085283279419, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8817850947380066, + "num_tokens": 383306148.0, + "step": 10044 + }, + { + "epoch": 1.2778272484416742, + "ewc_loss": 0.007617747876793146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.617747905896977e-05, + "grad_norm": 3.8099162578582764, + "learning_rate": 1e-06, + "loss": 0.3495, + 
"mean_token_accuracy": 0.8783813714981079, + "num_tokens": 383343408.0, + "step": 10045 + }, + { + "epoch": 1.2779544587202647, + "ewc_loss": 0.00762939453125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62939453125e-05, + "grad_norm": 3.7233946323394775, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8958666324615479, + "num_tokens": 383383626.0, + "step": 10046 + }, + { + "epoch": 1.2780816689988552, + "ewc_loss": 0.007572587579488754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.572587492177263e-05, + "grad_norm": 3.817009210586548, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8573148250579834, + "num_tokens": 383423218.0, + "step": 10047 + }, + { + "epoch": 1.2782088792774458, + "ewc_loss": 0.0076571619138121605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.657161768293008e-05, + "grad_norm": 3.7901408672332764, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8803107738494873, + "num_tokens": 383461126.0, + "step": 10048 + }, + { + "epoch": 1.278336089556036, + "ewc_loss": 0.007601968478411436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.601968536619097e-05, + "grad_norm": 3.7783806324005127, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.864300549030304, + "num_tokens": 383501808.0, + "step": 10049 + }, + { + "epoch": 1.2784632998346266, + "ewc_loss": 0.007597995921969414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.597995863761753e-05, + "grad_norm": 3.74324893951416, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8927316069602966, + "num_tokens": 383542395.0, + "step": 10050 + }, + { + "epoch": 1.2785905101132171, + "ewc_loss": 0.007574378978461027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.574378832941875e-05, + "grad_norm": 3.7835845947265625, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.871066689491272, + "num_tokens": 383579968.0, + "step": 10051 + }, + { + "epoch": 
1.2787177203918076, + "ewc_loss": 0.007622751407325268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62275158194825e-05, + "grad_norm": 3.811721086502075, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8731671571731567, + "num_tokens": 383617861.0, + "step": 10052 + }, + { + "epoch": 1.2788449306703982, + "ewc_loss": 0.0076111131347715855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.611112960148603e-05, + "grad_norm": 3.8192219734191895, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8691855669021606, + "num_tokens": 383652534.0, + "step": 10053 + }, + { + "epoch": 1.2789721409489887, + "ewc_loss": 0.007622746285051107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62274648877792e-05, + "grad_norm": 3.769710063934326, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8680368661880493, + "num_tokens": 383693120.0, + "step": 10054 + }, + { + "epoch": 1.2790993512275792, + "ewc_loss": 0.007578076329082251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.578076474601403e-05, + "grad_norm": 3.775381565093994, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8649634122848511, + "num_tokens": 383734958.0, + "step": 10055 + }, + { + "epoch": 1.2792265615061698, + "ewc_loss": 0.007610552944242954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.61055271141231e-05, + "grad_norm": 3.8451809883117676, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8678139448165894, + "num_tokens": 383768873.0, + "step": 10056 + }, + { + "epoch": 1.2793537717847603, + "ewc_loss": 0.007664002012461424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.664001896046102e-05, + "grad_norm": 3.831125020980835, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8756591081619263, + "num_tokens": 383804063.0, + "step": 10057 + }, + { + "epoch": 1.2794809820633506, + "ewc_loss": 0.0076283845119178295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.628384628333151e-05, + "grad_norm": 3.745481491088867, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8867794871330261, + "num_tokens": 383843098.0, + "step": 10058 + }, + { + "epoch": 1.2796081923419411, + "ewc_loss": 0.007603847421705723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.603847188875079e-05, + "grad_norm": 3.8110499382019043, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8822466135025024, + "num_tokens": 383879443.0, + "step": 10059 + }, + { + "epoch": 1.2797354026205316, + "ewc_loss": 0.007676636800169945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.676636596443132e-05, + "grad_norm": 3.7683825492858887, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8731870651245117, + "num_tokens": 383918071.0, + "step": 10060 + }, + { + "epoch": 1.2798626128991222, + "ewc_loss": 0.007611682638525963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.611682667629793e-05, + "grad_norm": 3.8100128173828125, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.866855263710022, + "num_tokens": 383953052.0, + "step": 10061 + }, + { + "epoch": 1.2799898231777127, + "ewc_loss": 0.007686922792345285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.686922617722303e-05, + "grad_norm": 3.779353618621826, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8702284693717957, + "num_tokens": 383992835.0, + "step": 10062 + }, + { + "epoch": 1.2801170334563032, + "ewc_loss": 0.007647821679711342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.647821621503681e-05, + "grad_norm": 3.8397884368896484, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8616951107978821, + "num_tokens": 384031234.0, + "step": 10063 + }, + { + "epoch": 1.2802442437348938, + "ewc_loss": 0.00770692341029644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706923497607931e-05, + "grad_norm": 3.869032382965088, + "learning_rate": 1e-06, + "loss": 0.4073, + 
"mean_token_accuracy": 0.8644202947616577, + "num_tokens": 384063611.0, + "step": 10064 + }, + { + "epoch": 1.2803714540134843, + "ewc_loss": 0.0077027310617268085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.702731090830639e-05, + "grad_norm": 3.716181755065918, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8813033699989319, + "num_tokens": 384106757.0, + "step": 10065 + }, + { + "epoch": 1.2804986642920748, + "ewc_loss": 0.007617006544023752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.617006485816091e-05, + "grad_norm": 3.786868095397949, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8759920597076416, + "num_tokens": 384143094.0, + "step": 10066 + }, + { + "epoch": 1.2806258745706653, + "ewc_loss": 0.007714728359133005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.714728417340666e-05, + "grad_norm": 3.7848472595214844, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8735800385475159, + "num_tokens": 384181498.0, + "step": 10067 + }, + { + "epoch": 1.2807530848492559, + "ewc_loss": 0.0076893651857972145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.689365156693384e-05, + "grad_norm": 3.8821840286254883, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8786283731460571, + "num_tokens": 384209784.0, + "step": 10068 + }, + { + "epoch": 1.2808802951278464, + "ewc_loss": 0.007757367566227913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.757367711747065e-05, + "grad_norm": 3.7789552211761475, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8768782615661621, + "num_tokens": 384248000.0, + "step": 10069 + }, + { + "epoch": 1.281007505406437, + "ewc_loss": 0.007666348945349455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.666349119972438e-05, + "grad_norm": 3.75948166847229, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8639826774597168, + "num_tokens": 384290897.0, + "step": 10070 + }, + { + "epoch": 
1.2811347156850275, + "ewc_loss": 0.007704413495957851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.704413292231038e-05, + "grad_norm": 3.837151288986206, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8678498864173889, + "num_tokens": 384329464.0, + "step": 10071 + }, + { + "epoch": 1.281261925963618, + "ewc_loss": 0.007763918489217758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.763918256387115e-05, + "grad_norm": 3.768869161605835, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8765047788619995, + "num_tokens": 384368915.0, + "step": 10072 + }, + { + "epoch": 1.2813891362422083, + "ewc_loss": 0.007666470017284155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.666469900868833e-05, + "grad_norm": 3.7650156021118164, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8826655149459839, + "num_tokens": 384407445.0, + "step": 10073 + }, + { + "epoch": 1.2815163465207988, + "ewc_loss": 0.00771075626835227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.710756472079083e-05, + "grad_norm": 3.8397583961486816, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8693311214447021, + "num_tokens": 384444384.0, + "step": 10074 + }, + { + "epoch": 1.2816435567993893, + "ewc_loss": 0.007761453744024038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76145388954319e-05, + "grad_norm": 3.8454363346099854, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8705404996871948, + "num_tokens": 384479670.0, + "step": 10075 + }, + { + "epoch": 1.2817707670779799, + "ewc_loss": 0.00770785054191947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.707850454607978e-05, + "grad_norm": 3.783953905105591, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8618018627166748, + "num_tokens": 384522436.0, + "step": 10076 + }, + { + "epoch": 1.2818979773565704, + "ewc_loss": 0.007692389190196991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.692389044677839e-05, + "grad_norm": 3.8304665088653564, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8653367161750793, + "num_tokens": 384559754.0, + "step": 10077 + }, + { + "epoch": 1.282025187635161, + "ewc_loss": 0.00772955734282732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729557546554133e-05, + "grad_norm": 3.814054250717163, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.870536208152771, + "num_tokens": 384595153.0, + "step": 10078 + }, + { + "epoch": 1.2821523979137515, + "ewc_loss": 0.007692234590649605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.692234794376418e-05, + "grad_norm": 3.834083080291748, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8720951080322266, + "num_tokens": 384628345.0, + "step": 10079 + }, + { + "epoch": 1.282279608192342, + "ewc_loss": 0.007710407022386789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.710407226113603e-05, + "grad_norm": 3.764204502105713, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8687454462051392, + "num_tokens": 384666725.0, + "step": 10080 + }, + { + "epoch": 1.2824068184709325, + "ewc_loss": 0.007683401927351952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.683401781832799e-05, + "grad_norm": 3.8113138675689697, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8628393411636353, + "num_tokens": 384706217.0, + "step": 10081 + }, + { + "epoch": 1.2825340287495228, + "ewc_loss": 0.00773192523047328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.731925143161789e-05, + "grad_norm": 3.8330788612365723, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8715236783027649, + "num_tokens": 384741835.0, + "step": 10082 + }, + { + "epoch": 1.2826612390281134, + "ewc_loss": 0.007716339081525803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716339314356446e-05, + "grad_norm": 3.783715009689331, + "learning_rate": 1e-06, + "loss": 0.3653, + 
"mean_token_accuracy": 0.873807430267334, + "num_tokens": 384777273.0, + "step": 10083 + }, + { + "epoch": 1.2827884493067039, + "ewc_loss": 0.007698935456573963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.69893522374332e-05, + "grad_norm": 3.7858829498291016, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.886482834815979, + "num_tokens": 384814950.0, + "step": 10084 + }, + { + "epoch": 1.2829156595852944, + "ewc_loss": 0.0077215577475726604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.721557631157339e-05, + "grad_norm": 3.9000461101531982, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8657320737838745, + "num_tokens": 384848088.0, + "step": 10085 + }, + { + "epoch": 1.283042869863885, + "ewc_loss": 0.007787693291902542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78769317548722e-05, + "grad_norm": 3.769296407699585, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8659787774085999, + "num_tokens": 384887361.0, + "step": 10086 + }, + { + "epoch": 1.2831700801424755, + "ewc_loss": 0.0076693459413945675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.66934608691372e-05, + "grad_norm": 3.742936611175537, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8766436576843262, + "num_tokens": 384930459.0, + "step": 10087 + }, + { + "epoch": 1.283297290421066, + "ewc_loss": 0.007715929765254259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.715929677942768e-05, + "grad_norm": 3.823824882507324, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8661850690841675, + "num_tokens": 384969506.0, + "step": 10088 + }, + { + "epoch": 1.2834245006996565, + "ewc_loss": 0.007755591068416834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.755590922897682e-05, + "grad_norm": 3.8482608795166016, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8782641887664795, + "num_tokens": 385004368.0, + "step": 10089 + }, + { + "epoch": 
1.283551710978247, + "ewc_loss": 0.007745902985334396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.745902985334396e-05, + "grad_norm": 3.7230474948883057, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8747538328170776, + "num_tokens": 385044553.0, + "step": 10090 + }, + { + "epoch": 1.2836789212568376, + "ewc_loss": 0.007670278660953045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.670278864679858e-05, + "grad_norm": 3.835278034210205, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8837580680847168, + "num_tokens": 385076873.0, + "step": 10091 + }, + { + "epoch": 1.283806131535428, + "ewc_loss": 0.007782493252307177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.782493048580363e-05, + "grad_norm": 3.7641165256500244, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8756135702133179, + "num_tokens": 385113969.0, + "step": 10092 + }, + { + "epoch": 1.2839333418140186, + "ewc_loss": 0.007685281336307526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.685281161684543e-05, + "grad_norm": 3.7854983806610107, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8827462196350098, + "num_tokens": 385149460.0, + "step": 10093 + }, + { + "epoch": 1.2840605520926092, + "ewc_loss": 0.007724960800260305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.724960596533492e-05, + "grad_norm": 3.8345377445220947, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8571672439575195, + "num_tokens": 385187240.0, + "step": 10094 + }, + { + "epoch": 1.2841877623711997, + "ewc_loss": 0.007729676086455584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729676144663244e-05, + "grad_norm": 3.806220769882202, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8659672737121582, + "num_tokens": 385225532.0, + "step": 10095 + }, + { + "epoch": 1.2843149726497902, + "ewc_loss": 0.007699941284954548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.699941488681361e-05, + "grad_norm": 3.7483558654785156, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8758404850959778, + "num_tokens": 385267404.0, + "step": 10096 + }, + { + "epoch": 1.2844421829283807, + "ewc_loss": 0.007681248243898153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.681248098379001e-05, + "grad_norm": 3.7300403118133545, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8604669570922852, + "num_tokens": 385314008.0, + "step": 10097 + }, + { + "epoch": 1.284569393206971, + "ewc_loss": 0.0076994807459414005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.699480920564383e-05, + "grad_norm": 3.8637092113494873, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8600233793258667, + "num_tokens": 385352204.0, + "step": 10098 + }, + { + "epoch": 1.2846966034855616, + "ewc_loss": 0.0077735260128974915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.773526158416644e-05, + "grad_norm": 3.7836432456970215, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8644230365753174, + "num_tokens": 385393307.0, + "step": 10099 + }, + { + "epoch": 1.2848238137641521, + "ewc_loss": 0.007677766494452953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.677766552660614e-05, + "grad_norm": 3.8611884117126465, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8741103410720825, + "num_tokens": 385426124.0, + "step": 10100 + }, + { + "epoch": 1.2849510240427426, + "ewc_loss": 0.007746474351733923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.74647414800711e-05, + "grad_norm": 3.7645299434661865, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8780080676078796, + "num_tokens": 385467471.0, + "step": 10101 + }, + { + "epoch": 1.2850782343213332, + "ewc_loss": 0.007644159719347954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.644159632036462e-05, + "grad_norm": 3.767512798309326, + "learning_rate": 1e-06, + "loss": 0.4005, + 
"mean_token_accuracy": 0.8635212779045105, + "num_tokens": 385508780.0, + "step": 10102 + }, + { + "epoch": 1.2852054445999237, + "ewc_loss": 0.007694479543715715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.694479427300394e-05, + "grad_norm": 3.733163833618164, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8781106472015381, + "num_tokens": 385552646.0, + "step": 10103 + }, + { + "epoch": 1.2853326548785142, + "ewc_loss": 0.007666400633752346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.666400779271498e-05, + "grad_norm": 3.882418155670166, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8612259030342102, + "num_tokens": 385586676.0, + "step": 10104 + }, + { + "epoch": 1.2854598651571048, + "ewc_loss": 0.007753316313028336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.753316458547488e-05, + "grad_norm": 3.8304426670074463, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.86105877161026, + "num_tokens": 385623201.0, + "step": 10105 + }, + { + "epoch": 1.2855870754356953, + "ewc_loss": 0.007683070842176676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.683070725761354e-05, + "grad_norm": 3.75727915763855, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.88245689868927, + "num_tokens": 385662716.0, + "step": 10106 + }, + { + "epoch": 1.2857142857142856, + "ewc_loss": 0.007673177402466536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.673177606193349e-05, + "grad_norm": 3.7983860969543457, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8614622354507446, + "num_tokens": 385706986.0, + "step": 10107 + }, + { + "epoch": 1.2858414959928761, + "ewc_loss": 0.007717338856309652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.717339030932635e-05, + "grad_norm": 3.795173406600952, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8723131418228149, + "num_tokens": 385744240.0, + "step": 10108 + }, + { + "epoch": 
1.2859687062714666, + "ewc_loss": 0.00767310569062829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.673105574212968e-05, + "grad_norm": 3.8459739685058594, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8587104082107544, + "num_tokens": 385778583.0, + "step": 10109 + }, + { + "epoch": 1.2860959165500572, + "ewc_loss": 0.007717747241258621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.717747212154791e-05, + "grad_norm": 3.74369478225708, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8725886344909668, + "num_tokens": 385819910.0, + "step": 10110 + }, + { + "epoch": 1.2862231268286477, + "ewc_loss": 0.007658221293240786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.658221147721633e-05, + "grad_norm": 3.784987211227417, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.870428740978241, + "num_tokens": 385860068.0, + "step": 10111 + }, + { + "epoch": 1.2863503371072382, + "ewc_loss": 0.007708367891609669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.708367775194347e-05, + "grad_norm": 3.9218032360076904, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8649498224258423, + "num_tokens": 385890460.0, + "step": 10112 + }, + { + "epoch": 1.2864775473858288, + "ewc_loss": 0.007776323705911636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.776323764119297e-05, + "grad_norm": 3.831461191177368, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8700656890869141, + "num_tokens": 385926174.0, + "step": 10113 + }, + { + "epoch": 1.2866047576644193, + "ewc_loss": 0.0076802936382591724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.68029349274002e-05, + "grad_norm": 3.7819790840148926, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8825850486755371, + "num_tokens": 385962928.0, + "step": 10114 + }, + { + "epoch": 1.2867319679430098, + "ewc_loss": 0.007692971266806126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.692971121286973e-05, + "grad_norm": 3.790156364440918, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8807653188705444, + "num_tokens": 386002509.0, + "step": 10115 + }, + { + "epoch": 1.2868591782216003, + "ewc_loss": 0.0077307457104325294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.73074571043253e-05, + "grad_norm": 3.8061766624450684, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8723028302192688, + "num_tokens": 386043304.0, + "step": 10116 + }, + { + "epoch": 1.2869863885001909, + "ewc_loss": 0.007708239834755659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.708239718340337e-05, + "grad_norm": 3.7852485179901123, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8799316883087158, + "num_tokens": 386081607.0, + "step": 10117 + }, + { + "epoch": 1.2871135987787814, + "ewc_loss": 0.007704822812229395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.704822928644717e-05, + "grad_norm": 3.8033432960510254, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8905857801437378, + "num_tokens": 386117762.0, + "step": 10118 + }, + { + "epoch": 1.287240809057372, + "ewc_loss": 0.0077276709489524364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.727670890744776e-05, + "grad_norm": 3.804515838623047, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8652763366699219, + "num_tokens": 386155908.0, + "step": 10119 + }, + { + "epoch": 1.2873680193359625, + "ewc_loss": 0.007725853472948074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.725853356532753e-05, + "grad_norm": 3.787508964538574, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8697880506515503, + "num_tokens": 386199385.0, + "step": 10120 + }, + { + "epoch": 1.287495229614553, + "ewc_loss": 0.007702362723648548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.702362927375361e-05, + "grad_norm": 3.898207187652588, + "learning_rate": 1e-06, + "loss": 0.3787, + 
"mean_token_accuracy": 0.8695049285888672, + "num_tokens": 386231892.0, + "step": 10121 + }, + { + "epoch": 1.2876224398931433, + "ewc_loss": 0.007782237138599157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.782236934872344e-05, + "grad_norm": 3.83940052986145, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.858100175857544, + "num_tokens": 386269657.0, + "step": 10122 + }, + { + "epoch": 1.2877496501717338, + "ewc_loss": 0.007703556213527918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.703556184424087e-05, + "grad_norm": 3.7867138385772705, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8785284757614136, + "num_tokens": 386306936.0, + "step": 10123 + }, + { + "epoch": 1.2878768604503243, + "ewc_loss": 0.007700798567384481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.700798596488312e-05, + "grad_norm": 3.784632682800293, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8730728626251221, + "num_tokens": 386342260.0, + "step": 10124 + }, + { + "epoch": 1.2880040707289149, + "ewc_loss": 0.0077218106016516685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.721810834482312e-05, + "grad_norm": 3.8252270221710205, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8690712451934814, + "num_tokens": 386381784.0, + "step": 10125 + }, + { + "epoch": 1.2881312810075054, + "ewc_loss": 0.0077369799837470055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.736979750916362e-05, + "grad_norm": 3.7618420124053955, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8726825714111328, + "num_tokens": 386422350.0, + "step": 10126 + }, + { + "epoch": 1.288258491286096, + "ewc_loss": 0.007701320108026266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.701320282649249e-05, + "grad_norm": 3.7946629524230957, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.867780327796936, + "num_tokens": 386467951.0, + "step": 10127 + }, + { + "epoch": 
1.2883857015646865, + "ewc_loss": 0.00773353548720479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.733535312581807e-05, + "grad_norm": 3.7321908473968506, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8828849792480469, + "num_tokens": 386511896.0, + "step": 10128 + }, + { + "epoch": 1.288512911843277, + "ewc_loss": 0.007690164726227522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.690164784435183e-05, + "grad_norm": 3.824481248855591, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8609194755554199, + "num_tokens": 386550704.0, + "step": 10129 + }, + { + "epoch": 1.2886401221218675, + "ewc_loss": 0.007744683418422937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.744683534838259e-05, + "grad_norm": 3.8077545166015625, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8566096425056458, + "num_tokens": 386589295.0, + "step": 10130 + }, + { + "epoch": 1.2887673324004578, + "ewc_loss": 0.0077050914987921715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70509141148068e-05, + "grad_norm": 3.873887777328491, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8718791007995605, + "num_tokens": 386623462.0, + "step": 10131 + }, + { + "epoch": 1.2888945426790483, + "ewc_loss": 0.00773954764008522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.739547436358407e-05, + "grad_norm": 3.742258310317993, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8745242357254028, + "num_tokens": 386663036.0, + "step": 10132 + }, + { + "epoch": 1.2890217529576389, + "ewc_loss": 0.007642456330358982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.642456330358982e-05, + "grad_norm": 3.812265634536743, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8734546899795532, + "num_tokens": 386702530.0, + "step": 10133 + }, + { + "epoch": 1.2891489632362294, + "ewc_loss": 0.007725193630903959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.725193427177146e-05, + "grad_norm": 3.7566757202148438, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8865009546279907, + "num_tokens": 386739758.0, + "step": 10134 + }, + { + "epoch": 1.28927617351482, + "ewc_loss": 0.007670274470001459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.67027449910529e-05, + "grad_norm": 3.8032166957855225, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8814592957496643, + "num_tokens": 386778287.0, + "step": 10135 + }, + { + "epoch": 1.2894033837934105, + "ewc_loss": 0.0077019548043608665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.701954746153206e-05, + "grad_norm": 3.8349967002868652, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8824794292449951, + "num_tokens": 386812348.0, + "step": 10136 + }, + { + "epoch": 1.289530594072001, + "ewc_loss": 0.00767545634880662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.675456436118111e-05, + "grad_norm": 3.733093738555908, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8790837526321411, + "num_tokens": 386855540.0, + "step": 10137 + }, + { + "epoch": 1.2896578043505915, + "ewc_loss": 0.007612391375005245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.612391345901415e-05, + "grad_norm": 3.7835235595703125, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8685826063156128, + "num_tokens": 386893775.0, + "step": 10138 + }, + { + "epoch": 1.289785014629182, + "ewc_loss": 0.0076843686401844025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.684368756599724e-05, + "grad_norm": 3.8476006984710693, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8740782737731934, + "num_tokens": 386929553.0, + "step": 10139 + }, + { + "epoch": 1.2899122249077726, + "ewc_loss": 0.00771013367921114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.710133650107309e-05, + "grad_norm": 3.788623809814453, + "learning_rate": 1e-06, + "loss": 0.3779, + 
"mean_token_accuracy": 0.8674816489219666, + "num_tokens": 386968033.0, + "step": 10140 + }, + { + "epoch": 1.290039435186363, + "ewc_loss": 0.007661676499992609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.661676499992609e-05, + "grad_norm": 3.8281514644622803, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8717660903930664, + "num_tokens": 387004967.0, + "step": 10141 + }, + { + "epoch": 1.2901666454649536, + "ewc_loss": 0.007706587202847004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706587348366156e-05, + "grad_norm": 3.8131394386291504, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8626629114151001, + "num_tokens": 387038086.0, + "step": 10142 + }, + { + "epoch": 1.2902938557435442, + "ewc_loss": 0.007688311394304037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.68831159803085e-05, + "grad_norm": 3.8306970596313477, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8478766679763794, + "num_tokens": 387077112.0, + "step": 10143 + }, + { + "epoch": 1.2904210660221347, + "ewc_loss": 0.007716975174844265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716975233051926e-05, + "grad_norm": 3.7693004608154297, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.856266975402832, + "num_tokens": 387118800.0, + "step": 10144 + }, + { + "epoch": 1.2905482763007252, + "ewc_loss": 0.007694877218455076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.69487742218189e-05, + "grad_norm": 3.7676854133605957, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8676797151565552, + "num_tokens": 387160321.0, + "step": 10145 + }, + { + "epoch": 1.2906754865793157, + "ewc_loss": 0.007720984518527985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.720984285697341e-05, + "grad_norm": 3.8410332202911377, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.87110835313797, + "num_tokens": 387199387.0, + "step": 10146 + }, + { + "epoch": 
1.290802696857906, + "ewc_loss": 0.007762497290968895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.762497261865065e-05, + "grad_norm": 3.7640607357025146, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.877922534942627, + "num_tokens": 387238859.0, + "step": 10147 + }, + { + "epoch": 1.2909299071364966, + "ewc_loss": 0.007689257618039846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.689257472520694e-05, + "grad_norm": 3.808257818222046, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8770028352737427, + "num_tokens": 387272149.0, + "step": 10148 + }, + { + "epoch": 1.291057117415087, + "ewc_loss": 0.007746866438537836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.746866322122514e-05, + "grad_norm": 3.7749390602111816, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8715436458587646, + "num_tokens": 387311617.0, + "step": 10149 + }, + { + "epoch": 1.2911843276936776, + "ewc_loss": 0.007696957793086767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696957618463784e-05, + "grad_norm": 3.8540256023406982, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8596994280815125, + "num_tokens": 387344281.0, + "step": 10150 + }, + { + "epoch": 1.2913115379722682, + "ewc_loss": 0.007758875843137503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.758876017760485e-05, + "grad_norm": 3.761082172393799, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8623272180557251, + "num_tokens": 387385959.0, + "step": 10151 + }, + { + "epoch": 1.2914387482508587, + "ewc_loss": 0.007682933937758207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.682933937758207e-05, + "grad_norm": 3.7859280109405518, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8677566647529602, + "num_tokens": 387426197.0, + "step": 10152 + }, + { + "epoch": 1.2915659585294492, + "ewc_loss": 0.007736359257251024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.736359111731872e-05, + "grad_norm": 3.7835538387298584, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8794562816619873, + "num_tokens": 387462250.0, + "step": 10153 + }, + { + "epoch": 1.2916931688080397, + "ewc_loss": 0.007728819735348225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.728819764452055e-05, + "grad_norm": 3.8103997707366943, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.872235894203186, + "num_tokens": 387499903.0, + "step": 10154 + }, + { + "epoch": 1.2918203790866303, + "ewc_loss": 0.007730480283498764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.730480137979612e-05, + "grad_norm": 3.898900270462036, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.878905177116394, + "num_tokens": 387535361.0, + "step": 10155 + }, + { + "epoch": 1.2919475893652206, + "ewc_loss": 0.007762514520436525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.762514724163339e-05, + "grad_norm": 3.778474807739258, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8692167401313782, + "num_tokens": 387574300.0, + "step": 10156 + }, + { + "epoch": 1.2920747996438111, + "ewc_loss": 0.007679247297346592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6792472100351e-05, + "grad_norm": 3.851702928543091, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.873662531375885, + "num_tokens": 387606860.0, + "step": 10157 + }, + { + "epoch": 1.2922020099224016, + "ewc_loss": 0.007759484462440014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.759484287817031e-05, + "grad_norm": 3.7926719188690186, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8649880886077881, + "num_tokens": 387646519.0, + "step": 10158 + }, + { + "epoch": 1.2923292202009922, + "ewc_loss": 0.0076999240554869175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.699924026383087e-05, + "grad_norm": 3.822835683822632, + "learning_rate": 1e-06, + "loss": 0.349, + 
"mean_token_accuracy": 0.8827899098396301, + "num_tokens": 387681100.0, + "step": 10159 + }, + { + "epoch": 1.2924564304795827, + "ewc_loss": 0.007719157263636589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.71915729274042e-05, + "grad_norm": 3.761664628982544, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8940053582191467, + "num_tokens": 387715962.0, + "step": 10160 + }, + { + "epoch": 1.2925836407581732, + "ewc_loss": 0.007682271301746368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.682271098019555e-05, + "grad_norm": 3.772353410720825, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8705155849456787, + "num_tokens": 387755856.0, + "step": 10161 + }, + { + "epoch": 1.2927108510367638, + "ewc_loss": 0.007726271636784077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.726271724095568e-05, + "grad_norm": 3.777251958847046, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8799460530281067, + "num_tokens": 387793256.0, + "step": 10162 + }, + { + "epoch": 1.2928380613153543, + "ewc_loss": 0.007725571747869253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.725571776973084e-05, + "grad_norm": 3.7649834156036377, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8686025142669678, + "num_tokens": 387834696.0, + "step": 10163 + }, + { + "epoch": 1.2929652715939448, + "ewc_loss": 0.007731638383120298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.73163847043179e-05, + "grad_norm": 3.8209753036499023, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8624157905578613, + "num_tokens": 387874973.0, + "step": 10164 + }, + { + "epoch": 1.2930924818725353, + "ewc_loss": 0.007755259983241558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.755259866826236e-05, + "grad_norm": 3.910306692123413, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8619128465652466, + "num_tokens": 387904887.0, + "step": 10165 + }, + { + "epoch": 
1.2932196921511259, + "ewc_loss": 0.007795341778546572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.79534166213125e-05, + "grad_norm": 3.7676475048065186, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.879153847694397, + "num_tokens": 387944676.0, + "step": 10166 + }, + { + "epoch": 1.2933469024297164, + "ewc_loss": 0.007685238029807806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.685238233534619e-05, + "grad_norm": 3.7679550647735596, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8739151358604431, + "num_tokens": 387982234.0, + "step": 10167 + }, + { + "epoch": 1.293474112708307, + "ewc_loss": 0.007756083272397518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.756083505228162e-05, + "grad_norm": 3.781442880630493, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8742964267730713, + "num_tokens": 388018001.0, + "step": 10168 + }, + { + "epoch": 1.2936013229868975, + "ewc_loss": 0.0077532073482871056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.753207319183275e-05, + "grad_norm": 3.771348714828491, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8677747249603271, + "num_tokens": 388057476.0, + "step": 10169 + }, + { + "epoch": 1.293728533265488, + "ewc_loss": 0.007757266517728567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.757266575936228e-05, + "grad_norm": 3.7700159549713135, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8898097276687622, + "num_tokens": 388095351.0, + "step": 10170 + }, + { + "epoch": 1.2938557435440783, + "ewc_loss": 0.0077605172991752625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.760517473798245e-05, + "grad_norm": 3.7990503311157227, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8714781999588013, + "num_tokens": 388135366.0, + "step": 10171 + }, + { + "epoch": 1.2939829538226688, + "ewc_loss": 0.007780033629387617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.78003377490677e-05, + "grad_norm": 3.8276736736297607, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8618000745773315, + "num_tokens": 388175003.0, + "step": 10172 + }, + { + "epoch": 1.2941101641012593, + "ewc_loss": 0.007774599362164736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.774599362164736e-05, + "grad_norm": 3.884550094604492, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8549205660820007, + "num_tokens": 388207083.0, + "step": 10173 + }, + { + "epoch": 1.2942373743798499, + "ewc_loss": 0.007802573963999748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.802573963999748e-05, + "grad_norm": 3.765307903289795, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8873512744903564, + "num_tokens": 388245038.0, + "step": 10174 + }, + { + "epoch": 1.2943645846584404, + "ewc_loss": 0.007716589607298374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716589607298374e-05, + "grad_norm": 3.8047373294830322, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8684188723564148, + "num_tokens": 388285035.0, + "step": 10175 + }, + { + "epoch": 1.294491794937031, + "ewc_loss": 0.007787567097693682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.787567301420495e-05, + "grad_norm": 3.7804629802703857, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8780708312988281, + "num_tokens": 388323971.0, + "step": 10176 + }, + { + "epoch": 1.2946190052156215, + "ewc_loss": 0.007735215127468109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.735215331194922e-05, + "grad_norm": 3.8215110301971436, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8694148659706116, + "num_tokens": 388362256.0, + "step": 10177 + }, + { + "epoch": 1.294746215494212, + "ewc_loss": 0.007771925535053015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.771925447741523e-05, + "grad_norm": 3.7444815635681152, + "learning_rate": 1e-06, + "loss": 0.3162, + 
"mean_token_accuracy": 0.8921506404876709, + "num_tokens": 388401120.0, + "step": 10178 + }, + { + "epoch": 1.2948734257728025, + "ewc_loss": 0.007722287438809872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.722287409706041e-05, + "grad_norm": 3.7967379093170166, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8751928806304932, + "num_tokens": 388446833.0, + "step": 10179 + }, + { + "epoch": 1.2950006360513928, + "ewc_loss": 0.007744550239294767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.74455038481392e-05, + "grad_norm": 3.8118996620178223, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8855460286140442, + "num_tokens": 388482266.0, + "step": 10180 + }, + { + "epoch": 1.2951278463299833, + "ewc_loss": 0.007738517131656408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.738517160760239e-05, + "grad_norm": 3.8301901817321777, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8744230270385742, + "num_tokens": 388518024.0, + "step": 10181 + }, + { + "epoch": 1.2952550566085739, + "ewc_loss": 0.007716775871813297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716775871813297e-05, + "grad_norm": 3.720177412033081, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8750596046447754, + "num_tokens": 388564246.0, + "step": 10182 + }, + { + "epoch": 1.2953822668871644, + "ewc_loss": 0.007661218289285898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.661218114662915e-05, + "grad_norm": 3.8223862648010254, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8760882616043091, + "num_tokens": 388603556.0, + "step": 10183 + }, + { + "epoch": 1.295509477165755, + "ewc_loss": 0.007745712995529175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.745713082840666e-05, + "grad_norm": 3.7798655033111572, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8807768821716309, + "num_tokens": 388643615.0, + "step": 10184 + }, + { + "epoch": 
1.2956366874443455, + "ewc_loss": 0.007672415114939213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.672415085835382e-05, + "grad_norm": 3.8008546829223633, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8753695487976074, + "num_tokens": 388682917.0, + "step": 10185 + }, + { + "epoch": 1.295763897722936, + "ewc_loss": 0.0076895421370863914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.689541962463409e-05, + "grad_norm": 3.825684070587158, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8751511573791504, + "num_tokens": 388720269.0, + "step": 10186 + }, + { + "epoch": 1.2958911080015265, + "ewc_loss": 0.007704687770456076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.704687595833093e-05, + "grad_norm": 3.8169784545898438, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8663158416748047, + "num_tokens": 388759245.0, + "step": 10187 + }, + { + "epoch": 1.296018318280117, + "ewc_loss": 0.007678163703531027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.678163819946349e-05, + "grad_norm": 3.814971685409546, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.868466854095459, + "num_tokens": 388795673.0, + "step": 10188 + }, + { + "epoch": 1.2961455285587076, + "ewc_loss": 0.007688954938203096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.688954792683944e-05, + "grad_norm": 3.817495822906494, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8761132955551147, + "num_tokens": 388832926.0, + "step": 10189 + }, + { + "epoch": 1.296272738837298, + "ewc_loss": 0.007698379922658205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.698380068177357e-05, + "grad_norm": 3.8229422569274902, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8768206238746643, + "num_tokens": 388869767.0, + "step": 10190 + }, + { + "epoch": 1.2963999491158886, + "ewc_loss": 0.007696135435253382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.696135435253382e-05, + "grad_norm": 3.744205951690674, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8773122429847717, + "num_tokens": 388913347.0, + "step": 10191 + }, + { + "epoch": 1.2965271593944792, + "ewc_loss": 0.0076460628770291805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.646063022548333e-05, + "grad_norm": 3.791107177734375, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8845831155776978, + "num_tokens": 388948211.0, + "step": 10192 + }, + { + "epoch": 1.2966543696730697, + "ewc_loss": 0.007710217498242855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.710217323619872e-05, + "grad_norm": 3.768921136856079, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8784528970718384, + "num_tokens": 388989198.0, + "step": 10193 + }, + { + "epoch": 1.2967815799516602, + "ewc_loss": 0.007661423180252314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.661423296667635e-05, + "grad_norm": 3.7889158725738525, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8646352291107178, + "num_tokens": 389029076.0, + "step": 10194 + }, + { + "epoch": 1.2969087902302507, + "ewc_loss": 0.007681910414248705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.681910210521892e-05, + "grad_norm": 3.844498634338379, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8743970394134521, + "num_tokens": 389066751.0, + "step": 10195 + }, + { + "epoch": 1.297036000508841, + "ewc_loss": 0.007713994476944208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.713994273217395e-05, + "grad_norm": 3.7884135246276855, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8716033101081848, + "num_tokens": 389104207.0, + "step": 10196 + }, + { + "epoch": 1.2971632107874316, + "ewc_loss": 0.0076554324477910995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.655432273168117e-05, + "grad_norm": 3.8346807956695557, + "learning_rate": 1e-06, + "loss": 0.3701, + 
"mean_token_accuracy": 0.8774582743644714, + "num_tokens": 389141941.0, + "step": 10197 + }, + { + "epoch": 1.297290421066022, + "ewc_loss": 0.007683433126658201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.68343306845054e-05, + "grad_norm": 3.7158193588256836, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8815250396728516, + "num_tokens": 389185326.0, + "step": 10198 + }, + { + "epoch": 1.2974176313446126, + "ewc_loss": 0.007627411745488644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.627411832800135e-05, + "grad_norm": 3.815633773803711, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8821424841880798, + "num_tokens": 389221512.0, + "step": 10199 + }, + { + "epoch": 1.2975448416232032, + "ewc_loss": 0.007725509349256754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.725509203737602e-05, + "grad_norm": 3.7616934776306152, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8814241886138916, + "num_tokens": 389259388.0, + "step": 10200 + }, + { + "epoch": 1.2976720519017937, + "ewc_loss": 0.007638433948159218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.638434180989861e-05, + "grad_norm": 3.832777261734009, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8610606789588928, + "num_tokens": 389298773.0, + "step": 10201 + }, + { + "epoch": 1.2977992621803842, + "ewc_loss": 0.0077053154818713665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.705315510975197e-05, + "grad_norm": 3.750509262084961, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8733910918235779, + "num_tokens": 389339680.0, + "step": 10202 + }, + { + "epoch": 1.2979264724589747, + "ewc_loss": 0.007636193186044693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.636193186044693e-05, + "grad_norm": 3.768233060836792, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8750827312469482, + "num_tokens": 389380755.0, + "step": 10203 + }, + { + "epoch": 
1.2980536827375653, + "ewc_loss": 0.007661969866603613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.66196972108446e-05, + "grad_norm": 3.87463116645813, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8792204856872559, + "num_tokens": 389412031.0, + "step": 10204 + }, + { + "epoch": 1.2981808930161556, + "ewc_loss": 0.007718028966337442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.71802879171446e-05, + "grad_norm": 3.827730178833008, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8840405344963074, + "num_tokens": 389448299.0, + "step": 10205 + }, + { + "epoch": 1.298308103294746, + "ewc_loss": 0.007644303608685732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.644303695997223e-05, + "grad_norm": 3.834615468978882, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8678470849990845, + "num_tokens": 389483962.0, + "step": 10206 + }, + { + "epoch": 1.2984353135733366, + "ewc_loss": 0.00767613435164094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.676134555367753e-05, + "grad_norm": 3.814617872238159, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8648106455802917, + "num_tokens": 389521989.0, + "step": 10207 + }, + { + "epoch": 1.2985625238519272, + "ewc_loss": 0.007665577810257673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.665577868465334e-05, + "grad_norm": 3.803715944290161, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8678706288337708, + "num_tokens": 389558228.0, + "step": 10208 + }, + { + "epoch": 1.2986897341305177, + "ewc_loss": 0.007673688232898712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.673688378417864e-05, + "grad_norm": 3.7736403942108154, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8744115829467773, + "num_tokens": 389597164.0, + "step": 10209 + }, + { + "epoch": 1.2988169444091082, + "ewc_loss": 0.00765810115262866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.658101094420999e-05, + "grad_norm": 3.8074276447296143, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8725721836090088, + "num_tokens": 389636435.0, + "step": 10210 + }, + { + "epoch": 1.2989441546876987, + "ewc_loss": 0.007706991396844387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706991164013743e-05, + "grad_norm": 3.8962795734405518, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8666940927505493, + "num_tokens": 389670498.0, + "step": 10211 + }, + { + "epoch": 1.2990713649662893, + "ewc_loss": 0.007748448755592108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.748448842903599e-05, + "grad_norm": 3.794577121734619, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8764190673828125, + "num_tokens": 389707433.0, + "step": 10212 + }, + { + "epoch": 1.2991985752448798, + "ewc_loss": 0.007648950908333063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.648950850125402e-05, + "grad_norm": 3.772993326187134, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8738968968391418, + "num_tokens": 389747300.0, + "step": 10213 + }, + { + "epoch": 1.2993257855234703, + "ewc_loss": 0.007708888500928879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.708888733759522e-05, + "grad_norm": 3.794675588607788, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8676419854164124, + "num_tokens": 389788381.0, + "step": 10214 + }, + { + "epoch": 1.2994529958020609, + "ewc_loss": 0.0077166142873466015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716614345554262e-05, + "grad_norm": 3.8378798961639404, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8779784440994263, + "num_tokens": 389821792.0, + "step": 10215 + }, + { + "epoch": 1.2995802060806514, + "ewc_loss": 0.007729954086244106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729954086244106e-05, + "grad_norm": 3.7839107513427734, + "learning_rate": 1e-06, + "loss": 0.3421, + 
"mean_token_accuracy": 0.8844761848449707, + "num_tokens": 389859370.0, + "step": 10216 + }, + { + "epoch": 1.299707416359242, + "ewc_loss": 0.007685481104999781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.685481250518933e-05, + "grad_norm": 3.7962052822113037, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8649734258651733, + "num_tokens": 389897020.0, + "step": 10217 + }, + { + "epoch": 1.2998346266378324, + "ewc_loss": 0.007717561908066273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.717561675235629e-05, + "grad_norm": 3.9132351875305176, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.849097728729248, + "num_tokens": 389927107.0, + "step": 10218 + }, + { + "epoch": 1.299961836916423, + "ewc_loss": 0.007786309812217951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.786310015944764e-05, + "grad_norm": 3.834867477416992, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8708980083465576, + "num_tokens": 389966323.0, + "step": 10219 + }, + { + "epoch": 1.3000890471950133, + "ewc_loss": 0.007715458981692791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.71545892348513e-05, + "grad_norm": 3.7983319759368896, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8726879358291626, + "num_tokens": 390001416.0, + "step": 10220 + }, + { + "epoch": 1.3002162574736038, + "ewc_loss": 0.007737481966614723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.73748179199174e-05, + "grad_norm": 3.7880806922912598, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.877334713935852, + "num_tokens": 390042326.0, + "step": 10221 + }, + { + "epoch": 1.3003434677521943, + "ewc_loss": 0.00773219158872962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.732191443210468e-05, + "grad_norm": 3.778615951538086, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8644053339958191, + "num_tokens": 390084443.0, + "step": 10222 + }, + { + "epoch": 
1.3004706780307849, + "ewc_loss": 0.00773976556956768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.739765715086833e-05, + "grad_norm": 3.7658333778381348, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8658778667449951, + "num_tokens": 390126309.0, + "step": 10223 + }, + { + "epoch": 1.3005978883093754, + "ewc_loss": 0.007727037183940411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.72703715483658e-05, + "grad_norm": 3.807743787765503, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8764393329620361, + "num_tokens": 390161463.0, + "step": 10224 + }, + { + "epoch": 1.300725098587966, + "ewc_loss": 0.007755479775369167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.755479600746185e-05, + "grad_norm": 3.7210335731506348, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8841427564620972, + "num_tokens": 390203221.0, + "step": 10225 + }, + { + "epoch": 1.3008523088665565, + "ewc_loss": 0.007686414755880833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.686414755880833e-05, + "grad_norm": 3.794858932495117, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8887953162193298, + "num_tokens": 390243269.0, + "step": 10226 + }, + { + "epoch": 1.300979519145147, + "ewc_loss": 0.007756723091006279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.756723061902449e-05, + "grad_norm": 3.7717485427856445, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.874550461769104, + "num_tokens": 390282488.0, + "step": 10227 + }, + { + "epoch": 1.3011067294237375, + "ewc_loss": 0.0076944706961512566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.694470696151257e-05, + "grad_norm": 3.8362245559692383, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8701589107513428, + "num_tokens": 390320268.0, + "step": 10228 + }, + { + "epoch": 1.3012339397023278, + "ewc_loss": 0.007740662898868322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.740662840660661e-05, + "grad_norm": 3.8326969146728516, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8654896020889282, + "num_tokens": 390354733.0, + "step": 10229 + }, + { + "epoch": 1.3013611499809183, + "ewc_loss": 0.007714367471635342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.714367529843003e-05, + "grad_norm": 3.8318803310394287, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8744474649429321, + "num_tokens": 390388161.0, + "step": 10230 + }, + { + "epoch": 1.3014883602595089, + "ewc_loss": 0.007715826388448477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.715826359344646e-05, + "grad_norm": 3.7939841747283936, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8834244012832642, + "num_tokens": 390424609.0, + "step": 10231 + }, + { + "epoch": 1.3016155705380994, + "ewc_loss": 0.007693595252931118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.69359539845027e-05, + "grad_norm": 3.787275552749634, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.890403687953949, + "num_tokens": 390460809.0, + "step": 10232 + }, + { + "epoch": 1.30174278081669, + "ewc_loss": 0.007691905368119478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.691905193496495e-05, + "grad_norm": 3.7848570346832275, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8639875054359436, + "num_tokens": 390500198.0, + "step": 10233 + }, + { + "epoch": 1.3018699910952805, + "ewc_loss": 0.007683593779802322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.683593867113814e-05, + "grad_norm": 3.770606756210327, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8671479225158691, + "num_tokens": 390545588.0, + "step": 10234 + }, + { + "epoch": 1.301997201373871, + "ewc_loss": 0.007673595100641251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.673595246160403e-05, + "grad_norm": 3.852778196334839, + "learning_rate": 1e-06, + "loss": 0.3998, + 
"mean_token_accuracy": 0.8639211654663086, + "num_tokens": 390581681.0, + "step": 10235 + }, + { + "epoch": 1.3021244116524615, + "ewc_loss": 0.00773065397515893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.730654033366591e-05, + "grad_norm": 3.722815990447998, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8908138275146484, + "num_tokens": 390619122.0, + "step": 10236 + }, + { + "epoch": 1.302251621931052, + "ewc_loss": 0.00764366053044796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.64366050134413e-05, + "grad_norm": 3.758193254470825, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8613473176956177, + "num_tokens": 390667974.0, + "step": 10237 + }, + { + "epoch": 1.3023788322096426, + "ewc_loss": 0.007706891279667616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706891483394429e-05, + "grad_norm": 3.851285696029663, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.875455379486084, + "num_tokens": 390705188.0, + "step": 10238 + }, + { + "epoch": 1.302506042488233, + "ewc_loss": 0.007726302836090326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.726303010713309e-05, + "grad_norm": 3.771693468093872, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8822007179260254, + "num_tokens": 390746892.0, + "step": 10239 + }, + { + "epoch": 1.3026332527668236, + "ewc_loss": 0.007627921178936958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.627921149833128e-05, + "grad_norm": 3.7273478507995605, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8883302807807922, + "num_tokens": 390787727.0, + "step": 10240 + }, + { + "epoch": 1.3027604630454142, + "ewc_loss": 0.007645234931260347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.645235018571839e-05, + "grad_norm": 3.8963215351104736, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8665896058082581, + "num_tokens": 390823702.0, + "step": 10241 + }, + { + "epoch": 
1.3028876733240047, + "ewc_loss": 0.007745820563286543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.745820767013356e-05, + "grad_norm": 3.7898855209350586, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8727521896362305, + "num_tokens": 390860256.0, + "step": 10242 + }, + { + "epoch": 1.3030148836025952, + "ewc_loss": 0.007616469636559486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.616469520144165e-05, + "grad_norm": 3.7628180980682373, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8850730657577515, + "num_tokens": 390901676.0, + "step": 10243 + }, + { + "epoch": 1.3031420938811857, + "ewc_loss": 0.007638975512236357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.638975512236357e-05, + "grad_norm": 3.8151814937591553, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8556939363479614, + "num_tokens": 390936911.0, + "step": 10244 + }, + { + "epoch": 1.303269304159776, + "ewc_loss": 0.007675638422369957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.675638335058466e-05, + "grad_norm": 3.8257639408111572, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8615160584449768, + "num_tokens": 390976185.0, + "step": 10245 + }, + { + "epoch": 1.3033965144383666, + "ewc_loss": 0.007660116534680128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.660116534680128e-05, + "grad_norm": 3.8072638511657715, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.87444007396698, + "num_tokens": 391014661.0, + "step": 10246 + }, + { + "epoch": 1.303523724716957, + "ewc_loss": 0.0076501332223415375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.650133193237707e-05, + "grad_norm": 3.78865122795105, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8663454055786133, + "num_tokens": 391056205.0, + "step": 10247 + }, + { + "epoch": 1.3036509349955476, + "ewc_loss": 0.007652981206774712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.652981003047898e-05, + "grad_norm": 3.823364496231079, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8767094016075134, + "num_tokens": 391092187.0, + "step": 10248 + }, + { + "epoch": 1.3037781452741382, + "ewc_loss": 0.007680335082113743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.680334965698421e-05, + "grad_norm": 3.7791848182678223, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.889441728591919, + "num_tokens": 391130890.0, + "step": 10249 + }, + { + "epoch": 1.3039053555527287, + "ewc_loss": 0.00764009403064847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.640093826921657e-05, + "grad_norm": 3.7661404609680176, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.882681131362915, + "num_tokens": 391172783.0, + "step": 10250 + }, + { + "epoch": 1.3040325658313192, + "ewc_loss": 0.007661627605557442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.661627751076594e-05, + "grad_norm": 3.8129894733428955, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8583918213844299, + "num_tokens": 391215031.0, + "step": 10251 + }, + { + "epoch": 1.3041597761099097, + "ewc_loss": 0.0076904138550162315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.690413622185588e-05, + "grad_norm": 3.818579912185669, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8759397268295288, + "num_tokens": 391251400.0, + "step": 10252 + }, + { + "epoch": 1.3042869863885003, + "ewc_loss": 0.007690082769840956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.690082566114143e-05, + "grad_norm": 3.838606595993042, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8650510311126709, + "num_tokens": 391285895.0, + "step": 10253 + }, + { + "epoch": 1.3044141966670906, + "ewc_loss": 0.007704013958573341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70401384215802e-05, + "grad_norm": 3.757106304168701, + "learning_rate": 1e-06, + "loss": 0.3631, + 
"mean_token_accuracy": 0.8752888441085815, + "num_tokens": 391328361.0, + "step": 10254 + }, + { + "epoch": 1.304541406945681, + "ewc_loss": 0.0076456451788544655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.645645382581279e-05, + "grad_norm": 3.8077149391174316, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.873099684715271, + "num_tokens": 391365019.0, + "step": 10255 + }, + { + "epoch": 1.3046686172242716, + "ewc_loss": 0.007728157564997673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.728157652309164e-05, + "grad_norm": 3.8277528285980225, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8575704097747803, + "num_tokens": 391400407.0, + "step": 10256 + }, + { + "epoch": 1.3047958275028622, + "ewc_loss": 0.007704093120992184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.704093150096014e-05, + "grad_norm": 3.8015811443328857, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8573914766311646, + "num_tokens": 391440660.0, + "step": 10257 + }, + { + "epoch": 1.3049230377814527, + "ewc_loss": 0.007699243724346161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.699243724346161e-05, + "grad_norm": 3.7769556045532227, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8751121163368225, + "num_tokens": 391481096.0, + "step": 10258 + }, + { + "epoch": 1.3050502480600432, + "ewc_loss": 0.007695795502513647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6957956480328e-05, + "grad_norm": 3.966404914855957, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8743768334388733, + "num_tokens": 391516356.0, + "step": 10259 + }, + { + "epoch": 1.3051774583386337, + "ewc_loss": 0.007822216488420963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.822216866770759e-05, + "grad_norm": 3.8343183994293213, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8624618053436279, + "num_tokens": 391554691.0, + "step": 10260 + }, + { + "epoch": 
1.3053046686172243, + "ewc_loss": 0.007664013653993607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.664013537578285e-05, + "grad_norm": 3.759425401687622, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8590875267982483, + "num_tokens": 391592473.0, + "step": 10261 + }, + { + "epoch": 1.3054318788958148, + "ewc_loss": 0.0077041033655405045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.704103336436674e-05, + "grad_norm": 3.7633023262023926, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8782524466514587, + "num_tokens": 391632091.0, + "step": 10262 + }, + { + "epoch": 1.3055590891744053, + "ewc_loss": 0.007728320546448231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.728320633759722e-05, + "grad_norm": 3.7657947540283203, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8663174510002136, + "num_tokens": 391676121.0, + "step": 10263 + }, + { + "epoch": 1.3056862994529959, + "ewc_loss": 0.007721724919974804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.721724978182465e-05, + "grad_norm": 3.8025429248809814, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.870839536190033, + "num_tokens": 391709664.0, + "step": 10264 + }, + { + "epoch": 1.3058135097315864, + "ewc_loss": 0.007740399334579706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.740399450995028e-05, + "grad_norm": 3.743168354034424, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.862816333770752, + "num_tokens": 391755828.0, + "step": 10265 + }, + { + "epoch": 1.305940720010177, + "ewc_loss": 0.007697685156017542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.697685214225203e-05, + "grad_norm": 3.748471975326538, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8884941339492798, + "num_tokens": 391800232.0, + "step": 10266 + }, + { + "epoch": 1.3060679302887674, + "ewc_loss": 0.007714192848652601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.714192906860262e-05, + "grad_norm": 3.7968382835388184, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8755844235420227, + "num_tokens": 391837612.0, + "step": 10267 + }, + { + "epoch": 1.306195140567358, + "ewc_loss": 0.007743611931800842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.74361178628169e-05, + "grad_norm": 3.83925461769104, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8835973739624023, + "num_tokens": 391871193.0, + "step": 10268 + }, + { + "epoch": 1.3063223508459483, + "ewc_loss": 0.007747050374746323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.747050403850153e-05, + "grad_norm": 3.850754499435425, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8640892505645752, + "num_tokens": 391908203.0, + "step": 10269 + }, + { + "epoch": 1.3064495611245388, + "ewc_loss": 0.007742707617580891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.742707384750247e-05, + "grad_norm": 3.8656728267669678, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8785477876663208, + "num_tokens": 391938744.0, + "step": 10270 + }, + { + "epoch": 1.3065767714031293, + "ewc_loss": 0.0077358330599963665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.735833059996367e-05, + "grad_norm": 3.8143205642700195, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8762776255607605, + "num_tokens": 391976020.0, + "step": 10271 + }, + { + "epoch": 1.3067039816817199, + "ewc_loss": 0.007702381815761328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.702381844865158e-05, + "grad_norm": 3.770087957382202, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.863869845867157, + "num_tokens": 392018006.0, + "step": 10272 + }, + { + "epoch": 1.3068311919603104, + "ewc_loss": 0.007688743993639946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.688743789913133e-05, + "grad_norm": 3.810482978820801, + "learning_rate": 1e-06, + "loss": 0.3754, + 
"mean_token_accuracy": 0.8715968728065491, + "num_tokens": 392055358.0, + "step": 10273 + }, + { + "epoch": 1.306958402238901, + "ewc_loss": 0.007734745275229216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.734745304333046e-05, + "grad_norm": 3.7456789016723633, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8711812496185303, + "num_tokens": 392097727.0, + "step": 10274 + }, + { + "epoch": 1.3070856125174914, + "ewc_loss": 0.007688723970204592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.688724144827574e-05, + "grad_norm": 3.7230515480041504, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8779008388519287, + "num_tokens": 392144641.0, + "step": 10275 + }, + { + "epoch": 1.307212822796082, + "ewc_loss": 0.007690758444368839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6907585025765e-05, + "grad_norm": 3.8991684913635254, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8804781436920166, + "num_tokens": 392178810.0, + "step": 10276 + }, + { + "epoch": 1.3073400330746725, + "ewc_loss": 0.007797666825354099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.797667058184743e-05, + "grad_norm": 3.8631818294525146, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8688013553619385, + "num_tokens": 392213571.0, + "step": 10277 + }, + { + "epoch": 1.3074672433532628, + "ewc_loss": 0.007711603306233883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.711603393545374e-05, + "grad_norm": 3.7638564109802246, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8694226741790771, + "num_tokens": 392254628.0, + "step": 10278 + }, + { + "epoch": 1.3075944536318533, + "ewc_loss": 0.007670976687222719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.670976629015058e-05, + "grad_norm": 3.777387857437134, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8724606037139893, + "num_tokens": 392297243.0, + "step": 10279 + }, + { + "epoch": 
1.3077216639104439, + "ewc_loss": 0.0077293445356190205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729344360996038e-05, + "grad_norm": 3.9770421981811523, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8755871653556824, + "num_tokens": 392329116.0, + "step": 10280 + }, + { + "epoch": 1.3078488741890344, + "ewc_loss": 0.007815893739461899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.815894059604034e-05, + "grad_norm": 3.8727173805236816, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8702476024627686, + "num_tokens": 392370542.0, + "step": 10281 + }, + { + "epoch": 1.307976084467625, + "ewc_loss": 0.00767609803006053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.676098175579682e-05, + "grad_norm": 3.7497196197509766, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8772042989730835, + "num_tokens": 392411155.0, + "step": 10282 + }, + { + "epoch": 1.3081032947462155, + "ewc_loss": 0.007654788438230753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.654788350919262e-05, + "grad_norm": 3.77944016456604, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8734917044639587, + "num_tokens": 392450440.0, + "step": 10283 + }, + { + "epoch": 1.308230505024806, + "ewc_loss": 0.007721011992543936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.721011934336275e-05, + "grad_norm": 3.760566473007202, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8826945424079895, + "num_tokens": 392488545.0, + "step": 10284 + }, + { + "epoch": 1.3083577153033965, + "ewc_loss": 0.007676904555410147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.676904351683334e-05, + "grad_norm": 3.7644710540771484, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8724151849746704, + "num_tokens": 392527138.0, + "step": 10285 + }, + { + "epoch": 1.308484925581987, + "ewc_loss": 0.0077016884461045265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.701688446104527e-05, + "grad_norm": 3.797025442123413, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8724784255027771, + "num_tokens": 392568181.0, + "step": 10286 + }, + { + "epoch": 1.3086121358605776, + "ewc_loss": 0.007726978976279497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.726978947175667e-05, + "grad_norm": 3.756554126739502, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8879541158676147, + "num_tokens": 392610900.0, + "step": 10287 + }, + { + "epoch": 1.308739346139168, + "ewc_loss": 0.007687552832067013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.687552715651691e-05, + "grad_norm": 3.8775041103363037, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8651995658874512, + "num_tokens": 392646624.0, + "step": 10288 + }, + { + "epoch": 1.3088665564177586, + "ewc_loss": 0.007766271475702524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.766271301079541e-05, + "grad_norm": 3.786777973175049, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8737176656723022, + "num_tokens": 392683103.0, + "step": 10289 + }, + { + "epoch": 1.3089937666963491, + "ewc_loss": 0.007668378297239542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.668378384551033e-05, + "grad_norm": 3.767914295196533, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8750317096710205, + "num_tokens": 392723169.0, + "step": 10290 + }, + { + "epoch": 1.3091209769749397, + "ewc_loss": 0.007696134503930807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.69613470765762e-05, + "grad_norm": 3.753873109817505, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8872803449630737, + "num_tokens": 392762323.0, + "step": 10291 + }, + { + "epoch": 1.3092481872535302, + "ewc_loss": 0.007681715302169323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.681715214857832e-05, + "grad_norm": 3.7732810974121094, + "learning_rate": 1e-06, + "loss": 0.309, + 
"mean_token_accuracy": 0.8918812274932861, + "num_tokens": 392797555.0, + "step": 10292 + }, + { + "epoch": 1.3093753975321207, + "ewc_loss": 0.007689946331083775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.689946505706757e-05, + "grad_norm": 3.7427499294281006, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8839632272720337, + "num_tokens": 392840001.0, + "step": 10293 + }, + { + "epoch": 1.309502607810711, + "ewc_loss": 0.0076587386429309845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.658738468308002e-05, + "grad_norm": 3.7943360805511475, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8729747533798218, + "num_tokens": 392880871.0, + "step": 10294 + }, + { + "epoch": 1.3096298180893016, + "ewc_loss": 0.007700533606112003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.700533751631156e-05, + "grad_norm": 3.8117315769195557, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8850618600845337, + "num_tokens": 392917554.0, + "step": 10295 + }, + { + "epoch": 1.309757028367892, + "ewc_loss": 0.007683487609028816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.683487638132647e-05, + "grad_norm": 3.755079984664917, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8901534080505371, + "num_tokens": 392958601.0, + "step": 10296 + }, + { + "epoch": 1.3098842386464826, + "ewc_loss": 0.007625631522387266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.625631405971944e-05, + "grad_norm": 3.766068935394287, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8743128180503845, + "num_tokens": 393000108.0, + "step": 10297 + }, + { + "epoch": 1.3100114489250732, + "ewc_loss": 0.007656184025108814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.656183879589662e-05, + "grad_norm": 3.855149507522583, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8936391472816467, + "num_tokens": 393032321.0, + "step": 10298 + }, + { + "epoch": 
1.3101386592036637, + "ewc_loss": 0.007672146428376436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.672146602999419e-05, + "grad_norm": 3.8661699295043945, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8736583590507507, + "num_tokens": 393068347.0, + "step": 10299 + }, + { + "epoch": 1.3102658694822542, + "ewc_loss": 0.007642208132892847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.642208220204338e-05, + "grad_norm": 3.8452346324920654, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8901785016059875, + "num_tokens": 393102509.0, + "step": 10300 + }, + { + "epoch": 1.3103930797608447, + "ewc_loss": 0.0076432437635958195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.643243588972837e-05, + "grad_norm": 3.785985231399536, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8826680183410645, + "num_tokens": 393140974.0, + "step": 10301 + }, + { + "epoch": 1.3105202900394353, + "ewc_loss": 0.007615352515131235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.615352660650387e-05, + "grad_norm": 3.846935272216797, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8823564052581787, + "num_tokens": 393177214.0, + "step": 10302 + }, + { + "epoch": 1.3106475003180256, + "ewc_loss": 0.007682632189244032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.682631985517219e-05, + "grad_norm": 3.7840075492858887, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8773502111434937, + "num_tokens": 393215801.0, + "step": 10303 + }, + { + "epoch": 1.310774710596616, + "ewc_loss": 0.00762563943862915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62563940952532e-05, + "grad_norm": 3.767507314682007, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8844508528709412, + "num_tokens": 393256444.0, + "step": 10304 + }, + { + "epoch": 1.3109019208752066, + "ewc_loss": 0.00763304578140378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.633045606780797e-05, + "grad_norm": 3.7619755268096924, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8695913553237915, + "num_tokens": 393300238.0, + "step": 10305 + }, + { + "epoch": 1.3110291311537972, + "ewc_loss": 0.007640832103788853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.640832336619496e-05, + "grad_norm": 3.7839114665985107, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8622461557388306, + "num_tokens": 393344087.0, + "step": 10306 + }, + { + "epoch": 1.3111563414323877, + "ewc_loss": 0.00766851007938385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.66851007938385e-05, + "grad_norm": 3.8335697650909424, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.878700852394104, + "num_tokens": 393381678.0, + "step": 10307 + }, + { + "epoch": 1.3112835517109782, + "ewc_loss": 0.007677601184695959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.677601388422772e-05, + "grad_norm": 3.875113010406494, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.873522162437439, + "num_tokens": 393415740.0, + "step": 10308 + }, + { + "epoch": 1.3114107619895687, + "ewc_loss": 0.007699456065893173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.699456182308495e-05, + "grad_norm": 3.843888998031616, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8520770072937012, + "num_tokens": 393457692.0, + "step": 10309 + }, + { + "epoch": 1.3115379722681593, + "ewc_loss": 0.007659059949219227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.659060065634549e-05, + "grad_norm": 3.752502918243408, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8701950311660767, + "num_tokens": 393499003.0, + "step": 10310 + }, + { + "epoch": 1.3116651825467498, + "ewc_loss": 0.00763248885050416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.632488996023312e-05, + "grad_norm": 3.8449299335479736, + "learning_rate": 1e-06, + "loss": 0.4015, + 
"mean_token_accuracy": 0.8654775619506836, + "num_tokens": 393533842.0, + "step": 10311 + }, + { + "epoch": 1.3117923928253403, + "ewc_loss": 0.007724201772361994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.724201714154333e-05, + "grad_norm": 3.7508468627929688, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.882226824760437, + "num_tokens": 393573906.0, + "step": 10312 + }, + { + "epoch": 1.3119196031039309, + "ewc_loss": 0.0076318662613630295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.631866174051538e-05, + "grad_norm": 3.7951743602752686, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8724987506866455, + "num_tokens": 393615298.0, + "step": 10313 + }, + { + "epoch": 1.3120468133825214, + "ewc_loss": 0.007704400923103094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.704400923103094e-05, + "grad_norm": 3.83493709564209, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8648949861526489, + "num_tokens": 393654690.0, + "step": 10314 + }, + { + "epoch": 1.312174023661112, + "ewc_loss": 0.007696518208831549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696518150623888e-05, + "grad_norm": 3.7676732540130615, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8766505122184753, + "num_tokens": 393696222.0, + "step": 10315 + }, + { + "epoch": 1.3123012339397024, + "ewc_loss": 0.007651384454220533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.651384657947347e-05, + "grad_norm": 3.787813663482666, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8708831071853638, + "num_tokens": 393736748.0, + "step": 10316 + }, + { + "epoch": 1.312428444218293, + "ewc_loss": 0.007690981030464172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.690981146879494e-05, + "grad_norm": 3.8583874702453613, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8643584251403809, + "num_tokens": 393773077.0, + "step": 10317 + }, + { + "epoch": 
1.3125556544968833, + "ewc_loss": 0.007709991652518511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.709991768933833e-05, + "grad_norm": 3.811960220336914, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8870535492897034, + "num_tokens": 393806466.0, + "step": 10318 + }, + { + "epoch": 1.3126828647754738, + "ewc_loss": 0.007679758593440056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.679758709855378e-05, + "grad_norm": 3.879750967025757, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8714431524276733, + "num_tokens": 393840922.0, + "step": 10319 + }, + { + "epoch": 1.3128100750540643, + "ewc_loss": 0.007733338978141546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.733338861726224e-05, + "grad_norm": 3.8449156284332275, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.878791868686676, + "num_tokens": 393874544.0, + "step": 10320 + }, + { + "epoch": 1.3129372853326549, + "ewc_loss": 0.007696128915995359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696128886891529e-05, + "grad_norm": 3.7419185638427734, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8703881502151489, + "num_tokens": 393917773.0, + "step": 10321 + }, + { + "epoch": 1.3130644956112454, + "ewc_loss": 0.007647192105650902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.647192251170054e-05, + "grad_norm": 3.8646841049194336, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8699564337730408, + "num_tokens": 393951525.0, + "step": 10322 + }, + { + "epoch": 1.313191705889836, + "ewc_loss": 0.007765999063849449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.765999180264771e-05, + "grad_norm": 3.769477128982544, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8796502947807312, + "num_tokens": 393989796.0, + "step": 10323 + }, + { + "epoch": 1.3133189161684264, + "ewc_loss": 0.0076589807868003845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.658980757696554e-05, + "grad_norm": 3.87357234954834, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8728724718093872, + "num_tokens": 394028746.0, + "step": 10324 + }, + { + "epoch": 1.313446126447017, + "ewc_loss": 0.0077517712488770485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.751771045150235e-05, + "grad_norm": 3.802682876586914, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8823992013931274, + "num_tokens": 394065241.0, + "step": 10325 + }, + { + "epoch": 1.3135733367256075, + "ewc_loss": 0.007689247373491526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.689247286180034e-05, + "grad_norm": 3.792201280593872, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8691445589065552, + "num_tokens": 394110113.0, + "step": 10326 + }, + { + "epoch": 1.3137005470041978, + "ewc_loss": 0.00770312175154686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70312180975452e-05, + "grad_norm": 3.812666654586792, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8584071397781372, + "num_tokens": 394146577.0, + "step": 10327 + }, + { + "epoch": 1.3138277572827883, + "ewc_loss": 0.007716905325651169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.71690538385883e-05, + "grad_norm": 3.779606819152832, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8747043609619141, + "num_tokens": 394185392.0, + "step": 10328 + }, + { + "epoch": 1.3139549675613789, + "ewc_loss": 0.007698700297623873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.698700210312381e-05, + "grad_norm": 3.778073310852051, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8663187026977539, + "num_tokens": 394226301.0, + "step": 10329 + }, + { + "epoch": 1.3140821778399694, + "ewc_loss": 0.007711348123848438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.711348007433116e-05, + "grad_norm": 3.8443238735198975, + "learning_rate": 1e-06, + "loss": 0.3915, + 
"mean_token_accuracy": 0.8656898140907288, + "num_tokens": 394262654.0, + "step": 10330 + }, + { + "epoch": 1.31420938811856, + "ewc_loss": 0.00775860296562314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.758603169349954e-05, + "grad_norm": 3.8082847595214844, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8862994909286499, + "num_tokens": 394299185.0, + "step": 10331 + }, + { + "epoch": 1.3143365983971504, + "ewc_loss": 0.00771730113774538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.717301195953041e-05, + "grad_norm": 3.79539155960083, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8755154609680176, + "num_tokens": 394331364.0, + "step": 10332 + }, + { + "epoch": 1.314463808675741, + "ewc_loss": 0.0077222250401973724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.722224836470559e-05, + "grad_norm": 3.780809164047241, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8625310659408569, + "num_tokens": 394375479.0, + "step": 10333 + }, + { + "epoch": 1.3145910189543315, + "ewc_loss": 0.007723695132881403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.723695307504386e-05, + "grad_norm": 3.8173036575317383, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8792375922203064, + "num_tokens": 394414913.0, + "step": 10334 + }, + { + "epoch": 1.314718229232922, + "ewc_loss": 0.00774586945772171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.745869515929371e-05, + "grad_norm": 3.820791244506836, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8748282194137573, + "num_tokens": 394450889.0, + "step": 10335 + }, + { + "epoch": 1.3148454395115126, + "ewc_loss": 0.0077352519147098064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.735251710982993e-05, + "grad_norm": 3.7600693702697754, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8588389754295349, + "num_tokens": 394495280.0, + "step": 10336 + }, + { + "epoch": 
1.314972649790103, + "ewc_loss": 0.0077081285417079926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70812839618884e-05, + "grad_norm": 3.7618048191070557, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.881434977054596, + "num_tokens": 394536811.0, + "step": 10337 + }, + { + "epoch": 1.3150998600686936, + "ewc_loss": 0.007722918875515461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.722918962826952e-05, + "grad_norm": 3.828112840652466, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8714276552200317, + "num_tokens": 394573353.0, + "step": 10338 + }, + { + "epoch": 1.3152270703472841, + "ewc_loss": 0.007777344435453415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.777344580972567e-05, + "grad_norm": 3.9050328731536865, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8663312792778015, + "num_tokens": 394603517.0, + "step": 10339 + }, + { + "epoch": 1.3153542806258747, + "ewc_loss": 0.007775573525577784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.775573612889275e-05, + "grad_norm": 3.7435662746429443, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8796177506446838, + "num_tokens": 394650547.0, + "step": 10340 + }, + { + "epoch": 1.3154814909044652, + "ewc_loss": 0.007664660457521677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.664660370210186e-05, + "grad_norm": 3.816577434539795, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8777902722358704, + "num_tokens": 394684460.0, + "step": 10341 + }, + { + "epoch": 1.3156087011830557, + "ewc_loss": 0.007782070431858301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78207031544298e-05, + "grad_norm": 3.749990463256836, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8888833522796631, + "num_tokens": 394733520.0, + "step": 10342 + }, + { + "epoch": 1.315735911461646, + "ewc_loss": 0.007694245781749487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.694245869060978e-05, + "grad_norm": 3.7743098735809326, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8683565855026245, + "num_tokens": 394775498.0, + "step": 10343 + }, + { + "epoch": 1.3158631217402366, + "ewc_loss": 0.007734432350844145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.734432438155636e-05, + "grad_norm": 3.8260161876678467, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8622406721115112, + "num_tokens": 394815138.0, + "step": 10344 + }, + { + "epoch": 1.315990332018827, + "ewc_loss": 0.007744891569018364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.744891627226025e-05, + "grad_norm": 3.83943772315979, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8665878772735596, + "num_tokens": 394850447.0, + "step": 10345 + }, + { + "epoch": 1.3161175422974176, + "ewc_loss": 0.007731970399618149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.731970254098997e-05, + "grad_norm": 3.7982656955718994, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8683086633682251, + "num_tokens": 394891800.0, + "step": 10346 + }, + { + "epoch": 1.3162447525760081, + "ewc_loss": 0.007720800116658211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.720800203969702e-05, + "grad_norm": 3.8054141998291016, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8810000419616699, + "num_tokens": 394930087.0, + "step": 10347 + }, + { + "epoch": 1.3163719628545987, + "ewc_loss": 0.007733335718512535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.733335951343179e-05, + "grad_norm": 3.881709575653076, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8828420639038086, + "num_tokens": 394964497.0, + "step": 10348 + }, + { + "epoch": 1.3164991731331892, + "ewc_loss": 0.007761147804558277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.761147571727633e-05, + "grad_norm": 3.8256754875183105, + "learning_rate": 1e-06, + "loss": 0.4111, + 
"mean_token_accuracy": 0.858424723148346, + "num_tokens": 395003127.0, + "step": 10349 + }, + { + "epoch": 1.3166263834117797, + "ewc_loss": 0.007703172974288464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70317274145782e-05, + "grad_norm": 3.856518030166626, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8642817735671997, + "num_tokens": 395040570.0, + "step": 10350 + }, + { + "epoch": 1.3167535936903703, + "ewc_loss": 0.007752330508083105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.752330566290766e-05, + "grad_norm": 3.7972183227539062, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8745969533920288, + "num_tokens": 395075205.0, + "step": 10351 + }, + { + "epoch": 1.3168808039689606, + "ewc_loss": 0.00770510034635663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.705100142629817e-05, + "grad_norm": 3.7871813774108887, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8726744651794434, + "num_tokens": 395117492.0, + "step": 10352 + }, + { + "epoch": 1.317008014247551, + "ewc_loss": 0.0077296835370361805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729683420620859e-05, + "grad_norm": 3.734011173248291, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8710150718688965, + "num_tokens": 395162449.0, + "step": 10353 + }, + { + "epoch": 1.3171352245261416, + "ewc_loss": 0.007702139671891928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.702139555476606e-05, + "grad_norm": 3.822512149810791, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8813787698745728, + "num_tokens": 395197310.0, + "step": 10354 + }, + { + "epoch": 1.3172624348047322, + "ewc_loss": 0.007782197091728449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.782196917105466e-05, + "grad_norm": 3.7885098457336426, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8844046592712402, + "num_tokens": 395232420.0, + "step": 10355 + }, + { + "epoch": 
1.3173896450833227, + "ewc_loss": 0.007729374337941408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729374192422256e-05, + "grad_norm": 3.8779690265655518, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8524717092514038, + "num_tokens": 395269392.0, + "step": 10356 + }, + { + "epoch": 1.3175168553619132, + "ewc_loss": 0.007782916538417339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.782916509313509e-05, + "grad_norm": 3.8487138748168945, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8621470928192139, + "num_tokens": 395305496.0, + "step": 10357 + }, + { + "epoch": 1.3176440656405037, + "ewc_loss": 0.007743342779576778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.743342575849965e-05, + "grad_norm": 3.785013198852539, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8826550245285034, + "num_tokens": 395347089.0, + "step": 10358 + }, + { + "epoch": 1.3177712759190943, + "ewc_loss": 0.007716878317296505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716878462815657e-05, + "grad_norm": 3.799757957458496, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8830181360244751, + "num_tokens": 395382885.0, + "step": 10359 + }, + { + "epoch": 1.3178984861976848, + "ewc_loss": 0.00774619122967124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.746191113255918e-05, + "grad_norm": 3.8277699947357178, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8876379132270813, + "num_tokens": 395416260.0, + "step": 10360 + }, + { + "epoch": 1.3180256964762753, + "ewc_loss": 0.007767337374389172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.767337228870019e-05, + "grad_norm": 3.8522143363952637, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8638099431991577, + "num_tokens": 395454179.0, + "step": 10361 + }, + { + "epoch": 1.3181529067548658, + "ewc_loss": 0.007756042759865522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.756042759865522e-05, + "grad_norm": 3.893176317214966, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8630175590515137, + "num_tokens": 395489173.0, + "step": 10362 + }, + { + "epoch": 1.3182801170334564, + "ewc_loss": 0.007780386134982109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.780385931255296e-05, + "grad_norm": 3.8007616996765137, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8673728108406067, + "num_tokens": 395532057.0, + "step": 10363 + }, + { + "epoch": 1.318407327312047, + "ewc_loss": 0.007720903493463993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.720903522567824e-05, + "grad_norm": 3.786064863204956, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.880358874797821, + "num_tokens": 395570321.0, + "step": 10364 + }, + { + "epoch": 1.3185345375906374, + "ewc_loss": 0.007751092314720154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.751092198304832e-05, + "grad_norm": 3.769296407699585, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8920512199401855, + "num_tokens": 395609041.0, + "step": 10365 + }, + { + "epoch": 1.318661747869228, + "ewc_loss": 0.00773846497759223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.738464773865417e-05, + "grad_norm": 3.8314664363861084, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8528136014938354, + "num_tokens": 395649961.0, + "step": 10366 + }, + { + "epoch": 1.3187889581478183, + "ewc_loss": 0.007774715777486563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.774715777486563e-05, + "grad_norm": 3.7627532482147217, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8731794357299805, + "num_tokens": 395691882.0, + "step": 10367 + }, + { + "epoch": 1.3189161684264088, + "ewc_loss": 0.0077070677652955055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.707067561568692e-05, + "grad_norm": 3.826580762863159, + "learning_rate": 1e-06, + "loss": 0.4093, + 
"mean_token_accuracy": 0.8612143993377686, + "num_tokens": 395731496.0, + "step": 10368 + }, + { + "epoch": 1.3190433787049993, + "ewc_loss": 0.007766637951135635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.766638009343296e-05, + "grad_norm": 3.7829971313476562, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8781719207763672, + "num_tokens": 395772618.0, + "step": 10369 + }, + { + "epoch": 1.3191705889835899, + "ewc_loss": 0.007709525991231203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.709526107646525e-05, + "grad_norm": 3.7876381874084473, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8582819104194641, + "num_tokens": 395811907.0, + "step": 10370 + }, + { + "epoch": 1.3192977992621804, + "ewc_loss": 0.007739880587905645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.739880675217137e-05, + "grad_norm": 3.825634241104126, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8694717288017273, + "num_tokens": 395851186.0, + "step": 10371 + }, + { + "epoch": 1.319425009540771, + "ewc_loss": 0.007767700124531984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.767700299154967e-05, + "grad_norm": 3.7777068614959717, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8732743859291077, + "num_tokens": 395893868.0, + "step": 10372 + }, + { + "epoch": 1.3195522198193614, + "ewc_loss": 0.007702925242483616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.702925358898938e-05, + "grad_norm": 3.798016309738159, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.882019579410553, + "num_tokens": 395930519.0, + "step": 10373 + }, + { + "epoch": 1.319679430097952, + "ewc_loss": 0.007741203531622887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.741203444311395e-05, + "grad_norm": 3.804234743118286, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8894067406654358, + "num_tokens": 395965692.0, + "step": 10374 + }, + { + "epoch": 
1.3198066403765425, + "ewc_loss": 0.007737763226032257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.73776337155141e-05, + "grad_norm": 3.854471206665039, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8761163949966431, + "num_tokens": 396002316.0, + "step": 10375 + }, + { + "epoch": 1.3199338506551328, + "ewc_loss": 0.0077340127900242805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.734012615401298e-05, + "grad_norm": 3.741722345352173, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.879889965057373, + "num_tokens": 396043392.0, + "step": 10376 + }, + { + "epoch": 1.3200610609337233, + "ewc_loss": 0.0076474882662296295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.647488382644951e-05, + "grad_norm": 3.8495192527770996, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8710639476776123, + "num_tokens": 396078603.0, + "step": 10377 + }, + { + "epoch": 1.3201882712123139, + "ewc_loss": 0.00774775305762887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.747753261355683e-05, + "grad_norm": 3.7577812671661377, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8650016784667969, + "num_tokens": 396121132.0, + "step": 10378 + }, + { + "epoch": 1.3203154814909044, + "ewc_loss": 0.007651844993233681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.651845226064324e-05, + "grad_norm": 3.870030641555786, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8826993107795715, + "num_tokens": 396155730.0, + "step": 10379 + }, + { + "epoch": 1.320442691769495, + "ewc_loss": 0.007751225493848324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.751225348329172e-05, + "grad_norm": 3.843803644180298, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8592544794082642, + "num_tokens": 396190976.0, + "step": 10380 + }, + { + "epoch": 1.3205699020480854, + "ewc_loss": 0.007702930364757776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.702930452069268e-05, + "grad_norm": 3.829270362854004, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8720851540565491, + "num_tokens": 396229571.0, + "step": 10381 + }, + { + "epoch": 1.320697112326676, + "ewc_loss": 0.007713335566222668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.713335799053311e-05, + "grad_norm": 3.8245689868927, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8734279274940491, + "num_tokens": 396265586.0, + "step": 10382 + }, + { + "epoch": 1.3208243226052665, + "ewc_loss": 0.007713409140706062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.713409286225215e-05, + "grad_norm": 3.825249433517456, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8792387247085571, + "num_tokens": 396303810.0, + "step": 10383 + }, + { + "epoch": 1.320951532883857, + "ewc_loss": 0.007722478825598955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.722478767391294e-05, + "grad_norm": 3.916465997695923, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8925848007202148, + "num_tokens": 396345671.0, + "step": 10384 + }, + { + "epoch": 1.3210787431624476, + "ewc_loss": 0.007793073542416096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.793073746142909e-05, + "grad_norm": 3.7316882610321045, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8898113965988159, + "num_tokens": 396388062.0, + "step": 10385 + }, + { + "epoch": 1.321205953441038, + "ewc_loss": 0.007640373893082142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.640373951289803e-05, + "grad_norm": 3.833235263824463, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8865422010421753, + "num_tokens": 396425189.0, + "step": 10386 + }, + { + "epoch": 1.3213331637196286, + "ewc_loss": 0.007767973933368921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76797387516126e-05, + "grad_norm": 3.8609325885772705, + "learning_rate": 1e-06, + "loss": 0.396, + 
"mean_token_accuracy": 0.863688588142395, + "num_tokens": 396461861.0, + "step": 10387 + }, + { + "epoch": 1.3214603739982191, + "ewc_loss": 0.007735878694802523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.735878898529336e-05, + "grad_norm": 3.7811951637268066, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8842487335205078, + "num_tokens": 396506190.0, + "step": 10388 + }, + { + "epoch": 1.3215875842768097, + "ewc_loss": 0.0076697529293596745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.669752812944353e-05, + "grad_norm": 3.856811761856079, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.87459397315979, + "num_tokens": 396538900.0, + "step": 10389 + }, + { + "epoch": 1.3217147945554002, + "ewc_loss": 0.007753252517431974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.753252430120483e-05, + "grad_norm": 3.795362949371338, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.883859395980835, + "num_tokens": 396577029.0, + "step": 10390 + }, + { + "epoch": 1.3218420048339907, + "ewc_loss": 0.007672165520489216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.672165520489216e-05, + "grad_norm": 3.7931299209594727, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8581643104553223, + "num_tokens": 396616917.0, + "step": 10391 + }, + { + "epoch": 1.321969215112581, + "ewc_loss": 0.00770265469327569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70265469327569e-05, + "grad_norm": 3.7974448204040527, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8837876319885254, + "num_tokens": 396651726.0, + "step": 10392 + }, + { + "epoch": 1.3220964253911716, + "ewc_loss": 0.007715256884694099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.715256651863456e-05, + "grad_norm": 3.8284740447998047, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8610773682594299, + "num_tokens": 396693796.0, + "step": 10393 + }, + { + "epoch": 
1.322223635669762, + "ewc_loss": 0.0077292462810873985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729246135568246e-05, + "grad_norm": 3.8703243732452393, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8633382320404053, + "num_tokens": 396731947.0, + "step": 10394 + }, + { + "epoch": 1.3223508459483526, + "ewc_loss": 0.007731256540864706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.731256482657045e-05, + "grad_norm": 3.8257813453674316, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8769924640655518, + "num_tokens": 396767062.0, + "step": 10395 + }, + { + "epoch": 1.3224780562269431, + "ewc_loss": 0.007706699427217245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706699398113415e-05, + "grad_norm": 3.8490405082702637, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8769804239273071, + "num_tokens": 396803918.0, + "step": 10396 + }, + { + "epoch": 1.3226052665055337, + "ewc_loss": 0.0077127558179199696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.712755905231461e-05, + "grad_norm": 3.8101441860198975, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8475666642189026, + "num_tokens": 396850314.0, + "step": 10397 + }, + { + "epoch": 1.3227324767841242, + "ewc_loss": 0.0076794312335550785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.67943129176274e-05, + "grad_norm": 3.828826427459717, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8670037388801575, + "num_tokens": 396887920.0, + "step": 10398 + }, + { + "epoch": 1.3228596870627147, + "ewc_loss": 0.007706628181040287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706628093728796e-05, + "grad_norm": 3.836233377456665, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8617367744445801, + "num_tokens": 396923369.0, + "step": 10399 + }, + { + "epoch": 1.3229868973413053, + "ewc_loss": 0.007718825712800026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.718825509073213e-05, + "grad_norm": 3.810248613357544, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8711800575256348, + "num_tokens": 396962338.0, + "step": 10400 + }, + { + "epoch": 1.3231141076198956, + "ewc_loss": 0.0076960232108831406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696023385506123e-05, + "grad_norm": 3.8075690269470215, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8830528259277344, + "num_tokens": 396998275.0, + "step": 10401 + }, + { + "epoch": 1.323241317898486, + "ewc_loss": 0.007708825636655092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.708825432928279e-05, + "grad_norm": 3.807955503463745, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8716413378715515, + "num_tokens": 397040034.0, + "step": 10402 + }, + { + "epoch": 1.3233685281770766, + "ewc_loss": 0.007707597222179174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.707597251283005e-05, + "grad_norm": 3.7535996437072754, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8915137052536011, + "num_tokens": 397080549.0, + "step": 10403 + }, + { + "epoch": 1.3234957384556671, + "ewc_loss": 0.007676555775105953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.676555833313614e-05, + "grad_norm": 3.7976512908935547, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8855103850364685, + "num_tokens": 397120803.0, + "step": 10404 + }, + { + "epoch": 1.3236229487342577, + "ewc_loss": 0.007711246144026518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.711246144026518e-05, + "grad_norm": 3.783963680267334, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.877122163772583, + "num_tokens": 397160463.0, + "step": 10405 + }, + { + "epoch": 1.3237501590128482, + "ewc_loss": 0.007680203765630722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.680203998461366e-05, + "grad_norm": 3.7423019409179688, + "learning_rate": 1e-06, + "loss": 0.388, + 
"mean_token_accuracy": 0.8670048117637634, + "num_tokens": 397205425.0, + "step": 10406 + }, + { + "epoch": 1.3238773692914387, + "ewc_loss": 0.0076624732464551926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.662473217351362e-05, + "grad_norm": 3.8663506507873535, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8694196939468384, + "num_tokens": 397238907.0, + "step": 10407 + }, + { + "epoch": 1.3240045795700293, + "ewc_loss": 0.007745703682303429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.745703624095768e-05, + "grad_norm": 3.7867960929870605, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8730175495147705, + "num_tokens": 397281639.0, + "step": 10408 + }, + { + "epoch": 1.3241317898486198, + "ewc_loss": 0.007664171978831291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.664172153454274e-05, + "grad_norm": 3.82185435295105, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8618550896644592, + "num_tokens": 397323350.0, + "step": 10409 + }, + { + "epoch": 1.3242590001272103, + "ewc_loss": 0.007707669865339994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.707670010859147e-05, + "grad_norm": 3.794762372970581, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8689850568771362, + "num_tokens": 397364438.0, + "step": 10410 + }, + { + "epoch": 1.3243862104058008, + "ewc_loss": 0.007680188864469528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.680188718950376e-05, + "grad_norm": 3.8133842945098877, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8622636198997498, + "num_tokens": 397406881.0, + "step": 10411 + }, + { + "epoch": 1.3245134206843914, + "ewc_loss": 0.007683965843170881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.683965668547899e-05, + "grad_norm": 3.7440872192382812, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8738986253738403, + "num_tokens": 397451190.0, + "step": 10412 + }, + { + "epoch": 
1.324640630962982, + "ewc_loss": 0.007635458838194609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.635459041921422e-05, + "grad_norm": 3.791006326675415, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8753026127815247, + "num_tokens": 397491063.0, + "step": 10413 + }, + { + "epoch": 1.3247678412415724, + "ewc_loss": 0.007678907364606857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.678907422814518e-05, + "grad_norm": 3.7971720695495605, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8744145631790161, + "num_tokens": 397529961.0, + "step": 10414 + }, + { + "epoch": 1.324895051520163, + "ewc_loss": 0.007675886619836092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.675886445213109e-05, + "grad_norm": 3.8348774909973145, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8840312957763672, + "num_tokens": 397565616.0, + "step": 10415 + }, + { + "epoch": 1.3250222617987533, + "ewc_loss": 0.007681005168706179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.681005081394687e-05, + "grad_norm": 3.8274502754211426, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8800568580627441, + "num_tokens": 397602223.0, + "step": 10416 + }, + { + "epoch": 1.3251494720773438, + "ewc_loss": 0.0076591139659285545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.659113907720894e-05, + "grad_norm": 3.889406681060791, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8515859842300415, + "num_tokens": 397638910.0, + "step": 10417 + }, + { + "epoch": 1.3252766823559343, + "ewc_loss": 0.007703990675508976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.703990559093654e-05, + "grad_norm": 3.794238567352295, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8661907911300659, + "num_tokens": 397681200.0, + "step": 10418 + }, + { + "epoch": 1.3254038926345248, + "ewc_loss": 0.007621517404913902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.621517579536885e-05, + "grad_norm": 3.753195285797119, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8767019510269165, + "num_tokens": 397724832.0, + "step": 10419 + }, + { + "epoch": 1.3255311029131154, + "ewc_loss": 0.007650351617485285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.650351471966133e-05, + "grad_norm": 3.7934443950653076, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8660933971405029, + "num_tokens": 397765537.0, + "step": 10420 + }, + { + "epoch": 1.325658313191706, + "ewc_loss": 0.007685535121709108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.685535092605278e-05, + "grad_norm": 3.8218777179718018, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8647627234458923, + "num_tokens": 397800945.0, + "step": 10421 + }, + { + "epoch": 1.3257855234702964, + "ewc_loss": 0.007677960209548473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.677960093133152e-05, + "grad_norm": 3.815448760986328, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8792783617973328, + "num_tokens": 397840786.0, + "step": 10422 + }, + { + "epoch": 1.325912733748887, + "ewc_loss": 0.007681531831622124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.681531860725954e-05, + "grad_norm": 3.8477249145507812, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8783938884735107, + "num_tokens": 397875607.0, + "step": 10423 + }, + { + "epoch": 1.3260399440274775, + "ewc_loss": 0.007700965274125338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.700965215917677e-05, + "grad_norm": 3.779787063598633, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8898448944091797, + "num_tokens": 397909524.0, + "step": 10424 + }, + { + "epoch": 1.3261671543060678, + "ewc_loss": 0.00762915937229991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.629159517819062e-05, + "grad_norm": 3.82940411567688, + "learning_rate": 1e-06, + "loss": 0.4154, + 
"mean_token_accuracy": 0.859994113445282, + "num_tokens": 397947862.0, + "step": 10425 + }, + { + "epoch": 1.3262943645846583, + "ewc_loss": 0.007708224467933178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.708224438829347e-05, + "grad_norm": 3.82112717628479, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.886202335357666, + "num_tokens": 397980958.0, + "step": 10426 + }, + { + "epoch": 1.3264215748632489, + "ewc_loss": 0.007678591646254063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.678591646254063e-05, + "grad_norm": 3.79950213432312, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8701249361038208, + "num_tokens": 398021231.0, + "step": 10427 + }, + { + "epoch": 1.3265487851418394, + "ewc_loss": 0.007694872096180916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.69487232901156e-05, + "grad_norm": 3.7986221313476562, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8722591996192932, + "num_tokens": 398059202.0, + "step": 10428 + }, + { + "epoch": 1.32667599542043, + "ewc_loss": 0.007689209654927254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.68920945120044e-05, + "grad_norm": 3.7718265056610107, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8946428894996643, + "num_tokens": 398097121.0, + "step": 10429 + }, + { + "epoch": 1.3268032056990204, + "ewc_loss": 0.007681948598474264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.681948773097247e-05, + "grad_norm": 3.876906394958496, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8733452558517456, + "num_tokens": 398130413.0, + "step": 10430 + }, + { + "epoch": 1.326930415977611, + "ewc_loss": 0.007751744240522385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.751744124107063e-05, + "grad_norm": 3.7866804599761963, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8711575269699097, + "num_tokens": 398172535.0, + "step": 10431 + }, + { + "epoch": 
1.3270576262562015, + "ewc_loss": 0.007657095789909363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.657095557078719e-05, + "grad_norm": 3.8547744750976562, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8755789995193481, + "num_tokens": 398206681.0, + "step": 10432 + }, + { + "epoch": 1.327184836534792, + "ewc_loss": 0.007730486802756786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.730486686341465e-05, + "grad_norm": 3.791497230529785, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8767973184585571, + "num_tokens": 398245206.0, + "step": 10433 + }, + { + "epoch": 1.3273120468133826, + "ewc_loss": 0.00767280301079154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.672802894376218e-05, + "grad_norm": 3.8597662448883057, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8715367913246155, + "num_tokens": 398278902.0, + "step": 10434 + }, + { + "epoch": 1.327439257091973, + "ewc_loss": 0.007746140006929636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.746140181552619e-05, + "grad_norm": 3.8059608936309814, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8756811618804932, + "num_tokens": 398315833.0, + "step": 10435 + }, + { + "epoch": 1.3275664673705636, + "ewc_loss": 0.007684516254812479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.684516458539292e-05, + "grad_norm": 3.8208377361297607, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8586527109146118, + "num_tokens": 398356496.0, + "step": 10436 + }, + { + "epoch": 1.3276936776491541, + "ewc_loss": 0.007727000862360001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.727000775048509e-05, + "grad_norm": 3.805136203765869, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8794859051704407, + "num_tokens": 398393661.0, + "step": 10437 + }, + { + "epoch": 1.3278208879277447, + "ewc_loss": 0.007708220276981592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.708220073254779e-05, + "grad_norm": 3.801668882369995, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8907865285873413, + "num_tokens": 398430463.0, + "step": 10438 + }, + { + "epoch": 1.3279480982063352, + "ewc_loss": 0.0077112819999456406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.711281796218827e-05, + "grad_norm": 3.8348350524902344, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8727790117263794, + "num_tokens": 398470382.0, + "step": 10439 + }, + { + "epoch": 1.3280753084849257, + "ewc_loss": 0.007733969017863274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.733968959655613e-05, + "grad_norm": 3.898169994354248, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8571621775627136, + "num_tokens": 398504793.0, + "step": 10440 + }, + { + "epoch": 1.328202518763516, + "ewc_loss": 0.007747470401227474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.747470226604491e-05, + "grad_norm": 3.830562114715576, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8660719394683838, + "num_tokens": 398541038.0, + "step": 10441 + }, + { + "epoch": 1.3283297290421066, + "ewc_loss": 0.007692357059568167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.692357030464336e-05, + "grad_norm": 3.8322088718414307, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8570899963378906, + "num_tokens": 398578047.0, + "step": 10442 + }, + { + "epoch": 1.328456939320697, + "ewc_loss": 0.007732611149549484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.732611265964806e-05, + "grad_norm": 3.811811685562134, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8634303212165833, + "num_tokens": 398618946.0, + "step": 10443 + }, + { + "epoch": 1.3285841495992876, + "ewc_loss": 0.007711156737059355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.711156649747863e-05, + "grad_norm": 3.808187246322632, + "learning_rate": 1e-06, + "loss": 0.3603, + 
"mean_token_accuracy": 0.879080057144165, + "num_tokens": 398656895.0, + "step": 10444 + }, + { + "epoch": 1.3287113598778781, + "ewc_loss": 0.007725652307271957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.725652540102601e-05, + "grad_norm": 3.787461757659912, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8740543127059937, + "num_tokens": 398696521.0, + "step": 10445 + }, + { + "epoch": 1.3288385701564687, + "ewc_loss": 0.007717951666563749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.717951666563749e-05, + "grad_norm": 3.8024063110351562, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8742364048957825, + "num_tokens": 398735275.0, + "step": 10446 + }, + { + "epoch": 1.3289657804350592, + "ewc_loss": 0.00773445051163435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.734450628049672e-05, + "grad_norm": 3.83510160446167, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8591948747634888, + "num_tokens": 398774753.0, + "step": 10447 + }, + { + "epoch": 1.3290929907136497, + "ewc_loss": 0.007733636535704136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.733636448392645e-05, + "grad_norm": 3.843353271484375, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8670001029968262, + "num_tokens": 398810893.0, + "step": 10448 + }, + { + "epoch": 1.3292202009922403, + "ewc_loss": 0.007726338692009449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.726338662905619e-05, + "grad_norm": 3.8121819496154785, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8678277134895325, + "num_tokens": 398849741.0, + "step": 10449 + }, + { + "epoch": 1.3293474112708306, + "ewc_loss": 0.007722318638116121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.722318696323782e-05, + "grad_norm": 3.8743252754211426, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8691102266311646, + "num_tokens": 398881186.0, + "step": 10450 + }, + { + "epoch": 
1.329474621549421, + "ewc_loss": 0.00777549808844924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.775497942930087e-05, + "grad_norm": 3.783360242843628, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8901578187942505, + "num_tokens": 398918677.0, + "step": 10451 + }, + { + "epoch": 1.3296018318280116, + "ewc_loss": 0.007706783711910248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706783799221739e-05, + "grad_norm": 3.844799518585205, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.87077796459198, + "num_tokens": 398953435.0, + "step": 10452 + }, + { + "epoch": 1.3297290421066021, + "ewc_loss": 0.007771021220833063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.77102104621008e-05, + "grad_norm": 3.825849771499634, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8800520896911621, + "num_tokens": 398991799.0, + "step": 10453 + }, + { + "epoch": 1.3298562523851927, + "ewc_loss": 0.007738243788480759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.738243584753945e-05, + "grad_norm": 3.8506791591644287, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8798186182975769, + "num_tokens": 399023211.0, + "step": 10454 + }, + { + "epoch": 1.3299834626637832, + "ewc_loss": 0.007768584415316582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76858432800509e-05, + "grad_norm": 3.817481756210327, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8704099655151367, + "num_tokens": 399061431.0, + "step": 10455 + }, + { + "epoch": 1.3301106729423737, + "ewc_loss": 0.00774840684607625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.748406642349437e-05, + "grad_norm": 3.878840446472168, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.875105619430542, + "num_tokens": 399097384.0, + "step": 10456 + }, + { + "epoch": 1.3302378832209643, + "ewc_loss": 0.007788403425365686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.788403308950365e-05, + "grad_norm": 3.785083532333374, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8758398294448853, + "num_tokens": 399133935.0, + "step": 10457 + }, + { + "epoch": 1.3303650934995548, + "ewc_loss": 0.007713460363447666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.713460217928514e-05, + "grad_norm": 3.8696749210357666, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8773126602172852, + "num_tokens": 399165709.0, + "step": 10458 + }, + { + "epoch": 1.3304923037781453, + "ewc_loss": 0.007786013185977936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.786013156874105e-05, + "grad_norm": 3.7937734127044678, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8716636896133423, + "num_tokens": 399203548.0, + "step": 10459 + }, + { + "epoch": 1.3306195140567358, + "ewc_loss": 0.007731097284704447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.731097139185295e-05, + "grad_norm": 3.7996997833251953, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8714216947555542, + "num_tokens": 399244341.0, + "step": 10460 + }, + { + "epoch": 1.3307467243353264, + "ewc_loss": 0.0077566239051520824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.756624108878896e-05, + "grad_norm": 3.79790997505188, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8778449296951294, + "num_tokens": 399282284.0, + "step": 10461 + }, + { + "epoch": 1.330873934613917, + "ewc_loss": 0.007754621561616659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.754621765343472e-05, + "grad_norm": 3.7862024307250977, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8865917921066284, + "num_tokens": 399320523.0, + "step": 10462 + }, + { + "epoch": 1.3310011448925074, + "ewc_loss": 0.007739208173006773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.739208376733586e-05, + "grad_norm": 3.77799916267395, + "learning_rate": 1e-06, + "loss": 0.3562, + 
"mean_token_accuracy": 0.8745322227478027, + "num_tokens": 399357489.0, + "step": 10463 + }, + { + "epoch": 1.331128355171098, + "ewc_loss": 0.007747326977550983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.747326890239492e-05, + "grad_norm": 3.834099769592285, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8629100322723389, + "num_tokens": 399393561.0, + "step": 10464 + }, + { + "epoch": 1.3312555654496883, + "ewc_loss": 0.0077780503779649734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.778050348861143e-05, + "grad_norm": 3.8618524074554443, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8811638355255127, + "num_tokens": 399424514.0, + "step": 10465 + }, + { + "epoch": 1.3313827757282788, + "ewc_loss": 0.007758500520139933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.758500578347594e-05, + "grad_norm": 3.80005145072937, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8770943284034729, + "num_tokens": 399461372.0, + "step": 10466 + }, + { + "epoch": 1.3315099860068693, + "ewc_loss": 0.007733426988124847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.733426900813356e-05, + "grad_norm": 3.769944667816162, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.888321042060852, + "num_tokens": 399502503.0, + "step": 10467 + }, + { + "epoch": 1.3316371962854598, + "ewc_loss": 0.007732807192951441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.732806989224628e-05, + "grad_norm": 3.8463141918182373, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.867275595664978, + "num_tokens": 399537836.0, + "step": 10468 + }, + { + "epoch": 1.3317644065640504, + "ewc_loss": 0.007770705036818981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.770705269649625e-05, + "grad_norm": 3.831613063812256, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8692278861999512, + "num_tokens": 399575189.0, + "step": 10469 + }, + { + "epoch": 
1.331891616842641, + "ewc_loss": 0.007727699354290962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.727699266979471e-05, + "grad_norm": 3.8319671154022217, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8566919565200806, + "num_tokens": 399614406.0, + "step": 10470 + }, + { + "epoch": 1.3320188271212314, + "ewc_loss": 0.00773673877120018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.736738916719332e-05, + "grad_norm": 3.8123841285705566, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8784859776496887, + "num_tokens": 399652847.0, + "step": 10471 + }, + { + "epoch": 1.332146037399822, + "ewc_loss": 0.007732895668596029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.73289575590752e-05, + "grad_norm": 3.8577003479003906, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8636201620101929, + "num_tokens": 399691628.0, + "step": 10472 + }, + { + "epoch": 1.3322732476784125, + "ewc_loss": 0.007754312362521887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.75431253714487e-05, + "grad_norm": 3.792912483215332, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8838879466056824, + "num_tokens": 399732875.0, + "step": 10473 + }, + { + "epoch": 1.3324004579570028, + "ewc_loss": 0.007706792559474707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706792530370876e-05, + "grad_norm": 3.829193592071533, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.884777843952179, + "num_tokens": 399769537.0, + "step": 10474 + }, + { + "epoch": 1.3325276682355933, + "ewc_loss": 0.007740935776382685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.740935689071193e-05, + "grad_norm": 3.827265501022339, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8578281402587891, + "num_tokens": 399807757.0, + "step": 10475 + }, + { + "epoch": 1.3326548785141838, + "ewc_loss": 0.007718469947576523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.718469714745879e-05, + "grad_norm": 3.936016082763672, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.880905270576477, + "num_tokens": 399837351.0, + "step": 10476 + }, + { + "epoch": 1.3327820887927744, + "ewc_loss": 0.0077898744493722916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.789874507579952e-05, + "grad_norm": 3.779174327850342, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8911932706832886, + "num_tokens": 399873678.0, + "step": 10477 + }, + { + "epoch": 1.332909299071365, + "ewc_loss": 0.007667988538742065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.667988393222913e-05, + "grad_norm": 3.773270845413208, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8750982284545898, + "num_tokens": 399916206.0, + "step": 10478 + }, + { + "epoch": 1.3330365093499554, + "ewc_loss": 0.007723839953541756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.723840099060908e-05, + "grad_norm": 3.77697491645813, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8863528966903687, + "num_tokens": 399954144.0, + "step": 10479 + }, + { + "epoch": 1.333163719628546, + "ewc_loss": 0.00772877549752593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.728775381110609e-05, + "grad_norm": 3.798314094543457, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8823422193527222, + "num_tokens": 399995865.0, + "step": 10480 + }, + { + "epoch": 1.3332909299071365, + "ewc_loss": 0.007714845705777407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.714845560258254e-05, + "grad_norm": 3.830672025680542, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8763480186462402, + "num_tokens": 400033234.0, + "step": 10481 + }, + { + "epoch": 1.333418140185727, + "ewc_loss": 0.007752243895083666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.752243982395157e-05, + "grad_norm": 3.8498826026916504, + "learning_rate": 1e-06, + "loss": 0.3454, + 
"mean_token_accuracy": 0.8831304311752319, + "num_tokens": 400066842.0, + "step": 10482 + }, + { + "epoch": 1.3335453504643175, + "ewc_loss": 0.007740319240838289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.740319415461272e-05, + "grad_norm": 3.8149313926696777, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8848071098327637, + "num_tokens": 400099380.0, + "step": 10483 + }, + { + "epoch": 1.333672560742908, + "ewc_loss": 0.007718612905591726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.718613051110879e-05, + "grad_norm": 3.759155035018921, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8844923973083496, + "num_tokens": 400137419.0, + "step": 10484 + }, + { + "epoch": 1.3337997710214986, + "ewc_loss": 0.007686061318963766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.686061144340783e-05, + "grad_norm": 3.8563876152038574, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8746206760406494, + "num_tokens": 400171086.0, + "step": 10485 + }, + { + "epoch": 1.3339269813000891, + "ewc_loss": 0.007778534200042486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.778534200042486e-05, + "grad_norm": 3.8293190002441406, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8724414706230164, + "num_tokens": 400206763.0, + "step": 10486 + }, + { + "epoch": 1.3340541915786797, + "ewc_loss": 0.00772809237241745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.728092168690637e-05, + "grad_norm": 3.868753433227539, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8585900068283081, + "num_tokens": 400242708.0, + "step": 10487 + }, + { + "epoch": 1.3341814018572702, + "ewc_loss": 0.007757043000310659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.757043204037473e-05, + "grad_norm": 3.871553897857666, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8541601896286011, + "num_tokens": 400277417.0, + "step": 10488 + }, + { + "epoch": 
1.3343086121358605, + "ewc_loss": 0.007765570655465126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.765570626361296e-05, + "grad_norm": 3.787710666656494, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8747638463973999, + "num_tokens": 400315811.0, + "step": 10489 + }, + { + "epoch": 1.334435822414451, + "ewc_loss": 0.007714938838034868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.714938692515716e-05, + "grad_norm": 3.7890050411224365, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8555619716644287, + "num_tokens": 400356321.0, + "step": 10490 + }, + { + "epoch": 1.3345630326930416, + "ewc_loss": 0.007749822456389666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.749822543701157e-05, + "grad_norm": 3.8377954959869385, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8752039670944214, + "num_tokens": 400392382.0, + "step": 10491 + }, + { + "epoch": 1.334690242971632, + "ewc_loss": 0.007785263005644083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.785263005644083e-05, + "grad_norm": 3.801727533340454, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8806176781654358, + "num_tokens": 400434947.0, + "step": 10492 + }, + { + "epoch": 1.3348174532502226, + "ewc_loss": 0.007738267537206411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.738267595414072e-05, + "grad_norm": 3.8275012969970703, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8567469716072083, + "num_tokens": 400472742.0, + "step": 10493 + }, + { + "epoch": 1.3349446635288131, + "ewc_loss": 0.007770581636577845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.770581578370184e-05, + "grad_norm": 3.7657952308654785, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8877893686294556, + "num_tokens": 400510029.0, + "step": 10494 + }, + { + "epoch": 1.3350718738074037, + "ewc_loss": 0.007722274865955114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.722275040578097e-05, + "grad_norm": 3.8646154403686523, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8698921203613281, + "num_tokens": 400547383.0, + "step": 10495 + }, + { + "epoch": 1.3351990840859942, + "ewc_loss": 0.007800532970577478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.80053305788897e-05, + "grad_norm": 3.7718167304992676, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8846879005432129, + "num_tokens": 400585242.0, + "step": 10496 + }, + { + "epoch": 1.3353262943645847, + "ewc_loss": 0.007711285259574652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.711285434197634e-05, + "grad_norm": 3.8128533363342285, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8860503435134888, + "num_tokens": 400623117.0, + "step": 10497 + }, + { + "epoch": 1.3354535046431752, + "ewc_loss": 0.007777176331728697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.77717650635168e-05, + "grad_norm": 3.8374171257019043, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8637472987174988, + "num_tokens": 400660170.0, + "step": 10498 + }, + { + "epoch": 1.3355807149217656, + "ewc_loss": 0.007763396482914686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.763396570226178e-05, + "grad_norm": 3.7976083755493164, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8787457346916199, + "num_tokens": 400700467.0, + "step": 10499 + }, + { + "epoch": 1.335707925200356, + "ewc_loss": 0.007716798223555088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716798427281901e-05, + "grad_norm": 3.816635847091675, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8675970435142517, + "num_tokens": 400740269.0, + "step": 10500 + }, + { + "epoch": 1.3358351354789466, + "ewc_loss": 0.007732995320111513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.732995436526835e-05, + "grad_norm": 3.8041224479675293, + "learning_rate": 1e-06, + "loss": 0.3445, + 
"mean_token_accuracy": 0.8839632868766785, + "num_tokens": 400776465.0, + "step": 10501 + }, + { + "epoch": 1.3359623457575371, + "ewc_loss": 0.0077290344052016735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729034405201674e-05, + "grad_norm": 3.845776319503784, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8719307780265808, + "num_tokens": 400811210.0, + "step": 10502 + }, + { + "epoch": 1.3360895560361277, + "ewc_loss": 0.007750064134597778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.750064105493948e-05, + "grad_norm": 3.8326733112335205, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8665938377380371, + "num_tokens": 400848820.0, + "step": 10503 + }, + { + "epoch": 1.3362167663147182, + "ewc_loss": 0.007724524009972811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.724524039076641e-05, + "grad_norm": 3.75661039352417, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8760421276092529, + "num_tokens": 400888774.0, + "step": 10504 + }, + { + "epoch": 1.3363439765933087, + "ewc_loss": 0.0076921246945858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.692124927416444e-05, + "grad_norm": 3.7432572841644287, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8767529129981995, + "num_tokens": 400935067.0, + "step": 10505 + }, + { + "epoch": 1.3364711868718993, + "ewc_loss": 0.007728469558060169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.728469790890813e-05, + "grad_norm": 3.8295092582702637, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8832957744598389, + "num_tokens": 400974122.0, + "step": 10506 + }, + { + "epoch": 1.3365983971504898, + "ewc_loss": 0.00775853032246232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.758530409773812e-05, + "grad_norm": 3.8010919094085693, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8822312355041504, + "num_tokens": 401008911.0, + "step": 10507 + }, + { + "epoch": 
1.3367256074290803, + "ewc_loss": 0.007707830984145403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70783080952242e-05, + "grad_norm": 3.8578684329986572, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8797993659973145, + "num_tokens": 401039453.0, + "step": 10508 + }, + { + "epoch": 1.3368528177076708, + "ewc_loss": 0.007751659490168095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.751659722998738e-05, + "grad_norm": 3.776519775390625, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8816083073616028, + "num_tokens": 401075854.0, + "step": 10509 + }, + { + "epoch": 1.3369800279862614, + "ewc_loss": 0.007662965916097164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.662965799681842e-05, + "grad_norm": 3.800679922103882, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8718724846839905, + "num_tokens": 401114922.0, + "step": 10510 + }, + { + "epoch": 1.337107238264852, + "ewc_loss": 0.0077165355905890465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716535765212029e-05, + "grad_norm": 3.7762985229492188, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8828274607658386, + "num_tokens": 401155833.0, + "step": 10511 + }, + { + "epoch": 1.3372344485434424, + "ewc_loss": 0.007700555492192507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.700555579503998e-05, + "grad_norm": 3.77315616607666, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8811601400375366, + "num_tokens": 401196971.0, + "step": 10512 + }, + { + "epoch": 1.337361658822033, + "ewc_loss": 0.0076920464634895325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.69204634707421e-05, + "grad_norm": 3.8349599838256836, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8433858156204224, + "num_tokens": 401237312.0, + "step": 10513 + }, + { + "epoch": 1.3374888691006233, + "ewc_loss": 0.00773785263299942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.737852865830064e-05, + "grad_norm": 3.8469629287719727, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8712785840034485, + "num_tokens": 401274630.0, + "step": 10514 + }, + { + "epoch": 1.3376160793792138, + "ewc_loss": 0.007712766993790865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.712766819167882e-05, + "grad_norm": 3.8265528678894043, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8802348971366882, + "num_tokens": 401311780.0, + "step": 10515 + }, + { + "epoch": 1.3377432896578043, + "ewc_loss": 0.00769598176702857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.695981912547722e-05, + "grad_norm": 3.7980523109436035, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8826150894165039, + "num_tokens": 401352614.0, + "step": 10516 + }, + { + "epoch": 1.3378704999363948, + "ewc_loss": 0.007694148924201727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.69414909882471e-05, + "grad_norm": 3.8207015991210938, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8764989376068115, + "num_tokens": 401394495.0, + "step": 10517 + }, + { + "epoch": 1.3379977102149854, + "ewc_loss": 0.007706369273364544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706369069637731e-05, + "grad_norm": 3.9107489585876465, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8618900775909424, + "num_tokens": 401427758.0, + "step": 10518 + }, + { + "epoch": 1.338124920493576, + "ewc_loss": 0.007754899561405182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.754899706924334e-05, + "grad_norm": 3.7811732292175293, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.859283447265625, + "num_tokens": 401473868.0, + "step": 10519 + }, + { + "epoch": 1.3382521307721664, + "ewc_loss": 0.007647375576198101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.647375605301932e-05, + "grad_norm": 3.8609426021575928, + "learning_rate": 1e-06, + "loss": 0.4261, + 
"mean_token_accuracy": 0.8578737378120422, + "num_tokens": 401510459.0, + "step": 10520 + }, + { + "epoch": 1.338379341050757, + "ewc_loss": 0.007767150178551674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.767150236759335e-05, + "grad_norm": 3.9204516410827637, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8459539413452148, + "num_tokens": 401545283.0, + "step": 10521 + }, + { + "epoch": 1.3385065513293475, + "ewc_loss": 0.0077676852233707905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.767685019643977e-05, + "grad_norm": 3.8856277465820312, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8765774369239807, + "num_tokens": 401577166.0, + "step": 10522 + }, + { + "epoch": 1.3386337616079378, + "ewc_loss": 0.007736715022474527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.736714906059206e-05, + "grad_norm": 3.741483449935913, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.886901319026947, + "num_tokens": 401619075.0, + "step": 10523 + }, + { + "epoch": 1.3387609718865283, + "ewc_loss": 0.00768328458070755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.683284638915211e-05, + "grad_norm": 3.789111852645874, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8684962391853333, + "num_tokens": 401661286.0, + "step": 10524 + }, + { + "epoch": 1.3388881821651188, + "ewc_loss": 0.007769810501486063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76981032686308e-05, + "grad_norm": 3.8288960456848145, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8667837977409363, + "num_tokens": 401699672.0, + "step": 10525 + }, + { + "epoch": 1.3390153924437094, + "ewc_loss": 0.007762902416288853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.762902532704175e-05, + "grad_norm": 3.8468194007873535, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8686875104904175, + "num_tokens": 401732676.0, + "step": 10526 + }, + { + "epoch": 
1.3391426027223, + "ewc_loss": 0.0077664852142333984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.766485214233398e-05, + "grad_norm": 3.787271022796631, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8721330165863037, + "num_tokens": 401771593.0, + "step": 10527 + }, + { + "epoch": 1.3392698130008904, + "ewc_loss": 0.007756509818136692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.756509876344353e-05, + "grad_norm": 3.864424228668213, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8610552549362183, + "num_tokens": 401809605.0, + "step": 10528 + }, + { + "epoch": 1.339397023279481, + "ewc_loss": 0.007821512408554554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.821512554073706e-05, + "grad_norm": 3.836484670639038, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8646090030670166, + "num_tokens": 401845349.0, + "step": 10529 + }, + { + "epoch": 1.3395242335580715, + "ewc_loss": 0.007786520291119814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.786520291119814e-05, + "grad_norm": 3.7659316062927246, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8964914083480835, + "num_tokens": 401883186.0, + "step": 10530 + }, + { + "epoch": 1.339651443836662, + "ewc_loss": 0.007742725778371096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.742725574644282e-05, + "grad_norm": 3.8464741706848145, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.857588529586792, + "num_tokens": 401921149.0, + "step": 10531 + }, + { + "epoch": 1.3397786541152525, + "ewc_loss": 0.007817687466740608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.81768758315593e-05, + "grad_norm": 3.7566134929656982, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8673684597015381, + "num_tokens": 401961813.0, + "step": 10532 + }, + { + "epoch": 1.339905864393843, + "ewc_loss": 0.007749691605567932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.749691576464102e-05, + "grad_norm": 3.808320999145508, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8575316667556763, + "num_tokens": 402001212.0, + "step": 10533 + }, + { + "epoch": 1.3400330746724336, + "ewc_loss": 0.007827827706933022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.827827357687056e-05, + "grad_norm": 3.78999400138855, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8655617237091064, + "num_tokens": 402044648.0, + "step": 10534 + }, + { + "epoch": 1.3401602849510241, + "ewc_loss": 0.00778306694701314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.783067121636122e-05, + "grad_norm": 3.7663421630859375, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8782119154930115, + "num_tokens": 402087391.0, + "step": 10535 + }, + { + "epoch": 1.3402874952296147, + "ewc_loss": 0.007787767332047224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.787767390254885e-05, + "grad_norm": 3.832235813140869, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8943023085594177, + "num_tokens": 402119598.0, + "step": 10536 + }, + { + "epoch": 1.3404147055082052, + "ewc_loss": 0.007815803401172161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.815803110133857e-05, + "grad_norm": 3.8962135314941406, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8511255979537964, + "num_tokens": 402153628.0, + "step": 10537 + }, + { + "epoch": 1.3405419157867955, + "ewc_loss": 0.007835795171558857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.835795258870348e-05, + "grad_norm": 3.7611706256866455, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8890851736068726, + "num_tokens": 402190876.0, + "step": 10538 + }, + { + "epoch": 1.340669126065386, + "ewc_loss": 0.00774861965328455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.748619827907532e-05, + "grad_norm": 3.797464370727539, + "learning_rate": 1e-06, + "loss": 0.3311, + 
"mean_token_accuracy": 0.8831179141998291, + "num_tokens": 402227168.0, + "step": 10539 + }, + { + "epoch": 1.3407963363439765, + "ewc_loss": 0.00781073234975338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.810732495272532e-05, + "grad_norm": 3.833801507949829, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8837665915489197, + "num_tokens": 402262731.0, + "step": 10540 + }, + { + "epoch": 1.340923546622567, + "ewc_loss": 0.007806882262229919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.806882058503106e-05, + "grad_norm": 3.8657114505767822, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8752144575119019, + "num_tokens": 402299992.0, + "step": 10541 + }, + { + "epoch": 1.3410507569011576, + "ewc_loss": 0.007790094241499901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.790094241499901e-05, + "grad_norm": 3.7836151123046875, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8669485449790955, + "num_tokens": 402339129.0, + "step": 10542 + }, + { + "epoch": 1.3411779671797481, + "ewc_loss": 0.007740291766822338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.740291766822338e-05, + "grad_norm": 3.8568906784057617, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8576818704605103, + "num_tokens": 402375380.0, + "step": 10543 + }, + { + "epoch": 1.3413051774583387, + "ewc_loss": 0.007808179594576359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.808179361745715e-05, + "grad_norm": 3.8311777114868164, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8616646528244019, + "num_tokens": 402412159.0, + "step": 10544 + }, + { + "epoch": 1.3414323877369292, + "ewc_loss": 0.007765626069158316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.765625923639163e-05, + "grad_norm": 3.8829965591430664, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8713377118110657, + "num_tokens": 402446676.0, + "step": 10545 + }, + { + "epoch": 
1.3415595980155197, + "ewc_loss": 0.007794540375471115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.794540579197928e-05, + "grad_norm": 3.7436087131500244, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8746984004974365, + "num_tokens": 402490741.0, + "step": 10546 + }, + { + "epoch": 1.3416868082941102, + "ewc_loss": 0.007718177977949381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.71817794884555e-05, + "grad_norm": 3.827981948852539, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8835086822509766, + "num_tokens": 402527398.0, + "step": 10547 + }, + { + "epoch": 1.3418140185727006, + "ewc_loss": 0.007808258756995201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.80825866968371e-05, + "grad_norm": 3.8061206340789795, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8736932277679443, + "num_tokens": 402565914.0, + "step": 10548 + }, + { + "epoch": 1.341941228851291, + "ewc_loss": 0.007740844041109085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.740844012005255e-05, + "grad_norm": 3.863767623901367, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.877399206161499, + "num_tokens": 402600671.0, + "step": 10549 + }, + { + "epoch": 1.3420684391298816, + "ewc_loss": 0.007799382787197828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.799382728990167e-05, + "grad_norm": 3.8236422538757324, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8862411975860596, + "num_tokens": 402637381.0, + "step": 10550 + }, + { + "epoch": 1.3421956494084721, + "ewc_loss": 0.007743609603494406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.743609603494406e-05, + "grad_norm": 3.8725714683532715, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.877443790435791, + "num_tokens": 402671190.0, + "step": 10551 + }, + { + "epoch": 1.3423228596870627, + "ewc_loss": 0.0077971648424863815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.797165017109364e-05, + "grad_norm": 3.741204261779785, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8734151124954224, + "num_tokens": 402721518.0, + "step": 10552 + }, + { + "epoch": 1.3424500699656532, + "ewc_loss": 0.007697426714003086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.697426917729899e-05, + "grad_norm": 3.783783435821533, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8755813241004944, + "num_tokens": 402758949.0, + "step": 10553 + }, + { + "epoch": 1.3425772802442437, + "ewc_loss": 0.007769033778458834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.769033982185647e-05, + "grad_norm": 3.864957571029663, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8692212700843811, + "num_tokens": 402794342.0, + "step": 10554 + }, + { + "epoch": 1.3427044905228342, + "ewc_loss": 0.007783954031765461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.783954060869291e-05, + "grad_norm": 3.828004837036133, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8854272961616516, + "num_tokens": 402831031.0, + "step": 10555 + }, + { + "epoch": 1.3428317008014248, + "ewc_loss": 0.007735687308013439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.735687540844083e-05, + "grad_norm": 3.8203299045562744, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.883678674697876, + "num_tokens": 402871486.0, + "step": 10556 + }, + { + "epoch": 1.3429589110800153, + "ewc_loss": 0.007735786493867636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.735786493867636e-05, + "grad_norm": 3.761749744415283, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8869219422340393, + "num_tokens": 402916024.0, + "step": 10557 + }, + { + "epoch": 1.3430861213586058, + "ewc_loss": 0.0077054463326931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.705446478212252e-05, + "grad_norm": 3.9208807945251465, + "learning_rate": 1e-06, + "loss": 0.4246, + 
"mean_token_accuracy": 0.8571724891662598, + "num_tokens": 402952116.0, + "step": 10558 + }, + { + "epoch": 1.3432133316371964, + "ewc_loss": 0.007806242443621159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.80624250182882e-05, + "grad_norm": 3.753260374069214, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8729430437088013, + "num_tokens": 402998064.0, + "step": 10559 + }, + { + "epoch": 1.343340541915787, + "ewc_loss": 0.007627440150827169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.62744020903483e-05, + "grad_norm": 3.7946789264678955, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8774888515472412, + "num_tokens": 403041763.0, + "step": 10560 + }, + { + "epoch": 1.3434677521943774, + "ewc_loss": 0.007726672105491161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.726671901764348e-05, + "grad_norm": 3.7911431789398193, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8861187696456909, + "num_tokens": 403080956.0, + "step": 10561 + }, + { + "epoch": 1.343594962472968, + "ewc_loss": 0.007687780540436506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.687780453125015e-05, + "grad_norm": 3.8090298175811768, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8825867176055908, + "num_tokens": 403117200.0, + "step": 10562 + }, + { + "epoch": 1.3437221727515583, + "ewc_loss": 0.007712038699537516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.712038495810702e-05, + "grad_norm": 3.8452675342559814, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8743504285812378, + "num_tokens": 403154112.0, + "step": 10563 + }, + { + "epoch": 1.3438493830301488, + "ewc_loss": 0.007697541732341051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.697541877860203e-05, + "grad_norm": 3.8217520713806152, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8727162480354309, + "num_tokens": 403188265.0, + "step": 10564 + }, + { + "epoch": 
1.3439765933087393, + "ewc_loss": 0.007698444649577141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.698444824200124e-05, + "grad_norm": 3.774177074432373, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8518356084823608, + "num_tokens": 403231236.0, + "step": 10565 + }, + { + "epoch": 1.3441038035873298, + "ewc_loss": 0.007668292615562677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.668292528251186e-05, + "grad_norm": 3.7911264896392822, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8755496144294739, + "num_tokens": 403268642.0, + "step": 10566 + }, + { + "epoch": 1.3442310138659204, + "ewc_loss": 0.007705688942223787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.705688767600805e-05, + "grad_norm": 3.8017566204071045, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8822041153907776, + "num_tokens": 403307679.0, + "step": 10567 + }, + { + "epoch": 1.344358224144511, + "ewc_loss": 0.007696748711168766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696748798480257e-05, + "grad_norm": 3.821894645690918, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8781893253326416, + "num_tokens": 403346395.0, + "step": 10568 + }, + { + "epoch": 1.3444854344231014, + "ewc_loss": 0.007708015851676464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70801561884582e-05, + "grad_norm": 3.8358161449432373, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8833960294723511, + "num_tokens": 403383001.0, + "step": 10569 + }, + { + "epoch": 1.344612644701692, + "ewc_loss": 0.007710020989179611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.71002087276429e-05, + "grad_norm": 3.7786803245544434, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8665781021118164, + "num_tokens": 403426090.0, + "step": 10570 + }, + { + "epoch": 1.3447398549802825, + "ewc_loss": 0.007677614688873291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.677614485146478e-05, + "grad_norm": 3.794010877609253, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8671325445175171, + "num_tokens": 403470231.0, + "step": 10571 + }, + { + "epoch": 1.3448670652588728, + "ewc_loss": 0.007687088567763567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.687088509555906e-05, + "grad_norm": 3.8237922191619873, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8796085119247437, + "num_tokens": 403508677.0, + "step": 10572 + }, + { + "epoch": 1.3449942755374633, + "ewc_loss": 0.007722868118435144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.722868031123653e-05, + "grad_norm": 3.883708953857422, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8573393821716309, + "num_tokens": 403545175.0, + "step": 10573 + }, + { + "epoch": 1.3451214858160538, + "ewc_loss": 0.007730397861450911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.730397919658571e-05, + "grad_norm": 3.766637086868286, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8868505954742432, + "num_tokens": 403583321.0, + "step": 10574 + }, + { + "epoch": 1.3452486960946444, + "ewc_loss": 0.007647727150470018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.647727034054697e-05, + "grad_norm": 3.8603646755218506, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8685088157653809, + "num_tokens": 403618547.0, + "step": 10575 + }, + { + "epoch": 1.345375906373235, + "ewc_loss": 0.007742991670966148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.742991874692962e-05, + "grad_norm": 3.811147928237915, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8768250942230225, + "num_tokens": 403657552.0, + "step": 10576 + }, + { + "epoch": 1.3455031166518254, + "ewc_loss": 0.007682138122618198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.682137947995216e-05, + "grad_norm": 3.8132996559143066, + "learning_rate": 1e-06, + "loss": 0.3627, + 
"mean_token_accuracy": 0.8772680163383484, + "num_tokens": 403697021.0, + "step": 10577 + }, + { + "epoch": 1.345630326930416, + "ewc_loss": 0.007710294332355261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.710294448770583e-05, + "grad_norm": 3.8323113918304443, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8642945885658264, + "num_tokens": 403736480.0, + "step": 10578 + }, + { + "epoch": 1.3457575372090065, + "ewc_loss": 0.007716446183621883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716446270933375e-05, + "grad_norm": 3.8505728244781494, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8801522850990295, + "num_tokens": 403772056.0, + "step": 10579 + }, + { + "epoch": 1.345884747487597, + "ewc_loss": 0.007706575561314821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706575706833974e-05, + "grad_norm": 3.7762789726257324, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8695046901702881, + "num_tokens": 403813646.0, + "step": 10580 + }, + { + "epoch": 1.3460119577661875, + "ewc_loss": 0.007664991542696953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.664991426281631e-05, + "grad_norm": 3.7995119094848633, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8763571381568909, + "num_tokens": 403853904.0, + "step": 10581 + }, + { + "epoch": 1.346139168044778, + "ewc_loss": 0.007692125625908375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.692125655012205e-05, + "grad_norm": 3.8268070220947266, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8597477078437805, + "num_tokens": 403898528.0, + "step": 10582 + }, + { + "epoch": 1.3462663783233686, + "ewc_loss": 0.0077135153114795685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.713515515206382e-05, + "grad_norm": 3.954624652862549, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8610180616378784, + "num_tokens": 403929518.0, + "step": 10583 + }, + { + "epoch": 
1.3463935886019591, + "ewc_loss": 0.007770163007080555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.770163210807368e-05, + "grad_norm": 3.7878060340881348, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8770921230316162, + "num_tokens": 403970323.0, + "step": 10584 + }, + { + "epoch": 1.3465207988805497, + "ewc_loss": 0.007639304269105196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.639304385520518e-05, + "grad_norm": 3.797556161880493, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8761515617370605, + "num_tokens": 404012499.0, + "step": 10585 + }, + { + "epoch": 1.3466480091591402, + "ewc_loss": 0.007711706683039665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.711706712143496e-05, + "grad_norm": 3.8243777751922607, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8676998615264893, + "num_tokens": 404055370.0, + "step": 10586 + }, + { + "epoch": 1.3467752194377305, + "ewc_loss": 0.007735325023531914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.735325198154896e-05, + "grad_norm": 3.8298614025115967, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8740100860595703, + "num_tokens": 404092498.0, + "step": 10587 + }, + { + "epoch": 1.346902429716321, + "ewc_loss": 0.007707920391112566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.707920303801075e-05, + "grad_norm": 3.793200731277466, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8800280690193176, + "num_tokens": 404132466.0, + "step": 10588 + }, + { + "epoch": 1.3470296399949115, + "ewc_loss": 0.00769507372751832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.695073873037472e-05, + "grad_norm": 3.8016974925994873, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8739204406738281, + "num_tokens": 404171913.0, + "step": 10589 + }, + { + "epoch": 1.347156850273502, + "ewc_loss": 0.007705784868448973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.705784810241312e-05, + "grad_norm": 3.850125551223755, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8877959847450256, + "num_tokens": 404205817.0, + "step": 10590 + }, + { + "epoch": 1.3472840605520926, + "ewc_loss": 0.007722245063632727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.722245209151879e-05, + "grad_norm": 3.820598840713501, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8770513534545898, + "num_tokens": 404243922.0, + "step": 10591 + }, + { + "epoch": 1.3474112708306831, + "ewc_loss": 0.0076875025406479836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.687502511544153e-05, + "grad_norm": 3.8199000358581543, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8784667253494263, + "num_tokens": 404283392.0, + "step": 10592 + }, + { + "epoch": 1.3475384811092737, + "ewc_loss": 0.0076841507107019424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.684150477871299e-05, + "grad_norm": 3.8258397579193115, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8491164445877075, + "num_tokens": 404323044.0, + "step": 10593 + }, + { + "epoch": 1.3476656913878642, + "ewc_loss": 0.007700311951339245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.700311834923923e-05, + "grad_norm": 3.8206369876861572, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8805042505264282, + "num_tokens": 404359761.0, + "step": 10594 + }, + { + "epoch": 1.3477929016664547, + "ewc_loss": 0.007708963472396135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.708963676122949e-05, + "grad_norm": 3.9169132709503174, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8624779582023621, + "num_tokens": 404395354.0, + "step": 10595 + }, + { + "epoch": 1.3479201119450452, + "ewc_loss": 0.007753899320960045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.753899262752384e-05, + "grad_norm": 3.8807222843170166, + "learning_rate": 1e-06, + "loss": 0.4389, + 
"mean_token_accuracy": 0.8566527962684631, + "num_tokens": 404432052.0, + "step": 10596 + }, + { + "epoch": 1.3480473222236355, + "ewc_loss": 0.007712710648775101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.712710794294253e-05, + "grad_norm": 3.832076072692871, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8663501143455505, + "num_tokens": 404472689.0, + "step": 10597 + }, + { + "epoch": 1.348174532502226, + "ewc_loss": 0.0077170031145215034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.71700288169086e-05, + "grad_norm": 3.786762237548828, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8592774868011475, + "num_tokens": 404515916.0, + "step": 10598 + }, + { + "epoch": 1.3483017427808166, + "ewc_loss": 0.007698400411754847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.698400440858677e-05, + "grad_norm": 3.7677249908447266, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8729763031005859, + "num_tokens": 404557399.0, + "step": 10599 + }, + { + "epoch": 1.3484289530594071, + "ewc_loss": 0.007721576374024153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.721576548647135e-05, + "grad_norm": 3.8449220657348633, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8705906867980957, + "num_tokens": 404592573.0, + "step": 10600 + }, + { + "epoch": 1.3485561633379977, + "ewc_loss": 0.007763484492897987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.763484609313309e-05, + "grad_norm": 3.813821792602539, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8704344034194946, + "num_tokens": 404635266.0, + "step": 10601 + }, + { + "epoch": 1.3486833736165882, + "ewc_loss": 0.007714135572314262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.71413542679511e-05, + "grad_norm": 3.7754275798797607, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8831813335418701, + "num_tokens": 404673004.0, + "step": 10602 + }, + { + "epoch": 
1.3488105838951787, + "ewc_loss": 0.007729361765086651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729361823294312e-05, + "grad_norm": 3.959620714187622, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8606435656547546, + "num_tokens": 404705153.0, + "step": 10603 + }, + { + "epoch": 1.3489377941737692, + "ewc_loss": 0.007849158719182014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.849158282624558e-05, + "grad_norm": 3.9402072429656982, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8653877973556519, + "num_tokens": 404737068.0, + "step": 10604 + }, + { + "epoch": 1.3490650044523598, + "ewc_loss": 0.007738461252301931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.73846113588661e-05, + "grad_norm": 3.8616747856140137, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8545176982879639, + "num_tokens": 404773680.0, + "step": 10605 + }, + { + "epoch": 1.3491922147309503, + "ewc_loss": 0.0077396878041327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.739687862340361e-05, + "grad_norm": 3.79392409324646, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.880923867225647, + "num_tokens": 404809937.0, + "step": 10606 + }, + { + "epoch": 1.3493194250095408, + "ewc_loss": 0.007696832530200481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.69683247199282e-05, + "grad_norm": 3.803349733352661, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8662067651748657, + "num_tokens": 404847146.0, + "step": 10607 + }, + { + "epoch": 1.3494466352881314, + "ewc_loss": 0.007756742183119059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.756741979392245e-05, + "grad_norm": 3.86422061920166, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8743560314178467, + "num_tokens": 404882952.0, + "step": 10608 + }, + { + "epoch": 1.3495738455667219, + "ewc_loss": 0.007772548124194145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.772548269713297e-05, + "grad_norm": 3.735902786254883, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8820242285728455, + "num_tokens": 404925347.0, + "step": 10609 + }, + { + "epoch": 1.3497010558453124, + "ewc_loss": 0.0077026402577757835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.702640141360462e-05, + "grad_norm": 3.8616392612457275, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8595061302185059, + "num_tokens": 404963411.0, + "step": 10610 + }, + { + "epoch": 1.349828266123903, + "ewc_loss": 0.007832816801965237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.832817209418863e-05, + "grad_norm": 3.8302736282348633, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8723214268684387, + "num_tokens": 404997522.0, + "step": 10611 + }, + { + "epoch": 1.3499554764024932, + "ewc_loss": 0.00776253454387188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.762534369248897e-05, + "grad_norm": 3.8724989891052246, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8722246289253235, + "num_tokens": 405034049.0, + "step": 10612 + }, + { + "epoch": 1.3500826866810838, + "ewc_loss": 0.007772273384034634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.772273238515481e-05, + "grad_norm": 3.7885656356811523, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8698360919952393, + "num_tokens": 405074985.0, + "step": 10613 + }, + { + "epoch": 1.3502098969596743, + "ewc_loss": 0.007734350394457579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.734350219834596e-05, + "grad_norm": 3.778709888458252, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8871537446975708, + "num_tokens": 405116133.0, + "step": 10614 + }, + { + "epoch": 1.3503371072382648, + "ewc_loss": 0.007752574980258942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.752575038466603e-05, + "grad_norm": 3.8272762298583984, + "learning_rate": 1e-06, + "loss": 0.3772, + 
"mean_token_accuracy": 0.870510458946228, + "num_tokens": 405155572.0, + "step": 10615 + }, + { + "epoch": 1.3504643175168554, + "ewc_loss": 0.007774594239890575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.774594268994406e-05, + "grad_norm": 3.8382487297058105, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.875806450843811, + "num_tokens": 405194700.0, + "step": 10616 + }, + { + "epoch": 1.350591527795446, + "ewc_loss": 0.007757604122161865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.757604180369526e-05, + "grad_norm": 3.848357915878296, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.869408369064331, + "num_tokens": 405233582.0, + "step": 10617 + }, + { + "epoch": 1.3507187380740364, + "ewc_loss": 0.007754758466035128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.754758553346619e-05, + "grad_norm": 3.7793781757354736, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8860055804252625, + "num_tokens": 405274987.0, + "step": 10618 + }, + { + "epoch": 1.350845948352627, + "ewc_loss": 0.007696721702814102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696721877437085e-05, + "grad_norm": 3.8811964988708496, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8717305660247803, + "num_tokens": 405308151.0, + "step": 10619 + }, + { + "epoch": 1.3509731586312175, + "ewc_loss": 0.007793452125042677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.793452095938846e-05, + "grad_norm": 3.8357770442962646, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8757776021957397, + "num_tokens": 405343939.0, + "step": 10620 + }, + { + "epoch": 1.3511003689098078, + "ewc_loss": 0.007719339802861214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.719339919276536e-05, + "grad_norm": 3.806804895401001, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8723176717758179, + "num_tokens": 405384211.0, + "step": 10621 + }, + { + "epoch": 
1.3512275791883983, + "ewc_loss": 0.007725267671048641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.725267641944811e-05, + "grad_norm": 3.887636423110962, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.87968510389328, + "num_tokens": 405419971.0, + "step": 10622 + }, + { + "epoch": 1.3513547894669888, + "ewc_loss": 0.007769967429339886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.769967487547547e-05, + "grad_norm": 3.921393632888794, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.866750955581665, + "num_tokens": 405451322.0, + "step": 10623 + }, + { + "epoch": 1.3514819997455794, + "ewc_loss": 0.007747935131192207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.747935160296038e-05, + "grad_norm": 3.8218727111816406, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8656542301177979, + "num_tokens": 405496101.0, + "step": 10624 + }, + { + "epoch": 1.35160921002417, + "ewc_loss": 0.007706636097282171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706636097282171e-05, + "grad_norm": 3.886493444442749, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8581324219703674, + "num_tokens": 405530034.0, + "step": 10625 + }, + { + "epoch": 1.3517364203027604, + "ewc_loss": 0.00778003316372633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.780033047311008e-05, + "grad_norm": 3.816847562789917, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8842813372612, + "num_tokens": 405570893.0, + "step": 10626 + }, + { + "epoch": 1.351863630581351, + "ewc_loss": 0.007708188612014055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.708188786637038e-05, + "grad_norm": 3.8526549339294434, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8611143827438354, + "num_tokens": 405610120.0, + "step": 10627 + }, + { + "epoch": 1.3519908408599415, + "ewc_loss": 0.007753343321382999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.75334337959066e-05, 
+ "grad_norm": 3.8064510822296143, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8732125759124756, + "num_tokens": 405651878.0, + "step": 10628 + }, + { + "epoch": 1.352118051138532, + "ewc_loss": 0.007706289179623127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706289034103975e-05, + "grad_norm": 3.7902863025665283, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8782687187194824, + "num_tokens": 405694578.0, + "step": 10629 + }, + { + "epoch": 1.3522452614171225, + "ewc_loss": 0.0077131362631917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.713136437814683e-05, + "grad_norm": 3.787078380584717, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8790935277938843, + "num_tokens": 405734128.0, + "step": 10630 + }, + { + "epoch": 1.352372471695713, + "ewc_loss": 0.007722301874309778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.72230196162127e-05, + "grad_norm": 3.83215069770813, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8784987330436707, + "num_tokens": 405776588.0, + "step": 10631 + }, + { + "epoch": 1.3524996819743036, + "ewc_loss": 0.007749457843601704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.749458018224686e-05, + "grad_norm": 3.840296745300293, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8696303367614746, + "num_tokens": 405810867.0, + "step": 10632 + }, + { + "epoch": 1.3526268922528941, + "ewc_loss": 0.00773420836776495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.73420833866112e-05, + "grad_norm": 3.793217182159424, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8908560872077942, + "num_tokens": 405848884.0, + "step": 10633 + }, + { + "epoch": 1.3527541025314846, + "ewc_loss": 0.00768243707716465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.682436989853159e-05, + "grad_norm": 3.7809641361236572, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 
0.8925645351409912, + "num_tokens": 405888759.0, + "step": 10634 + }, + { + "epoch": 1.3528813128100752, + "ewc_loss": 0.007706673815846443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706673932261765e-05, + "grad_norm": 3.835623025894165, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8869234323501587, + "num_tokens": 405925686.0, + "step": 10635 + }, + { + "epoch": 1.3530085230886655, + "ewc_loss": 0.007717440370470285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.717440166743472e-05, + "grad_norm": 3.8474621772766113, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8856584429740906, + "num_tokens": 405960836.0, + "step": 10636 + }, + { + "epoch": 1.353135733367256, + "ewc_loss": 0.0076983533799648285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.698353147134185e-05, + "grad_norm": 3.8172268867492676, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8839817047119141, + "num_tokens": 406000314.0, + "step": 10637 + }, + { + "epoch": 1.3532629436458465, + "ewc_loss": 0.0076858652755618095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.685865421080962e-05, + "grad_norm": 3.7845816612243652, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8844869136810303, + "num_tokens": 406039468.0, + "step": 10638 + }, + { + "epoch": 1.353390153924437, + "ewc_loss": 0.007676681503653526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.67668170738034e-05, + "grad_norm": 3.82515549659729, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8966012001037598, + "num_tokens": 406076828.0, + "step": 10639 + }, + { + "epoch": 1.3535173642030276, + "ewc_loss": 0.007708466146141291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.708466000622138e-05, + "grad_norm": 3.8148577213287354, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8732334971427917, + "num_tokens": 406116504.0, + "step": 10640 + }, + { + "epoch": 1.3536445744816181, + 
"ewc_loss": 0.007667254190891981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.667254249099642e-05, + "grad_norm": 3.8319101333618164, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.875244140625, + "num_tokens": 406155184.0, + "step": 10641 + }, + { + "epoch": 1.3537717847602087, + "ewc_loss": 0.007681102026253939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.681101851630956e-05, + "grad_norm": 3.832523822784424, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8563157320022583, + "num_tokens": 406195411.0, + "step": 10642 + }, + { + "epoch": 1.3538989950387992, + "ewc_loss": 0.007678118534386158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.678118709009141e-05, + "grad_norm": 3.887583017349243, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8646683096885681, + "num_tokens": 406230159.0, + "step": 10643 + }, + { + "epoch": 1.3540262053173897, + "ewc_loss": 0.007705298252403736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.705298048676923e-05, + "grad_norm": 3.8585638999938965, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8634785413742065, + "num_tokens": 406264541.0, + "step": 10644 + }, + { + "epoch": 1.3541534155959802, + "ewc_loss": 0.007689583580940962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.68958343542181e-05, + "grad_norm": 3.803271532058716, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8790002465248108, + "num_tokens": 406301869.0, + "step": 10645 + }, + { + "epoch": 1.3542806258745705, + "ewc_loss": 0.007674184627830982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.674184598727152e-05, + "grad_norm": 3.8717479705810547, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8752840757369995, + "num_tokens": 406338819.0, + "step": 10646 + }, + { + "epoch": 1.354407836153161, + "ewc_loss": 0.007738940417766571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.738940621493384e-05, + "grad_norm": 
3.8451714515686035, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8751268982887268, + "num_tokens": 406375914.0, + "step": 10647 + }, + { + "epoch": 1.3545350464317516, + "ewc_loss": 0.007705119904130697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.705119787715375e-05, + "grad_norm": 3.7818353176116943, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8785600066184998, + "num_tokens": 406418543.0, + "step": 10648 + }, + { + "epoch": 1.3546622567103421, + "ewc_loss": 0.0076922159641981125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.692215876886621e-05, + "grad_norm": 3.829390287399292, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8713666200637817, + "num_tokens": 406455731.0, + "step": 10649 + }, + { + "epoch": 1.3547894669889327, + "ewc_loss": 0.007739661727100611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.73966166889295e-05, + "grad_norm": 3.8310539722442627, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8747899532318115, + "num_tokens": 406496266.0, + "step": 10650 + }, + { + "epoch": 1.3549166772675232, + "ewc_loss": 0.0077345119789242744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.734511746093631e-05, + "grad_norm": 3.800839900970459, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8696180582046509, + "num_tokens": 406541676.0, + "step": 10651 + }, + { + "epoch": 1.3550438875461137, + "ewc_loss": 0.00771650206297636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716502295807004e-05, + "grad_norm": 3.844876527786255, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8703093528747559, + "num_tokens": 406578997.0, + "step": 10652 + }, + { + "epoch": 1.3551710978247042, + "ewc_loss": 0.007748940493911505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.748940697638318e-05, + "grad_norm": 3.8294525146484375, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8785121440887451, + 
"num_tokens": 406619404.0, + "step": 10653 + }, + { + "epoch": 1.3552983081032948, + "ewc_loss": 0.007729181554168463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.72918137954548e-05, + "grad_norm": 3.808875799179077, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.890824556350708, + "num_tokens": 406656092.0, + "step": 10654 + }, + { + "epoch": 1.3554255183818853, + "ewc_loss": 0.007706885691732168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.706885662628338e-05, + "grad_norm": 3.814645767211914, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8581297397613525, + "num_tokens": 406701600.0, + "step": 10655 + }, + { + "epoch": 1.3555527286604758, + "ewc_loss": 0.007716264575719833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.71626437199302e-05, + "grad_norm": 3.8262314796447754, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8731611967086792, + "num_tokens": 406739649.0, + "step": 10656 + }, + { + "epoch": 1.3556799389390664, + "ewc_loss": 0.007722133304923773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.72213315940462e-05, + "grad_norm": 3.830531120300293, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8924081921577454, + "num_tokens": 406774962.0, + "step": 10657 + }, + { + "epoch": 1.3558071492176569, + "ewc_loss": 0.007722191978245974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.722192094661295e-05, + "grad_norm": 3.796776056289673, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8719162940979004, + "num_tokens": 406818047.0, + "step": 10658 + }, + { + "epoch": 1.3559343594962474, + "ewc_loss": 0.007704941555857658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.704941526753828e-05, + "grad_norm": 3.8234524726867676, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8677986264228821, + "num_tokens": 406859464.0, + "step": 10659 + }, + { + "epoch": 1.356061569774838, + "ewc_loss": 0.007722551468759775, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.722551526967436e-05, + "grad_norm": 3.8153865337371826, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8718391060829163, + "num_tokens": 406896444.0, + "step": 10660 + }, + { + "epoch": 1.3561887800534282, + "ewc_loss": 0.007713438477367163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.713438390055671e-05, + "grad_norm": 3.864764928817749, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8696603178977966, + "num_tokens": 406934623.0, + "step": 10661 + }, + { + "epoch": 1.3563159903320188, + "ewc_loss": 0.007730357348918915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.730357174295932e-05, + "grad_norm": 3.801213026046753, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8660391569137573, + "num_tokens": 406974290.0, + "step": 10662 + }, + { + "epoch": 1.3564432006106093, + "ewc_loss": 0.007685729302465916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.685729360673577e-05, + "grad_norm": 3.818889617919922, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8912550806999207, + "num_tokens": 407011279.0, + "step": 10663 + }, + { + "epoch": 1.3565704108891998, + "ewc_loss": 0.007715029641985893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.715029641985893e-05, + "grad_norm": 3.8276777267456055, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8620579242706299, + "num_tokens": 407054733.0, + "step": 10664 + }, + { + "epoch": 1.3566976211677904, + "ewc_loss": 0.007729833945631981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729834032943472e-05, + "grad_norm": 3.7950034141540527, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8736505508422852, + "num_tokens": 407098410.0, + "step": 10665 + }, + { + "epoch": 1.3568248314463809, + "ewc_loss": 0.007696025539189577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696025568293408e-05, + "grad_norm": 3.8658554553985596, + 
"learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8681526780128479, + "num_tokens": 407134231.0, + "step": 10666 + }, + { + "epoch": 1.3569520417249714, + "ewc_loss": 0.00773635832592845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.73635838413611e-05, + "grad_norm": 3.890143632888794, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8608883023262024, + "num_tokens": 407167231.0, + "step": 10667 + }, + { + "epoch": 1.357079252003562, + "ewc_loss": 0.007721011992543936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.721011934336275e-05, + "grad_norm": 3.8044562339782715, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8780059218406677, + "num_tokens": 407206604.0, + "step": 10668 + }, + { + "epoch": 1.3572064622821525, + "ewc_loss": 0.007677516900002956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.677516987314448e-05, + "grad_norm": 3.7810723781585693, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8768646717071533, + "num_tokens": 407248247.0, + "step": 10669 + }, + { + "epoch": 1.3573336725607428, + "ewc_loss": 0.007703088223934174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.703088340349495e-05, + "grad_norm": 3.831479787826538, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8841167688369751, + "num_tokens": 407282870.0, + "step": 10670 + }, + { + "epoch": 1.3574608828393333, + "ewc_loss": 0.0077349599450826645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.734959945082664e-05, + "grad_norm": 3.8572709560394287, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8807540535926819, + "num_tokens": 407318704.0, + "step": 10671 + }, + { + "epoch": 1.3575880931179238, + "ewc_loss": 0.007745499722659588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.74549989728257e-05, + "grad_norm": 3.8188319206237793, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8769826889038086, + "num_tokens": 
407360003.0, + "step": 10672 + }, + { + "epoch": 1.3577153033965144, + "ewc_loss": 0.007709040306508541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.709040073677897e-05, + "grad_norm": 3.832258939743042, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8689584136009216, + "num_tokens": 407399473.0, + "step": 10673 + }, + { + "epoch": 1.357842513675105, + "ewc_loss": 0.0077458275482058525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.745827315375209e-05, + "grad_norm": 3.868649482727051, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8456645011901855, + "num_tokens": 407440576.0, + "step": 10674 + }, + { + "epoch": 1.3579697239536954, + "ewc_loss": 0.007743979338556528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.743979222141206e-05, + "grad_norm": 3.8124654293060303, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.870591402053833, + "num_tokens": 407481057.0, + "step": 10675 + }, + { + "epoch": 1.358096934232286, + "ewc_loss": 0.007711057085543871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.711056969128549e-05, + "grad_norm": 3.8179917335510254, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.886020839214325, + "num_tokens": 407517104.0, + "step": 10676 + }, + { + "epoch": 1.3582241445108765, + "ewc_loss": 0.007742041721940041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.742041634628549e-05, + "grad_norm": 3.848637580871582, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8795136213302612, + "num_tokens": 407552604.0, + "step": 10677 + }, + { + "epoch": 1.358351354789467, + "ewc_loss": 0.007743341848254204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.743341848254204e-05, + "grad_norm": 3.83111310005188, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8862497806549072, + "num_tokens": 407588282.0, + "step": 10678 + }, + { + "epoch": 1.3584785650680575, + "ewc_loss": 0.007740688975900412, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.740689034108073e-05, + "grad_norm": 3.8922839164733887, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8697649836540222, + "num_tokens": 407619837.0, + "step": 10679 + }, + { + "epoch": 1.358605775346648, + "ewc_loss": 0.007776728831231594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.776729034958407e-05, + "grad_norm": 3.8224544525146484, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8700116872787476, + "num_tokens": 407656939.0, + "step": 10680 + }, + { + "epoch": 1.3587329856252386, + "ewc_loss": 0.007732745725661516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.732745871180668e-05, + "grad_norm": 3.8127975463867188, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8643032312393188, + "num_tokens": 407702756.0, + "step": 10681 + }, + { + "epoch": 1.3588601959038291, + "ewc_loss": 0.007732597645372152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.732597441645339e-05, + "grad_norm": 3.8442866802215576, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8829169273376465, + "num_tokens": 407736142.0, + "step": 10682 + }, + { + "epoch": 1.3589874061824196, + "ewc_loss": 0.007773775607347488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.77377572376281e-05, + "grad_norm": 3.829416275024414, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8709698915481567, + "num_tokens": 407773184.0, + "step": 10683 + }, + { + "epoch": 1.3591146164610102, + "ewc_loss": 0.007757932413369417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.757932326057926e-05, + "grad_norm": 3.8669228553771973, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8592431545257568, + "num_tokens": 407806543.0, + "step": 10684 + }, + { + "epoch": 1.3592418267396005, + "ewc_loss": 0.007797843776643276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.797843863954768e-05, + "grad_norm": 3.882154703140259, + 
"learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8730950355529785, + "num_tokens": 407841371.0, + "step": 10685 + }, + { + "epoch": 1.359369037018191, + "ewc_loss": 0.007786507718265057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78650792199187e-05, + "grad_norm": 3.7744088172912598, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8767284154891968, + "num_tokens": 407879862.0, + "step": 10686 + }, + { + "epoch": 1.3594962472967815, + "ewc_loss": 0.007723448798060417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.723448652541265e-05, + "grad_norm": 3.8502793312072754, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8678973913192749, + "num_tokens": 407917947.0, + "step": 10687 + }, + { + "epoch": 1.359623457575372, + "ewc_loss": 0.007801668718457222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.801668834872544e-05, + "grad_norm": 3.8008573055267334, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8812241554260254, + "num_tokens": 407953987.0, + "step": 10688 + }, + { + "epoch": 1.3597506678539626, + "ewc_loss": 0.007749978918582201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.749978976789862e-05, + "grad_norm": 3.8108913898468018, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8583571910858154, + "num_tokens": 407993166.0, + "step": 10689 + }, + { + "epoch": 1.3598778781325531, + "ewc_loss": 0.007762589026242495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.762588938931003e-05, + "grad_norm": 3.735903263092041, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8822482228279114, + "num_tokens": 408037063.0, + "step": 10690 + }, + { + "epoch": 1.3600050884111436, + "ewc_loss": 0.007726975250989199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.72697530919686e-05, + "grad_norm": 3.8029232025146484, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.876835286617279, + "num_tokens": 408078526.0, 
+ "step": 10691 + }, + { + "epoch": 1.3601322986897342, + "ewc_loss": 0.007793901022523642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.793901022523642e-05, + "grad_norm": 3.807246685028076, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8802733421325684, + "num_tokens": 408116002.0, + "step": 10692 + }, + { + "epoch": 1.3602595089683247, + "ewc_loss": 0.007742138579487801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.742138404864818e-05, + "grad_norm": 3.7997193336486816, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8695828318595886, + "num_tokens": 408155885.0, + "step": 10693 + }, + { + "epoch": 1.3603867192469152, + "ewc_loss": 0.007745983544737101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.745983748463914e-05, + "grad_norm": 3.8684163093566895, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.878912091255188, + "num_tokens": 408190085.0, + "step": 10694 + }, + { + "epoch": 1.3605139295255055, + "ewc_loss": 0.0077778990380465984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.777899008942768e-05, + "grad_norm": 3.8205604553222656, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8751709461212158, + "num_tokens": 408227289.0, + "step": 10695 + }, + { + "epoch": 1.360641139804096, + "ewc_loss": 0.0077275545336306095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.727554475422949e-05, + "grad_norm": 3.781247138977051, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8821091651916504, + "num_tokens": 408271853.0, + "step": 10696 + }, + { + "epoch": 1.3607683500826866, + "ewc_loss": 0.007710487116128206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.710487261647359e-05, + "grad_norm": 3.7892446517944336, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8854098916053772, + "num_tokens": 408311133.0, + "step": 10697 + }, + { + "epoch": 1.3608955603612771, + "ewc_loss": 0.007736838422715664, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 7.736838597338647e-05, + "grad_norm": 3.925513744354248, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8773108720779419, + "num_tokens": 408342775.0, + "step": 10698 + }, + { + "epoch": 1.3610227706398677, + "ewc_loss": 0.007804723456501961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.804723281878978e-05, + "grad_norm": 3.785050630569458, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8862345218658447, + "num_tokens": 408379063.0, + "step": 10699 + }, + { + "epoch": 1.3611499809184582, + "ewc_loss": 0.007664103992283344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6641037594527e-05, + "grad_norm": 3.8925654888153076, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8614133596420288, + "num_tokens": 408415856.0, + "step": 10700 + }, + { + "epoch": 1.3612771911970487, + "ewc_loss": 0.007803656626492739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.803656626492739e-05, + "grad_norm": 3.8208749294281006, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8791711330413818, + "num_tokens": 408451522.0, + "step": 10701 + }, + { + "epoch": 1.3614044014756392, + "ewc_loss": 0.007705457508563995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.705457392148674e-05, + "grad_norm": 3.7584376335144043, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8793489933013916, + "num_tokens": 408494605.0, + "step": 10702 + }, + { + "epoch": 1.3615316117542298, + "ewc_loss": 0.007702816277742386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.702816219534725e-05, + "grad_norm": 3.826960802078247, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8740290403366089, + "num_tokens": 408536373.0, + "step": 10703 + }, + { + "epoch": 1.3616588220328203, + "ewc_loss": 0.00776601443067193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.766014459775761e-05, + "grad_norm": 3.8435609340667725, + "learning_rate": 1e-06, + 
"loss": 0.403, + "mean_token_accuracy": 0.8628741502761841, + "num_tokens": 408574724.0, + "step": 10704 + }, + { + "epoch": 1.3617860323114108, + "ewc_loss": 0.00773898558691144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.738985732430592e-05, + "grad_norm": 3.789473056793213, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8670953512191772, + "num_tokens": 408615057.0, + "step": 10705 + }, + { + "epoch": 1.3619132425900013, + "ewc_loss": 0.00769423134624958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.69423131714575e-05, + "grad_norm": 3.854335308074951, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8622211813926697, + "num_tokens": 408651324.0, + "step": 10706 + }, + { + "epoch": 1.3620404528685919, + "ewc_loss": 0.007771779783070087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.77177992858924e-05, + "grad_norm": 3.790864944458008, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8806841969490051, + "num_tokens": 408690533.0, + "step": 10707 + }, + { + "epoch": 1.3621676631471824, + "ewc_loss": 0.007708979770541191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.7089796832297e-05, + "grad_norm": 3.8572816848754883, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8670538663864136, + "num_tokens": 408725970.0, + "step": 10708 + }, + { + "epoch": 1.362294873425773, + "ewc_loss": 0.007774623576551676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.774623372824863e-05, + "grad_norm": 3.775308847427368, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8829834461212158, + "num_tokens": 408766550.0, + "step": 10709 + }, + { + "epoch": 1.3624220837043632, + "ewc_loss": 0.007709513884037733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.709513738518581e-05, + "grad_norm": 3.8475048542022705, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8794476389884949, + "num_tokens": 408804320.0, + "step": 10710 + }, + { + 
"epoch": 1.3625492939829538, + "ewc_loss": 0.007788956165313721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.788956281729043e-05, + "grad_norm": 3.824425458908081, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8762054443359375, + "num_tokens": 408840902.0, + "step": 10711 + }, + { + "epoch": 1.3626765042615443, + "ewc_loss": 0.007725795730948448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.7257958764676e-05, + "grad_norm": 3.7987771034240723, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8735480904579163, + "num_tokens": 408879346.0, + "step": 10712 + }, + { + "epoch": 1.3628037145401348, + "ewc_loss": 0.007734562736004591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.73456267779693e-05, + "grad_norm": 3.7804603576660156, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8857517242431641, + "num_tokens": 408919336.0, + "step": 10713 + }, + { + "epoch": 1.3629309248187254, + "ewc_loss": 0.007727852091193199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.727852062089369e-05, + "grad_norm": 3.8327910900115967, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8693838715553284, + "num_tokens": 408961547.0, + "step": 10714 + }, + { + "epoch": 1.3630581350973159, + "ewc_loss": 0.00776956370100379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76956367189996e-05, + "grad_norm": 3.780395746231079, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8903071880340576, + "num_tokens": 409003573.0, + "step": 10715 + }, + { + "epoch": 1.3631853453759064, + "ewc_loss": 0.007705838419497013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.705838652327657e-05, + "grad_norm": 3.839862108230591, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8663010001182556, + "num_tokens": 409045560.0, + "step": 10716 + }, + { + "epoch": 1.363312555654497, + "ewc_loss": 0.007762361783534288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.762361929053441e-05, + "grad_norm": 3.833679676055908, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8786321878433228, + "num_tokens": 409087245.0, + "step": 10717 + }, + { + "epoch": 1.3634397659330875, + "ewc_loss": 0.007716738618910313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716738764429465e-05, + "grad_norm": 3.7653110027313232, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8817524313926697, + "num_tokens": 409134177.0, + "step": 10718 + }, + { + "epoch": 1.3635669762116778, + "ewc_loss": 0.007687833625823259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.687833567615598e-05, + "grad_norm": 3.842576026916504, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.861621081829071, + "num_tokens": 409169495.0, + "step": 10719 + }, + { + "epoch": 1.3636941864902683, + "ewc_loss": 0.007763035129755735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.763034955132753e-05, + "grad_norm": 3.8438665866851807, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8494948744773865, + "num_tokens": 409207781.0, + "step": 10720 + }, + { + "epoch": 1.3638213967688588, + "ewc_loss": 0.0077230799943208694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.723079761490226e-05, + "grad_norm": 3.8399136066436768, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8747670650482178, + "num_tokens": 409245411.0, + "step": 10721 + }, + { + "epoch": 1.3639486070474494, + "ewc_loss": 0.007743891794234514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.743891910649836e-05, + "grad_norm": 3.785985231399536, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8835368156433105, + "num_tokens": 409281757.0, + "step": 10722 + }, + { + "epoch": 1.3640758173260399, + "ewc_loss": 0.007705908268690109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.705908501520753e-05, + "grad_norm": 3.8052971363067627, + "learning_rate": 1e-06, + "loss": 0.3838, + 
"mean_token_accuracy": 0.8631331920623779, + "num_tokens": 409321317.0, + "step": 10723 + }, + { + "epoch": 1.3642030276046304, + "ewc_loss": 0.007765374146401882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.765374175505713e-05, + "grad_norm": 3.8115341663360596, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8803594708442688, + "num_tokens": 409358919.0, + "step": 10724 + }, + { + "epoch": 1.364330237883221, + "ewc_loss": 0.007741707842797041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.741707668174058e-05, + "grad_norm": 3.8641164302825928, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8658367991447449, + "num_tokens": 409394355.0, + "step": 10725 + }, + { + "epoch": 1.3644574481618115, + "ewc_loss": 0.007764777634292841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76477754698135e-05, + "grad_norm": 3.771003007888794, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8839104175567627, + "num_tokens": 409433781.0, + "step": 10726 + }, + { + "epoch": 1.364584658440402, + "ewc_loss": 0.00770548777654767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.705487951170653e-05, + "grad_norm": 3.838745594024658, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.86485755443573, + "num_tokens": 409471728.0, + "step": 10727 + }, + { + "epoch": 1.3647118687189925, + "ewc_loss": 0.0077764117158949375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.776411803206429e-05, + "grad_norm": 3.831876277923584, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8923564553260803, + "num_tokens": 409502294.0, + "step": 10728 + }, + { + "epoch": 1.364839078997583, + "ewc_loss": 0.0077492366544902325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.749236829113215e-05, + "grad_norm": 3.7994465827941895, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8765888214111328, + "num_tokens": 409545878.0, + "step": 10729 + }, + { + "epoch": 
1.3649662892761736, + "ewc_loss": 0.00775176240131259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.751762314001098e-05, + "grad_norm": 3.7804932594299316, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8770248293876648, + "num_tokens": 409584753.0, + "step": 10730 + }, + { + "epoch": 1.3650934995547641, + "ewc_loss": 0.007749589160084724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.749588985461742e-05, + "grad_norm": 3.9043896198272705, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8789148330688477, + "num_tokens": 409613725.0, + "step": 10731 + }, + { + "epoch": 1.3652207098333546, + "ewc_loss": 0.007837397046387196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.837397424736992e-05, + "grad_norm": 3.855980396270752, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8676178455352783, + "num_tokens": 409652443.0, + "step": 10732 + }, + { + "epoch": 1.3653479201119452, + "ewc_loss": 0.007752072066068649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.752072269795462e-05, + "grad_norm": 3.824998617172241, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8793116807937622, + "num_tokens": 409690683.0, + "step": 10733 + }, + { + "epoch": 1.3654751303905355, + "ewc_loss": 0.007754966616630554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.754966645734385e-05, + "grad_norm": 3.8321034908294678, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8743117451667786, + "num_tokens": 409727502.0, + "step": 10734 + }, + { + "epoch": 1.365602340669126, + "ewc_loss": 0.007776323240250349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.776323036523536e-05, + "grad_norm": 3.8337113857269287, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8690177798271179, + "num_tokens": 409764592.0, + "step": 10735 + }, + { + "epoch": 1.3657295509477165, + "ewc_loss": 0.007784326560795307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.784326589899138e-05, + "grad_norm": 3.850339651107788, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8528158664703369, + "num_tokens": 409809645.0, + "step": 10736 + }, + { + "epoch": 1.365856761226307, + "ewc_loss": 0.007760165259242058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.760165317449719e-05, + "grad_norm": 3.804048776626587, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8739182949066162, + "num_tokens": 409848048.0, + "step": 10737 + }, + { + "epoch": 1.3659839715048976, + "ewc_loss": 0.007750797551125288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.750797522021458e-05, + "grad_norm": 3.8535478115081787, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8694410920143127, + "num_tokens": 409884176.0, + "step": 10738 + }, + { + "epoch": 1.3661111817834881, + "ewc_loss": 0.007777273189276457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.777273276587948e-05, + "grad_norm": 3.8215386867523193, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8771690130233765, + "num_tokens": 409922630.0, + "step": 10739 + }, + { + "epoch": 1.3662383920620786, + "ewc_loss": 0.0077527533285319805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.75275329942815e-05, + "grad_norm": 3.8541243076324463, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8800129890441895, + "num_tokens": 409961876.0, + "step": 10740 + }, + { + "epoch": 1.3663656023406692, + "ewc_loss": 0.007778091821819544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.778091821819544e-05, + "grad_norm": 3.8405184745788574, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8827134370803833, + "num_tokens": 409995432.0, + "step": 10741 + }, + { + "epoch": 1.3664928126192597, + "ewc_loss": 0.007760378532111645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.760378503007814e-05, + "grad_norm": 3.8261098861694336, + "learning_rate": 1e-06, + "loss": 0.3654, + 
"mean_token_accuracy": 0.8732777833938599, + "num_tokens": 410034876.0, + "step": 10742 + }, + { + "epoch": 1.3666200228978502, + "ewc_loss": 0.007754556369036436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.754556281724945e-05, + "grad_norm": 3.803095579147339, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8769876956939697, + "num_tokens": 410079190.0, + "step": 10743 + }, + { + "epoch": 1.3667472331764405, + "ewc_loss": 0.00774498051032424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.744980393908918e-05, + "grad_norm": 3.8401646614074707, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8789369463920593, + "num_tokens": 410112475.0, + "step": 10744 + }, + { + "epoch": 1.366874443455031, + "ewc_loss": 0.007776892744004726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.776892744004726e-05, + "grad_norm": 3.777172327041626, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8875638246536255, + "num_tokens": 410154583.0, + "step": 10745 + }, + { + "epoch": 1.3670016537336216, + "ewc_loss": 0.007708081975579262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70808183006011e-05, + "grad_norm": 3.8195133209228516, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8747936487197876, + "num_tokens": 410191631.0, + "step": 10746 + }, + { + "epoch": 1.3671288640122121, + "ewc_loss": 0.007774916477501392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.774916593916714e-05, + "grad_norm": 3.842256784439087, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8529499769210815, + "num_tokens": 410232435.0, + "step": 10747 + }, + { + "epoch": 1.3672560742908026, + "ewc_loss": 0.007747666910290718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.747666677460074e-05, + "grad_norm": 3.851829767227173, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8789067268371582, + "num_tokens": 410272940.0, + "step": 10748 + }, + { + "epoch": 
1.3673832845693932, + "ewc_loss": 0.007756401319056749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.756401464575902e-05, + "grad_norm": 3.839261531829834, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8695817589759827, + "num_tokens": 410312807.0, + "step": 10749 + }, + { + "epoch": 1.3675104948479837, + "ewc_loss": 0.007732587866485119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.732587982900441e-05, + "grad_norm": 3.780761480331421, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8793540000915527, + "num_tokens": 410352433.0, + "step": 10750 + }, + { + "epoch": 1.3676377051265742, + "ewc_loss": 0.007696579210460186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696579268667847e-05, + "grad_norm": 3.815065622329712, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8744126558303833, + "num_tokens": 410394843.0, + "step": 10751 + }, + { + "epoch": 1.3677649154051648, + "ewc_loss": 0.007743654772639275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.743654714431614e-05, + "grad_norm": 3.7724990844726562, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8770336508750916, + "num_tokens": 410439353.0, + "step": 10752 + }, + { + "epoch": 1.3678921256837553, + "ewc_loss": 0.007692247163504362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.692247163504362e-05, + "grad_norm": 3.808582305908203, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8526758551597595, + "num_tokens": 410485789.0, + "step": 10753 + }, + { + "epoch": 1.3680193359623458, + "ewc_loss": 0.007745805662125349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.745805487502366e-05, + "grad_norm": 3.840078830718994, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8595991134643555, + "num_tokens": 410523385.0, + "step": 10754 + }, + { + "epoch": 1.3681465462409363, + "ewc_loss": 0.007727441843599081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.727441698079929e-05, + "grad_norm": 3.9333066940307617, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8690006732940674, + "num_tokens": 410553340.0, + "step": 10755 + }, + { + "epoch": 1.3682737565195269, + "ewc_loss": 0.007785038091242313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.785038178553805e-05, + "grad_norm": 3.8175528049468994, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8829988241195679, + "num_tokens": 410590041.0, + "step": 10756 + }, + { + "epoch": 1.3684009667981174, + "ewc_loss": 0.007678865920752287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.678865949856117e-05, + "grad_norm": 3.8316829204559326, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8751133680343628, + "num_tokens": 410627703.0, + "step": 10757 + }, + { + "epoch": 1.368528177076708, + "ewc_loss": 0.007740116212517023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.740116416243836e-05, + "grad_norm": 3.8406360149383545, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8667935729026794, + "num_tokens": 410662555.0, + "step": 10758 + }, + { + "epoch": 1.3686553873552982, + "ewc_loss": 0.0077387308701872826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.738731073914096e-05, + "grad_norm": 3.8078107833862305, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8566204905509949, + "num_tokens": 410706026.0, + "step": 10759 + }, + { + "epoch": 1.3687825976338888, + "ewc_loss": 0.007705133408308029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.705133612034842e-05, + "grad_norm": 3.810296058654785, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8722152709960938, + "num_tokens": 410743275.0, + "step": 10760 + }, + { + "epoch": 1.3689098079124793, + "ewc_loss": 0.0077334376983344555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.733437814749777e-05, + "grad_norm": 3.81585431098938, + "learning_rate": 1e-06, + "loss": 0.3318, + 
"mean_token_accuracy": 0.8835881948471069, + "num_tokens": 410781220.0, + "step": 10761 + }, + { + "epoch": 1.3690370181910698, + "ewc_loss": 0.007751052733510733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.751052908133715e-05, + "grad_norm": 3.818079710006714, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.882601261138916, + "num_tokens": 410820014.0, + "step": 10762 + }, + { + "epoch": 1.3691642284696603, + "ewc_loss": 0.007739664521068335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.739664579275995e-05, + "grad_norm": 3.7924551963806152, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.86979740858078, + "num_tokens": 410858721.0, + "step": 10763 + }, + { + "epoch": 1.3692914387482509, + "ewc_loss": 0.0077315643429756165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.731564255664125e-05, + "grad_norm": 3.790245771408081, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8794761300086975, + "num_tokens": 410898527.0, + "step": 10764 + }, + { + "epoch": 1.3694186490268414, + "ewc_loss": 0.007746046409010887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.746046321699396e-05, + "grad_norm": 3.821120500564575, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8744842410087585, + "num_tokens": 410938426.0, + "step": 10765 + }, + { + "epoch": 1.369545859305432, + "ewc_loss": 0.007736070081591606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.736070256214589e-05, + "grad_norm": 3.858812093734741, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8807032108306885, + "num_tokens": 410969940.0, + "step": 10766 + }, + { + "epoch": 1.3696730695840225, + "ewc_loss": 0.007748580537736416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.748580537736416e-05, + "grad_norm": 3.8530385494232178, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8583596348762512, + "num_tokens": 411008859.0, + "step": 10767 + }, + { + "epoch": 
1.3698002798626128, + "ewc_loss": 0.007752273231744766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.752273086225614e-05, + "grad_norm": 3.8398056030273438, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8863335251808167, + "num_tokens": 411043516.0, + "step": 10768 + }, + { + "epoch": 1.3699274901412033, + "ewc_loss": 0.007748795673251152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.748795906081796e-05, + "grad_norm": 3.831825017929077, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8688114881515503, + "num_tokens": 411079092.0, + "step": 10769 + }, + { + "epoch": 1.3700547004197938, + "ewc_loss": 0.00774511368945241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.745113543933257e-05, + "grad_norm": 3.8179476261138916, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8795607686042786, + "num_tokens": 411117360.0, + "step": 10770 + }, + { + "epoch": 1.3701819106983844, + "ewc_loss": 0.007757721468806267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.757721323287115e-05, + "grad_norm": 3.871415376663208, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8649328947067261, + "num_tokens": 411153615.0, + "step": 10771 + }, + { + "epoch": 1.3703091209769749, + "ewc_loss": 0.0077766599133610725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.776659913361073e-05, + "grad_norm": 3.818171977996826, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8748297691345215, + "num_tokens": 411193571.0, + "step": 10772 + }, + { + "epoch": 1.3704363312555654, + "ewc_loss": 0.00774254510179162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.742545130895451e-05, + "grad_norm": 3.862363576889038, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8573567867279053, + "num_tokens": 411230427.0, + "step": 10773 + }, + { + "epoch": 1.370563541534156, + "ewc_loss": 0.007791244424879551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.791244570398703e-05, + "grad_norm": 3.7873826026916504, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8721003532409668, + "num_tokens": 411271251.0, + "step": 10774 + }, + { + "epoch": 1.3706907518127465, + "ewc_loss": 0.00773789593949914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.737895793979988e-05, + "grad_norm": 3.8783373832702637, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8771615028381348, + "num_tokens": 411304906.0, + "step": 10775 + }, + { + "epoch": 1.370817962091337, + "ewc_loss": 0.007806351408362389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.806351641193032e-05, + "grad_norm": 3.787842273712158, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8915462493896484, + "num_tokens": 411341417.0, + "step": 10776 + }, + { + "epoch": 1.3709451723699275, + "ewc_loss": 0.007752152159810066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.752152305329219e-05, + "grad_norm": 3.821038007736206, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8734195828437805, + "num_tokens": 411382118.0, + "step": 10777 + }, + { + "epoch": 1.371072382648518, + "ewc_loss": 0.007817932404577732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.817932782927528e-05, + "grad_norm": 3.7681877613067627, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8631755113601685, + "num_tokens": 411426702.0, + "step": 10778 + }, + { + "epoch": 1.3711995929271086, + "ewc_loss": 0.007769346237182617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.769346120767295e-05, + "grad_norm": 3.8796186447143555, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8737630844116211, + "num_tokens": 411461689.0, + "step": 10779 + }, + { + "epoch": 1.371326803205699, + "ewc_loss": 0.00784255564212799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.842556078685448e-05, + "grad_norm": 3.808218240737915, + "learning_rate": 1e-06, + "loss": 0.3641, + 
"mean_token_accuracy": 0.8746751546859741, + "num_tokens": 411499613.0, + "step": 10780 + }, + { + "epoch": 1.3714540134842896, + "ewc_loss": 0.0077582430094480515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.758243009448051e-05, + "grad_norm": 3.858201503753662, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8727763891220093, + "num_tokens": 411538422.0, + "step": 10781 + }, + { + "epoch": 1.3715812237628802, + "ewc_loss": 0.0078120785765349865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.812078547431156e-05, + "grad_norm": 3.827620506286621, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8879575729370117, + "num_tokens": 411572401.0, + "step": 10782 + }, + { + "epoch": 1.3717084340414705, + "ewc_loss": 0.007765043061226606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.765043119434267e-05, + "grad_norm": 3.821099042892456, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8797654509544373, + "num_tokens": 411611268.0, + "step": 10783 + }, + { + "epoch": 1.371835644320061, + "ewc_loss": 0.007790368515998125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.790368545101956e-05, + "grad_norm": 3.7907633781433105, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8754305243492126, + "num_tokens": 411648837.0, + "step": 10784 + }, + { + "epoch": 1.3719628545986515, + "ewc_loss": 0.007764662615954876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.764662586851045e-05, + "grad_norm": 3.801865816116333, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8786364793777466, + "num_tokens": 411690327.0, + "step": 10785 + }, + { + "epoch": 1.372090064877242, + "ewc_loss": 0.007774509955197573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.774509867886081e-05, + "grad_norm": 3.8482367992401123, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8731343746185303, + "num_tokens": 411729665.0, + "step": 10786 + }, + { + "epoch": 
1.3722172751558326, + "ewc_loss": 0.007785307243466377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78530738898553e-05, + "grad_norm": 3.8460326194763184, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8882226347923279, + "num_tokens": 411764716.0, + "step": 10787 + }, + { + "epoch": 1.3723444854344231, + "ewc_loss": 0.007769958581775427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76995875639841e-05, + "grad_norm": 3.8069939613342285, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8747143745422363, + "num_tokens": 411805560.0, + "step": 10788 + }, + { + "epoch": 1.3724716957130136, + "ewc_loss": 0.007744200993329287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.744201138848439e-05, + "grad_norm": 3.861658811569214, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8771871328353882, + "num_tokens": 411841151.0, + "step": 10789 + }, + { + "epoch": 1.3725989059916042, + "ewc_loss": 0.007795229088515043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.79522888478823e-05, + "grad_norm": 3.8530492782592773, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8804919123649597, + "num_tokens": 411875088.0, + "step": 10790 + }, + { + "epoch": 1.3727261162701947, + "ewc_loss": 0.007750176824629307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.750176882836968e-05, + "grad_norm": 3.81000018119812, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8748653531074524, + "num_tokens": 411914554.0, + "step": 10791 + }, + { + "epoch": 1.3728533265487852, + "ewc_loss": 0.007721925154328346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.721925067016855e-05, + "grad_norm": 3.8025848865509033, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8758753538131714, + "num_tokens": 411952237.0, + "step": 10792 + }, + { + "epoch": 1.3729805368273755, + "ewc_loss": 0.0077398051507771015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.739805005257949e-05, + "grad_norm": 3.8418922424316406, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.868539035320282, + "num_tokens": 411989676.0, + "step": 10793 + }, + { + "epoch": 1.373107747105966, + "ewc_loss": 0.007762599270790815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.762599125271663e-05, + "grad_norm": 3.8101515769958496, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8868327140808105, + "num_tokens": 412025804.0, + "step": 10794 + }, + { + "epoch": 1.3732349573845566, + "ewc_loss": 0.007731241174042225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.731241203146055e-05, + "grad_norm": 3.8036439418792725, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8906217813491821, + "num_tokens": 412060085.0, + "step": 10795 + }, + { + "epoch": 1.3733621676631471, + "ewc_loss": 0.00774922501295805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.749225187581033e-05, + "grad_norm": 3.831096887588501, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8809425830841064, + "num_tokens": 412100750.0, + "step": 10796 + }, + { + "epoch": 1.3734893779417376, + "ewc_loss": 0.007747508119791746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.747508061584085e-05, + "grad_norm": 3.8452794551849365, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8777885437011719, + "num_tokens": 412145829.0, + "step": 10797 + }, + { + "epoch": 1.3736165882203282, + "ewc_loss": 0.0077348011545836926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.734801329206675e-05, + "grad_norm": 3.7763071060180664, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8795322179794312, + "num_tokens": 412187607.0, + "step": 10798 + }, + { + "epoch": 1.3737437984989187, + "ewc_loss": 0.0076952287927269936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.695228850934654e-05, + "grad_norm": 3.8402178287506104, + "learning_rate": 1e-06, + "loss": 0.4051, + 
"mean_token_accuracy": 0.8614580631256104, + "num_tokens": 412227214.0, + "step": 10799 + }, + { + "epoch": 1.3738710087775092, + "ewc_loss": 0.007759848143905401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.75984808569774e-05, + "grad_norm": 3.815638780593872, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8796975612640381, + "num_tokens": 412261929.0, + "step": 10800 + }, + { + "epoch": 1.3739982190560998, + "ewc_loss": 0.007715943269431591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.715943502262235e-05, + "grad_norm": 3.8225576877593994, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8713431358337402, + "num_tokens": 412299441.0, + "step": 10801 + }, + { + "epoch": 1.3741254293346903, + "ewc_loss": 0.007721809204667807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.72180937929079e-05, + "grad_norm": 3.803821325302124, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.874770998954773, + "num_tokens": 412339248.0, + "step": 10802 + }, + { + "epoch": 1.3742526396132808, + "ewc_loss": 0.007726309355348349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.726309559075162e-05, + "grad_norm": 3.842054605484009, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8734458684921265, + "num_tokens": 412377224.0, + "step": 10803 + }, + { + "epoch": 1.3743798498918713, + "ewc_loss": 0.007743495982140303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.743496098555624e-05, + "grad_norm": 3.8667635917663574, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8527549505233765, + "num_tokens": 412415331.0, + "step": 10804 + }, + { + "epoch": 1.3745070601704619, + "ewc_loss": 0.007767168339341879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76716842665337e-05, + "grad_norm": 3.820798397064209, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8731200098991394, + "num_tokens": 412455604.0, + "step": 10805 + }, + { + "epoch": 
1.3746342704490524, + "ewc_loss": 0.007729264907538891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729265053058043e-05, + "grad_norm": 3.8563241958618164, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8824180960655212, + "num_tokens": 412489295.0, + "step": 10806 + }, + { + "epoch": 1.374761480727643, + "ewc_loss": 0.007772489916533232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.772490062052384e-05, + "grad_norm": 3.8559751510620117, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8702422380447388, + "num_tokens": 412525750.0, + "step": 10807 + }, + { + "epoch": 1.3748886910062332, + "ewc_loss": 0.0077431825920939445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.743182504782453e-05, + "grad_norm": 3.760870933532715, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8825693130493164, + "num_tokens": 412567163.0, + "step": 10808 + }, + { + "epoch": 1.3750159012848238, + "ewc_loss": 0.007713364902883768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.713364902883768e-05, + "grad_norm": 3.857192039489746, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8816014528274536, + "num_tokens": 412598789.0, + "step": 10809 + }, + { + "epoch": 1.3751431115634143, + "ewc_loss": 0.00779314711689949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.793147233314812e-05, + "grad_norm": 3.8554959297180176, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8765398859977722, + "num_tokens": 412632588.0, + "step": 10810 + }, + { + "epoch": 1.3752703218420048, + "ewc_loss": 0.007764547597616911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.764547626720741e-05, + "grad_norm": 3.7552363872528076, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8722661137580872, + "num_tokens": 412676369.0, + "step": 10811 + }, + { + "epoch": 1.3753975321205953, + "ewc_loss": 0.007703311741352081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.703311712248251e-05, + "grad_norm": 3.7650363445281982, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8899452686309814, + "num_tokens": 412712674.0, + "step": 10812 + }, + { + "epoch": 1.3755247423991859, + "ewc_loss": 0.0077536641620099545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.753664249321446e-05, + "grad_norm": 3.797534465789795, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8689302206039429, + "num_tokens": 412753531.0, + "step": 10813 + }, + { + "epoch": 1.3756519526777764, + "ewc_loss": 0.007773939985781908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.773940160404891e-05, + "grad_norm": 3.768720865249634, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.872555136680603, + "num_tokens": 412799450.0, + "step": 10814 + }, + { + "epoch": 1.375779162956367, + "ewc_loss": 0.007734162732958794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.73416250012815e-05, + "grad_norm": 3.7837064266204834, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8963108062744141, + "num_tokens": 412836858.0, + "step": 10815 + }, + { + "epoch": 1.3759063732349575, + "ewc_loss": 0.007753934245556593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.753934187348932e-05, + "grad_norm": 3.8632030487060547, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8774338364601135, + "num_tokens": 412871169.0, + "step": 10816 + }, + { + "epoch": 1.3760335835135478, + "ewc_loss": 0.007775646168738604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.775646372465417e-05, + "grad_norm": 3.785775661468506, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8797615766525269, + "num_tokens": 412909281.0, + "step": 10817 + }, + { + "epoch": 1.3761607937921383, + "ewc_loss": 0.007709730416536331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.709730562055483e-05, + "grad_norm": 3.8203907012939453, + "learning_rate": 1e-06, + "loss": 0.3578, + 
"mean_token_accuracy": 0.8770159482955933, + "num_tokens": 412950126.0, + "step": 10818 + }, + { + "epoch": 1.3762880040707288, + "ewc_loss": 0.0077540017664432526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.754001853754744e-05, + "grad_norm": 3.873690366744995, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8608838319778442, + "num_tokens": 412988644.0, + "step": 10819 + }, + { + "epoch": 1.3764152143493193, + "ewc_loss": 0.007772174198180437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.772174285491928e-05, + "grad_norm": 3.8586254119873047, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8628796339035034, + "num_tokens": 413028403.0, + "step": 10820 + }, + { + "epoch": 1.3765424246279099, + "ewc_loss": 0.007735487539321184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.735487452009693e-05, + "grad_norm": 3.834902763366699, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8652400970458984, + "num_tokens": 413067429.0, + "step": 10821 + }, + { + "epoch": 1.3766696349065004, + "ewc_loss": 0.0077180382795631886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.718038250459358e-05, + "grad_norm": 3.869536876678467, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8720748424530029, + "num_tokens": 413101630.0, + "step": 10822 + }, + { + "epoch": 1.376796845185091, + "ewc_loss": 0.00775592727586627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.755927072139457e-05, + "grad_norm": 3.7995359897613525, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8688351511955261, + "num_tokens": 413147757.0, + "step": 10823 + }, + { + "epoch": 1.3769240554636815, + "ewc_loss": 0.007710957899689674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.710958016104996e-05, + "grad_norm": 3.840064764022827, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8776316046714783, + "num_tokens": 413186803.0, + "step": 10824 + }, + { + "epoch": 
1.377051265742272, + "ewc_loss": 0.007763691246509552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.763691246509552e-05, + "grad_norm": 3.876073122024536, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8703812956809998, + "num_tokens": 413227201.0, + "step": 10825 + }, + { + "epoch": 1.3771784760208625, + "ewc_loss": 0.007763200905174017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.763200846966356e-05, + "grad_norm": 3.783942937850952, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8745691776275635, + "num_tokens": 413270594.0, + "step": 10826 + }, + { + "epoch": 1.377305686299453, + "ewc_loss": 0.007691273000091314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.691272912779823e-05, + "grad_norm": 3.8653621673583984, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.857867419719696, + "num_tokens": 413307345.0, + "step": 10827 + }, + { + "epoch": 1.3774328965780436, + "ewc_loss": 0.007789121475070715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.789121445966884e-05, + "grad_norm": 3.8759281635284424, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8800056576728821, + "num_tokens": 413344310.0, + "step": 10828 + }, + { + "epoch": 1.377560106856634, + "ewc_loss": 0.007760872598737478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.760872540529817e-05, + "grad_norm": 3.797991991043091, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8944495916366577, + "num_tokens": 413382047.0, + "step": 10829 + }, + { + "epoch": 1.3776873171352246, + "ewc_loss": 0.0077001540921628475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.700153946643695e-05, + "grad_norm": 3.7774224281311035, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.872247040271759, + "num_tokens": 413424658.0, + "step": 10830 + }, + { + "epoch": 1.3778145274138152, + "ewc_loss": 0.007728815544396639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.728815398877487e-05, + "grad_norm": 3.862360715866089, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8821806311607361, + "num_tokens": 413458386.0, + "step": 10831 + }, + { + "epoch": 1.3779417376924055, + "ewc_loss": 0.007766095921397209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76609595050104e-05, + "grad_norm": 3.8951845169067383, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8618839979171753, + "num_tokens": 413492530.0, + "step": 10832 + }, + { + "epoch": 1.378068947970996, + "ewc_loss": 0.007766374386847019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.766374619677663e-05, + "grad_norm": 3.7991838455200195, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8852612972259521, + "num_tokens": 413528894.0, + "step": 10833 + }, + { + "epoch": 1.3781961582495865, + "ewc_loss": 0.007715579587966204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.715579704381526e-05, + "grad_norm": 3.846686601638794, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8719990253448486, + "num_tokens": 413564765.0, + "step": 10834 + }, + { + "epoch": 1.378323368528177, + "ewc_loss": 0.007769609335809946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.769609510432929e-05, + "grad_norm": 3.899129867553711, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8475046157836914, + "num_tokens": 413598122.0, + "step": 10835 + }, + { + "epoch": 1.3784505788067676, + "ewc_loss": 0.007811284624040127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.811284740455449e-05, + "grad_norm": 3.88034725189209, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.85743248462677, + "num_tokens": 413634519.0, + "step": 10836 + }, + { + "epoch": 1.378577789085358, + "ewc_loss": 0.007776582147926092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.776582060614601e-05, + "grad_norm": 3.827868700027466, + "learning_rate": 1e-06, + "loss": 0.3845, + 
"mean_token_accuracy": 0.8662208318710327, + "num_tokens": 413674021.0, + "step": 10837 + }, + { + "epoch": 1.3787049993639486, + "ewc_loss": 0.007766710594296455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.766710768919438e-05, + "grad_norm": 3.7704105377197266, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.88036048412323, + "num_tokens": 413713804.0, + "step": 10838 + }, + { + "epoch": 1.3788322096425392, + "ewc_loss": 0.007775994949042797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.775994890835136e-05, + "grad_norm": 3.81227970123291, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8806126713752747, + "num_tokens": 413755665.0, + "step": 10839 + }, + { + "epoch": 1.3789594199211297, + "ewc_loss": 0.0078119998797774315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.811999967088923e-05, + "grad_norm": 3.8473615646362305, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8742532134056091, + "num_tokens": 413790872.0, + "step": 10840 + }, + { + "epoch": 1.3790866301997202, + "ewc_loss": 0.007812351454049349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.812351395841688e-05, + "grad_norm": 3.770954132080078, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8930116891860962, + "num_tokens": 413830798.0, + "step": 10841 + }, + { + "epoch": 1.3792138404783105, + "ewc_loss": 0.007770651951432228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.770652155159041e-05, + "grad_norm": 3.8783011436462402, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8798779845237732, + "num_tokens": 413865960.0, + "step": 10842 + }, + { + "epoch": 1.379341050756901, + "ewc_loss": 0.007848229259252548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.848229142837226e-05, + "grad_norm": 3.86798357963562, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8853352069854736, + "num_tokens": 413898222.0, + "step": 10843 + }, + { + "epoch": 
1.3794682610354916, + "ewc_loss": 0.007802470587193966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.802470645401627e-05, + "grad_norm": 3.873774290084839, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8687373399734497, + "num_tokens": 413931839.0, + "step": 10844 + }, + { + "epoch": 1.3795954713140821, + "ewc_loss": 0.007834835909307003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.834835560061038e-05, + "grad_norm": 3.897153377532959, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8647180199623108, + "num_tokens": 413963815.0, + "step": 10845 + }, + { + "epoch": 1.3797226815926726, + "ewc_loss": 0.007827598601579666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.827598165022209e-05, + "grad_norm": 3.7759110927581787, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8837284445762634, + "num_tokens": 414002476.0, + "step": 10846 + }, + { + "epoch": 1.3798498918712632, + "ewc_loss": 0.007752872537821531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.752872625133023e-05, + "grad_norm": 3.8397903442382812, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.869892418384552, + "num_tokens": 414041121.0, + "step": 10847 + }, + { + "epoch": 1.3799771021498537, + "ewc_loss": 0.007844035513699055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.844035280868411e-05, + "grad_norm": 3.8398866653442383, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8616335391998291, + "num_tokens": 414080447.0, + "step": 10848 + }, + { + "epoch": 1.3801043124284442, + "ewc_loss": 0.007808607071638107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.808607188053429e-05, + "grad_norm": 3.9853146076202393, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8648135662078857, + "num_tokens": 414108925.0, + "step": 10849 + }, + { + "epoch": 1.3802315227070348, + "ewc_loss": 0.00791230145841837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.912301225587726e-05, + "grad_norm": 3.7634952068328857, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8793025016784668, + "num_tokens": 414154296.0, + "step": 10850 + }, + { + "epoch": 1.3803587329856253, + "ewc_loss": 0.007723765913397074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.723765884293243e-05, + "grad_norm": 3.8162362575531006, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.862613320350647, + "num_tokens": 414194612.0, + "step": 10851 + }, + { + "epoch": 1.3804859432642158, + "ewc_loss": 0.007854213938117027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.854213617974892e-05, + "grad_norm": 3.9008312225341797, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8774169683456421, + "num_tokens": 414229939.0, + "step": 10852 + }, + { + "epoch": 1.3806131535428063, + "ewc_loss": 0.007867693901062012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86769378464669e-05, + "grad_norm": 3.8751425743103027, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8713439702987671, + "num_tokens": 414265193.0, + "step": 10853 + }, + { + "epoch": 1.3807403638213969, + "ewc_loss": 0.007816372439265251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.816372090019286e-05, + "grad_norm": 3.7924325466156006, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8730430603027344, + "num_tokens": 414301471.0, + "step": 10854 + }, + { + "epoch": 1.3808675740999874, + "ewc_loss": 0.007786272093653679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78627218096517e-05, + "grad_norm": 3.9114344120025635, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8452074527740479, + "num_tokens": 414338755.0, + "step": 10855 + }, + { + "epoch": 1.380994784378578, + "ewc_loss": 0.007886217907071114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.88621764513664e-05, + "grad_norm": 3.801157236099243, + "learning_rate": 1e-06, + "loss": 0.3679, + 
"mean_token_accuracy": 0.8754643201828003, + "num_tokens": 414379996.0, + "step": 10856 + }, + { + "epoch": 1.3811219946571682, + "ewc_loss": 0.007770366966724396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.770366937620565e-05, + "grad_norm": 3.7931950092315674, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8770788908004761, + "num_tokens": 414421905.0, + "step": 10857 + }, + { + "epoch": 1.3812492049357588, + "ewc_loss": 0.007804497145116329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.804496999597177e-05, + "grad_norm": 3.7646336555480957, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8788906931877136, + "num_tokens": 414465149.0, + "step": 10858 + }, + { + "epoch": 1.3813764152143493, + "ewc_loss": 0.007785166148096323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.785166235407814e-05, + "grad_norm": 3.8074381351470947, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8839517831802368, + "num_tokens": 414504086.0, + "step": 10859 + }, + { + "epoch": 1.3815036254929398, + "ewc_loss": 0.007801385130733252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.80138507252559e-05, + "grad_norm": 3.857694149017334, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8593928217887878, + "num_tokens": 414545392.0, + "step": 10860 + }, + { + "epoch": 1.3816308357715303, + "ewc_loss": 0.007817155681550503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.817155710654333e-05, + "grad_norm": 3.8733880519866943, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8404103517532349, + "num_tokens": 414586445.0, + "step": 10861 + }, + { + "epoch": 1.3817580460501209, + "ewc_loss": 0.0078107863664627075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.810786337358877e-05, + "grad_norm": 3.8767707347869873, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8830691576004028, + "num_tokens": 414618467.0, + "step": 10862 + }, + { + "epoch": 
1.3818852563287114, + "ewc_loss": 0.007795241195708513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.795241253916174e-05, + "grad_norm": 3.8703553676605225, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8845324516296387, + "num_tokens": 414651253.0, + "step": 10863 + }, + { + "epoch": 1.382012466607302, + "ewc_loss": 0.007789734750986099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78973480919376e-05, + "grad_norm": 3.851639986038208, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.873748779296875, + "num_tokens": 414692016.0, + "step": 10864 + }, + { + "epoch": 1.3821396768858925, + "ewc_loss": 0.007779664359986782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.779664156259969e-05, + "grad_norm": 3.8419032096862793, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8787243366241455, + "num_tokens": 414731840.0, + "step": 10865 + }, + { + "epoch": 1.3822668871644828, + "ewc_loss": 0.0077827624045312405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.782762259012088e-05, + "grad_norm": 3.8585896492004395, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8769087195396423, + "num_tokens": 414767110.0, + "step": 10866 + }, + { + "epoch": 1.3823940974430733, + "ewc_loss": 0.0077922469936311245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.792247197357938e-05, + "grad_norm": 3.869497060775757, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8795660734176636, + "num_tokens": 414799340.0, + "step": 10867 + }, + { + "epoch": 1.3825213077216638, + "ewc_loss": 0.007771862670779228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.771862874506041e-05, + "grad_norm": 3.779911518096924, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8773064017295837, + "num_tokens": 414840915.0, + "step": 10868 + }, + { + "epoch": 1.3826485180002543, + "ewc_loss": 0.0077372705563902855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.737270789220929e-05, + "grad_norm": 3.85791277885437, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8644556999206543, + "num_tokens": 414877298.0, + "step": 10869 + }, + { + "epoch": 1.3827757282788449, + "ewc_loss": 0.007819018326699734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.819018355803564e-05, + "grad_norm": 3.8130548000335693, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.881118655204773, + "num_tokens": 414916431.0, + "step": 10870 + }, + { + "epoch": 1.3829029385574354, + "ewc_loss": 0.007771392352879047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.771392120048404e-05, + "grad_norm": 3.8156590461730957, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8732365965843201, + "num_tokens": 414958148.0, + "step": 10871 + }, + { + "epoch": 1.383030148836026, + "ewc_loss": 0.007793401833623648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.793401891831309e-05, + "grad_norm": 3.7947983741760254, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8980870246887207, + "num_tokens": 414997008.0, + "step": 10872 + }, + { + "epoch": 1.3831573591146165, + "ewc_loss": 0.007764799520373344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.764799374854192e-05, + "grad_norm": 3.8308417797088623, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8600550889968872, + "num_tokens": 415042893.0, + "step": 10873 + }, + { + "epoch": 1.383284569393207, + "ewc_loss": 0.007787108886986971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.787108916090801e-05, + "grad_norm": 3.8611972332000732, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8726842999458313, + "num_tokens": 415077476.0, + "step": 10874 + }, + { + "epoch": 1.3834117796717975, + "ewc_loss": 0.007781198713928461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.7811986557208e-05, + "grad_norm": 3.792480707168579, + "learning_rate": 1e-06, + "loss": 0.389, + 
"mean_token_accuracy": 0.8679071068763733, + "num_tokens": 415124329.0, + "step": 10875 + }, + { + "epoch": 1.383538989950388, + "ewc_loss": 0.0077305450104177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.730544894002378e-05, + "grad_norm": 3.823671340942383, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.88435959815979, + "num_tokens": 415164101.0, + "step": 10876 + }, + { + "epoch": 1.3836662002289786, + "ewc_loss": 0.007775355130434036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.77535533416085e-05, + "grad_norm": 3.8203725814819336, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8755953311920166, + "num_tokens": 415203792.0, + "step": 10877 + }, + { + "epoch": 1.383793410507569, + "ewc_loss": 0.007745763286948204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.745763286948204e-05, + "grad_norm": 4.103460788726807, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8713793754577637, + "num_tokens": 415233754.0, + "step": 10878 + }, + { + "epoch": 1.3839206207861596, + "ewc_loss": 0.007914813235402107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.914812886156142e-05, + "grad_norm": 3.816751480102539, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.882174015045166, + "num_tokens": 415269567.0, + "step": 10879 + }, + { + "epoch": 1.3840478310647502, + "ewc_loss": 0.007663210853934288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.66321099945344e-05, + "grad_norm": 3.782184600830078, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8777214288711548, + "num_tokens": 415312660.0, + "step": 10880 + }, + { + "epoch": 1.3841750413433405, + "ewc_loss": 0.0077459136955440044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.745913899270818e-05, + "grad_norm": 3.75738263130188, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8829027414321899, + "num_tokens": 415355377.0, + "step": 10881 + }, + { + "epoch": 
1.384302251621931, + "ewc_loss": 0.007738332264125347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.738332351436839e-05, + "grad_norm": 3.8353488445281982, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8864439129829407, + "num_tokens": 415393109.0, + "step": 10882 + }, + { + "epoch": 1.3844294619005215, + "ewc_loss": 0.00777237955480814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.772379467496648e-05, + "grad_norm": 3.789274215698242, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.885995626449585, + "num_tokens": 415428650.0, + "step": 10883 + }, + { + "epoch": 1.384556672179112, + "ewc_loss": 0.007707448676228523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.707448821747676e-05, + "grad_norm": 3.8040316104888916, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8717718720436096, + "num_tokens": 415467166.0, + "step": 10884 + }, + { + "epoch": 1.3846838824577026, + "ewc_loss": 0.007731306832283735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.731306686764583e-05, + "grad_norm": 3.811868906021118, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8872271180152893, + "num_tokens": 415505633.0, + "step": 10885 + }, + { + "epoch": 1.384811092736293, + "ewc_loss": 0.007735467050224543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.735467079328373e-05, + "grad_norm": 3.8506627082824707, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8733227252960205, + "num_tokens": 415546254.0, + "step": 10886 + }, + { + "epoch": 1.3849383030148836, + "ewc_loss": 0.007737261243164539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.737261330476031e-05, + "grad_norm": 3.7978603839874268, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8749101161956787, + "num_tokens": 415584999.0, + "step": 10887 + }, + { + "epoch": 1.3850655132934742, + "ewc_loss": 0.007685092743486166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.685092714382336e-05, + "grad_norm": 3.825138568878174, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8780831694602966, + "num_tokens": 415624608.0, + "step": 10888 + }, + { + "epoch": 1.3851927235720647, + "ewc_loss": 0.007715510670095682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.715510582784191e-05, + "grad_norm": 3.8382976055145264, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.859817624092102, + "num_tokens": 415663405.0, + "step": 10889 + }, + { + "epoch": 1.385319933850655, + "ewc_loss": 0.007718687877058983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.718687993474305e-05, + "grad_norm": 3.836420774459839, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.888890266418457, + "num_tokens": 415698934.0, + "step": 10890 + }, + { + "epoch": 1.3854471441292455, + "ewc_loss": 0.007709852885454893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.709852798143402e-05, + "grad_norm": 3.874769687652588, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8458461761474609, + "num_tokens": 415736973.0, + "step": 10891 + }, + { + "epoch": 1.385574354407836, + "ewc_loss": 0.007741207256913185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.741207082290202e-05, + "grad_norm": 3.827214002609253, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8767547607421875, + "num_tokens": 415778364.0, + "step": 10892 + }, + { + "epoch": 1.3857015646864266, + "ewc_loss": 0.007687647361308336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.687647303100675e-05, + "grad_norm": 3.7676570415496826, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8906005620956421, + "num_tokens": 415817502.0, + "step": 10893 + }, + { + "epoch": 1.385828774965017, + "ewc_loss": 0.00769352400675416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.693524094065651e-05, + "grad_norm": 3.821202278137207, + "learning_rate": 1e-06, + "loss": 0.3854, + 
"mean_token_accuracy": 0.8702866435050964, + "num_tokens": 415860364.0, + "step": 10894 + }, + { + "epoch": 1.3859559852436076, + "ewc_loss": 0.007739630993455648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.73963110987097e-05, + "grad_norm": 3.8214449882507324, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8784501552581787, + "num_tokens": 415896816.0, + "step": 10895 + }, + { + "epoch": 1.3860831955221982, + "ewc_loss": 0.007698130328208208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.698130502831191e-05, + "grad_norm": 3.8886899948120117, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8562974333763123, + "num_tokens": 415935173.0, + "step": 10896 + }, + { + "epoch": 1.3862104058007887, + "ewc_loss": 0.0077574411407113075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.757441198918968e-05, + "grad_norm": 3.801173210144043, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.883964478969574, + "num_tokens": 415977043.0, + "step": 10897 + }, + { + "epoch": 1.3863376160793792, + "ewc_loss": 0.007678063586354256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.678063411731273e-05, + "grad_norm": 3.8441686630249023, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8832491040229797, + "num_tokens": 416014376.0, + "step": 10898 + }, + { + "epoch": 1.3864648263579697, + "ewc_loss": 0.0077425213530659676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.742521120235324e-05, + "grad_norm": 3.809774160385132, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8627759218215942, + "num_tokens": 416055648.0, + "step": 10899 + }, + { + "epoch": 1.3865920366365603, + "ewc_loss": 0.00771476561203599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.714765524724498e-05, + "grad_norm": 3.9350802898406982, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8662022948265076, + "num_tokens": 416087062.0, + "step": 10900 + }, + { + "epoch": 
1.3867192469151508, + "ewc_loss": 0.007811517454683781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.811517571099102e-05, + "grad_norm": 3.877767324447632, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8811662197113037, + "num_tokens": 416122559.0, + "step": 10901 + }, + { + "epoch": 1.3868464571937413, + "ewc_loss": 0.007729972247034311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729972276138142e-05, + "grad_norm": 3.8637049198150635, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8683983683586121, + "num_tokens": 416155228.0, + "step": 10902 + }, + { + "epoch": 1.3869736674723319, + "ewc_loss": 0.007754450663924217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.754450780339539e-05, + "grad_norm": 3.911410093307495, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.861605703830719, + "num_tokens": 416191190.0, + "step": 10903 + }, + { + "epoch": 1.3871008777509224, + "ewc_loss": 0.007803875021636486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.803874905221164e-05, + "grad_norm": 3.8455278873443604, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8777539730072021, + "num_tokens": 416228009.0, + "step": 10904 + }, + { + "epoch": 1.387228088029513, + "ewc_loss": 0.007741890847682953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.741891022305936e-05, + "grad_norm": 3.839827060699463, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8692060708999634, + "num_tokens": 416263657.0, + "step": 10905 + }, + { + "epoch": 1.3873552983081032, + "ewc_loss": 0.007762589491903782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.762589666526765e-05, + "grad_norm": 3.8307764530181885, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8676863312721252, + "num_tokens": 416305187.0, + "step": 10906 + }, + { + "epoch": 1.3874825085866938, + "ewc_loss": 0.0077649676240980625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.76496744947508e-05, + "grad_norm": 3.864478588104248, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8716834783554077, + "num_tokens": 416342578.0, + "step": 10907 + }, + { + "epoch": 1.3876097188652843, + "ewc_loss": 0.007774475030601025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.774474943289533e-05, + "grad_norm": 3.837034225463867, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8717249631881714, + "num_tokens": 416378969.0, + "step": 10908 + }, + { + "epoch": 1.3877369291438748, + "ewc_loss": 0.007761981338262558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.761981396470219e-05, + "grad_norm": 3.8500657081604004, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8728195428848267, + "num_tokens": 416412958.0, + "step": 10909 + }, + { + "epoch": 1.3878641394224653, + "ewc_loss": 0.007797638885676861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.797638681950048e-05, + "grad_norm": 3.9104065895080566, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8556400537490845, + "num_tokens": 416445643.0, + "step": 10910 + }, + { + "epoch": 1.3879913497010559, + "ewc_loss": 0.007805402856320143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.805402856320143e-05, + "grad_norm": 3.829702615737915, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8617876768112183, + "num_tokens": 416484891.0, + "step": 10911 + }, + { + "epoch": 1.3881185599796464, + "ewc_loss": 0.00776020297780633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.760203152429312e-05, + "grad_norm": 3.897904634475708, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.861160933971405, + "num_tokens": 416520504.0, + "step": 10912 + }, + { + "epoch": 1.388245770258237, + "ewc_loss": 0.007842372171580791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.842371996957809e-05, + "grad_norm": 3.7752513885498047, + "learning_rate": 1e-06, + "loss": 0.365, + 
"mean_token_accuracy": 0.874237060546875, + "num_tokens": 416561877.0, + "step": 10913 + }, + { + "epoch": 1.3883729805368275, + "ewc_loss": 0.007752110715955496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.752110832370818e-05, + "grad_norm": 3.8383748531341553, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8774882555007935, + "num_tokens": 416597676.0, + "step": 10914 + }, + { + "epoch": 1.3885001908154178, + "ewc_loss": 0.007831375114619732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.831375114619732e-05, + "grad_norm": 3.8848884105682373, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8796418905258179, + "num_tokens": 416630557.0, + "step": 10915 + }, + { + "epoch": 1.3886274010940083, + "ewc_loss": 0.007845105603337288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.845105574233457e-05, + "grad_norm": 3.8285815715789795, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8765300512313843, + "num_tokens": 416668604.0, + "step": 10916 + }, + { + "epoch": 1.3887546113725988, + "ewc_loss": 0.007785727735608816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78572793933563e-05, + "grad_norm": 3.84306001663208, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8548117280006409, + "num_tokens": 416710246.0, + "step": 10917 + }, + { + "epoch": 1.3888818216511893, + "ewc_loss": 0.007835479453206062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.835479482309893e-05, + "grad_norm": 3.87334942817688, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8742201924324036, + "num_tokens": 416744962.0, + "step": 10918 + }, + { + "epoch": 1.3890090319297799, + "ewc_loss": 0.007846166379749775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.846166408853605e-05, + "grad_norm": 3.867025852203369, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8748403787612915, + "num_tokens": 416779483.0, + "step": 10919 + }, + { + "epoch": 
1.3891362422083704, + "ewc_loss": 0.007825177162885666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825176726328209e-05, + "grad_norm": 3.7663981914520264, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8858277797698975, + "num_tokens": 416822135.0, + "step": 10920 + }, + { + "epoch": 1.389263452486961, + "ewc_loss": 0.007761550601571798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.761550659779459e-05, + "grad_norm": 3.862551689147949, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8724538683891296, + "num_tokens": 416860383.0, + "step": 10921 + }, + { + "epoch": 1.3893906627655515, + "ewc_loss": 0.007847648113965988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.847648521419615e-05, + "grad_norm": 3.8903331756591797, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8693219423294067, + "num_tokens": 416896960.0, + "step": 10922 + }, + { + "epoch": 1.389517873044142, + "ewc_loss": 0.007826934568583965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.826934597687796e-05, + "grad_norm": 3.8271288871765137, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8845460414886475, + "num_tokens": 416931981.0, + "step": 10923 + }, + { + "epoch": 1.3896450833227325, + "ewc_loss": 0.007771740667521954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.771740638418123e-05, + "grad_norm": 3.85783314704895, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8827934861183167, + "num_tokens": 416968179.0, + "step": 10924 + }, + { + "epoch": 1.389772293601323, + "ewc_loss": 0.00781293772161007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.812937838025391e-05, + "grad_norm": 3.815736770629883, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8781474828720093, + "num_tokens": 417008879.0, + "step": 10925 + }, + { + "epoch": 1.3898995038799136, + "ewc_loss": 0.007763374131172895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.763374014757574e-05, + "grad_norm": 3.8341846466064453, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8848850727081299, + "num_tokens": 417042497.0, + "step": 10926 + }, + { + "epoch": 1.390026714158504, + "ewc_loss": 0.007808944676071405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.808944792486727e-05, + "grad_norm": 3.818408966064453, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8757222890853882, + "num_tokens": 417084396.0, + "step": 10927 + }, + { + "epoch": 1.3901539244370946, + "ewc_loss": 0.007774392142891884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.774391997372732e-05, + "grad_norm": 3.839423179626465, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8872371912002563, + "num_tokens": 417121014.0, + "step": 10928 + }, + { + "epoch": 1.3902811347156852, + "ewc_loss": 0.007777930237352848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.777930295560509e-05, + "grad_norm": 3.8369603157043457, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8727208971977234, + "num_tokens": 417160169.0, + "step": 10929 + }, + { + "epoch": 1.3904083449942755, + "ewc_loss": 0.007748461328446865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.748461212031543e-05, + "grad_norm": 3.9041695594787598, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8597602844238281, + "num_tokens": 417195024.0, + "step": 10930 + }, + { + "epoch": 1.390535555272866, + "ewc_loss": 0.007805738132447004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.805738277966157e-05, + "grad_norm": 3.8453097343444824, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8798032999038696, + "num_tokens": 417231344.0, + "step": 10931 + }, + { + "epoch": 1.3906627655514565, + "ewc_loss": 0.007730187848210335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.730187644483522e-05, + "grad_norm": 3.8340225219726562, + "learning_rate": 1e-06, + "loss": 0.3787, + 
"mean_token_accuracy": 0.8678051233291626, + "num_tokens": 417270266.0, + "step": 10932 + }, + { + "epoch": 1.390789975830047, + "ewc_loss": 0.007772095501422882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.772095705149695e-05, + "grad_norm": 3.869807243347168, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8723721504211426, + "num_tokens": 417308292.0, + "step": 10933 + }, + { + "epoch": 1.3909171861086376, + "ewc_loss": 0.007777648512274027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.77764871600084e-05, + "grad_norm": 3.8463661670684814, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.872654378414154, + "num_tokens": 417346358.0, + "step": 10934 + }, + { + "epoch": 1.391044396387228, + "ewc_loss": 0.007757897954434156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.757898129057139e-05, + "grad_norm": 3.8620498180389404, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8740131258964539, + "num_tokens": 417381924.0, + "step": 10935 + }, + { + "epoch": 1.3911716066658186, + "ewc_loss": 0.00778612308204174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78612302383408e-05, + "grad_norm": 3.917229413986206, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8480507731437683, + "num_tokens": 417417779.0, + "step": 10936 + }, + { + "epoch": 1.3912988169444092, + "ewc_loss": 0.007814067415893078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.814067794242874e-05, + "grad_norm": 3.8742570877075195, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8673515319824219, + "num_tokens": 417459076.0, + "step": 10937 + }, + { + "epoch": 1.3914260272229997, + "ewc_loss": 0.007761936169117689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.761936285533011e-05, + "grad_norm": 3.8182685375213623, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8750030994415283, + "num_tokens": 417499422.0, + "step": 10938 + }, + { + "epoch": 
1.39155323750159, + "ewc_loss": 0.007765512447804213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.765512418700382e-05, + "grad_norm": 3.8525619506835938, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.874352216720581, + "num_tokens": 417541776.0, + "step": 10939 + }, + { + "epoch": 1.3916804477801805, + "ewc_loss": 0.007796451449394226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.796451245667413e-05, + "grad_norm": 3.897491216659546, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8627680540084839, + "num_tokens": 417576469.0, + "step": 10940 + }, + { + "epoch": 1.391807658058771, + "ewc_loss": 0.007794467266649008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.794467092026025e-05, + "grad_norm": 3.871587038040161, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8747320175170898, + "num_tokens": 417611726.0, + "step": 10941 + }, + { + "epoch": 1.3919348683373616, + "ewc_loss": 0.007768832612782717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.768832438159734e-05, + "grad_norm": 3.8156020641326904, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8662096261978149, + "num_tokens": 417650135.0, + "step": 10942 + }, + { + "epoch": 1.392062078615952, + "ewc_loss": 0.007766570895910263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.766571070533246e-05, + "grad_norm": 3.84901762008667, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8559548854827881, + "num_tokens": 417690981.0, + "step": 10943 + }, + { + "epoch": 1.3921892888945426, + "ewc_loss": 0.007812595926225185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.812595868017524e-05, + "grad_norm": 3.8441128730773926, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8619869351387024, + "num_tokens": 417732152.0, + "step": 10944 + }, + { + "epoch": 1.3923164991731332, + "ewc_loss": 0.007794483099132776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.794483099132776e-05, + "grad_norm": 3.8422985076904297, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8631836175918579, + "num_tokens": 417771814.0, + "step": 10945 + }, + { + "epoch": 1.3924437094517237, + "ewc_loss": 0.007792550604790449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.792550604790449e-05, + "grad_norm": 3.8183250427246094, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8638237714767456, + "num_tokens": 417810663.0, + "step": 10946 + }, + { + "epoch": 1.3925709197303142, + "ewc_loss": 0.007799171842634678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.799171726219356e-05, + "grad_norm": 3.8558545112609863, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8771041631698608, + "num_tokens": 417845664.0, + "step": 10947 + }, + { + "epoch": 1.3926981300089047, + "ewc_loss": 0.007822598330676556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.822598126949742e-05, + "grad_norm": 3.7961137294769287, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8656257390975952, + "num_tokens": 417889617.0, + "step": 10948 + }, + { + "epoch": 1.3928253402874953, + "ewc_loss": 0.007775789126753807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.775788981234655e-05, + "grad_norm": 3.8319592475891113, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8722206354141235, + "num_tokens": 417930820.0, + "step": 10949 + }, + { + "epoch": 1.3929525505660858, + "ewc_loss": 0.007817570120096207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.81756971264258e-05, + "grad_norm": 3.8811287879943848, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8740018606185913, + "num_tokens": 417966799.0, + "step": 10950 + }, + { + "epoch": 1.3930797608446763, + "ewc_loss": 0.007848844863474369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.848844688851386e-05, + "grad_norm": 3.8223700523376465, + "learning_rate": 1e-06, + "loss": 0.3842, + 
"mean_token_accuracy": 0.8660737872123718, + "num_tokens": 418009328.0, + "step": 10951 + }, + { + "epoch": 1.3932069711232669, + "ewc_loss": 0.007781559601426125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.781559543218464e-05, + "grad_norm": 3.8218090534210205, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8761818408966064, + "num_tokens": 418048548.0, + "step": 10952 + }, + { + "epoch": 1.3933341814018574, + "ewc_loss": 0.007794672157615423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.794672274030745e-05, + "grad_norm": 3.757389783859253, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8720917701721191, + "num_tokens": 418094947.0, + "step": 10953 + }, + { + "epoch": 1.393461391680448, + "ewc_loss": 0.007756994571536779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.756994455121458e-05, + "grad_norm": 3.834423303604126, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8646944761276245, + "num_tokens": 418137144.0, + "step": 10954 + }, + { + "epoch": 1.3935886019590382, + "ewc_loss": 0.007823644205927849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.823644409654662e-05, + "grad_norm": 3.814340829849243, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8932679891586304, + "num_tokens": 418171449.0, + "step": 10955 + }, + { + "epoch": 1.3937158122376287, + "ewc_loss": 0.00777060491964221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.770604861434549e-05, + "grad_norm": 3.8280093669891357, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8756914138793945, + "num_tokens": 418209854.0, + "step": 10956 + }, + { + "epoch": 1.3938430225162193, + "ewc_loss": 0.007760508917272091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.760508742649108e-05, + "grad_norm": 3.8428966999053955, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8816224932670593, + "num_tokens": 418245121.0, + "step": 10957 + }, + { + "epoch": 
1.3939702327948098, + "ewc_loss": 0.007757936138659716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.757935964036733e-05, + "grad_norm": 3.8092989921569824, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8780423402786255, + "num_tokens": 418284158.0, + "step": 10958 + }, + { + "epoch": 1.3940974430734003, + "ewc_loss": 0.007749687880277634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.749687938485295e-05, + "grad_norm": 3.8270492553710938, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.880339503288269, + "num_tokens": 418324091.0, + "step": 10959 + }, + { + "epoch": 1.3942246533519909, + "ewc_loss": 0.007752683945000172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.752684177830815e-05, + "grad_norm": 3.8263933658599854, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8747044801712036, + "num_tokens": 418364764.0, + "step": 10960 + }, + { + "epoch": 1.3943518636305814, + "ewc_loss": 0.00774287898093462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.742879097349942e-05, + "grad_norm": 3.84919810295105, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8695498704910278, + "num_tokens": 418403974.0, + "step": 10961 + }, + { + "epoch": 1.394479073909172, + "ewc_loss": 0.0077402712777256966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.740271394141018e-05, + "grad_norm": 3.805285930633545, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8646995425224304, + "num_tokens": 418446341.0, + "step": 10962 + }, + { + "epoch": 1.3946062841877624, + "ewc_loss": 0.007705234922468662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70523474784568e-05, + "grad_norm": 3.8031105995178223, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8740857839584351, + "num_tokens": 418490989.0, + "step": 10963 + }, + { + "epoch": 1.3947334944663528, + "ewc_loss": 0.007721700239926577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.721700239926577e-05, + "grad_norm": 3.9120240211486816, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8676902651786804, + "num_tokens": 418524633.0, + "step": 10964 + }, + { + "epoch": 1.3948607047449433, + "ewc_loss": 0.0077813658863306046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.781366002745926e-05, + "grad_norm": 3.8787682056427, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8742365837097168, + "num_tokens": 418562576.0, + "step": 10965 + }, + { + "epoch": 1.3949879150235338, + "ewc_loss": 0.00770565215498209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.705652387812734e-05, + "grad_norm": 3.823366165161133, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8725285530090332, + "num_tokens": 418596537.0, + "step": 10966 + }, + { + "epoch": 1.3951151253021243, + "ewc_loss": 0.007702456787228584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.702456787228584e-05, + "grad_norm": 3.9349303245544434, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8722444772720337, + "num_tokens": 418625800.0, + "step": 10967 + }, + { + "epoch": 1.3952423355807149, + "ewc_loss": 0.007789415307343006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.789415394654498e-05, + "grad_norm": 3.839442491531372, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8607056736946106, + "num_tokens": 418663818.0, + "step": 10968 + }, + { + "epoch": 1.3953695458593054, + "ewc_loss": 0.007701382040977478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70138212828897e-05, + "grad_norm": 3.821523666381836, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8794668912887573, + "num_tokens": 418700607.0, + "step": 10969 + }, + { + "epoch": 1.395496756137896, + "ewc_loss": 0.007738499902188778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.738499698461965e-05, + "grad_norm": 3.8463258743286133, + "learning_rate": 1e-06, + "loss": 0.3954, + 
"mean_token_accuracy": 0.8685035705566406, + "num_tokens": 418739357.0, + "step": 10970 + }, + { + "epoch": 1.3956239664164865, + "ewc_loss": 0.0077619473449885845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.761947199469432e-05, + "grad_norm": 3.9070146083831787, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.880534291267395, + "num_tokens": 418769752.0, + "step": 10971 + }, + { + "epoch": 1.395751176695077, + "ewc_loss": 0.007797159720212221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.797159923939034e-05, + "grad_norm": 3.8661677837371826, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8541347980499268, + "num_tokens": 418808196.0, + "step": 10972 + }, + { + "epoch": 1.3958783869736675, + "ewc_loss": 0.007765588816255331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.765588816255331e-05, + "grad_norm": 3.8648629188537598, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8892924785614014, + "num_tokens": 418843335.0, + "step": 10973 + }, + { + "epoch": 1.396005597252258, + "ewc_loss": 0.007780659012496471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.780658779665828e-05, + "grad_norm": 3.8711464405059814, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8820708990097046, + "num_tokens": 418876591.0, + "step": 10974 + }, + { + "epoch": 1.3961328075308486, + "ewc_loss": 0.007794616743922234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.794616976752877e-05, + "grad_norm": 3.897347927093506, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.861436665058136, + "num_tokens": 418910482.0, + "step": 10975 + }, + { + "epoch": 1.396260017809439, + "ewc_loss": 0.007817888632416725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.817888399586082e-05, + "grad_norm": 3.910856008529663, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8660976886749268, + "num_tokens": 418942248.0, + "step": 10976 + }, + { + "epoch": 
1.3963872280880296, + "ewc_loss": 0.007828072644770145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.828072557458654e-05, + "grad_norm": 3.8509113788604736, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8762437105178833, + "num_tokens": 418977770.0, + "step": 10977 + }, + { + "epoch": 1.3965144383666201, + "ewc_loss": 0.0078111132606863976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.811113027855754e-05, + "grad_norm": 3.870847702026367, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8676855564117432, + "num_tokens": 419016464.0, + "step": 10978 + }, + { + "epoch": 1.3966416486452105, + "ewc_loss": 0.00784281361848116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84281364758499e-05, + "grad_norm": 3.8537611961364746, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8654640913009644, + "num_tokens": 419053254.0, + "step": 10979 + }, + { + "epoch": 1.396768858923801, + "ewc_loss": 0.007836084812879562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.836084841983393e-05, + "grad_norm": 3.8530876636505127, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8686683177947998, + "num_tokens": 419088940.0, + "step": 10980 + }, + { + "epoch": 1.3968960692023915, + "ewc_loss": 0.007851094007492065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.851094414945692e-05, + "grad_norm": 3.829202175140381, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8729493618011475, + "num_tokens": 419127897.0, + "step": 10981 + }, + { + "epoch": 1.397023279480982, + "ewc_loss": 0.007837369106709957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.837369048502296e-05, + "grad_norm": 3.8057658672332764, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8764346837997437, + "num_tokens": 419170921.0, + "step": 10982 + }, + { + "epoch": 1.3971504897595726, + "ewc_loss": 0.007836200296878815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.836200529709458e-05, + "grad_norm": 3.883012294769287, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.87956303358078, + "num_tokens": 419206032.0, + "step": 10983 + }, + { + "epoch": 1.397277700038163, + "ewc_loss": 0.007868410088121891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868410466471687e-05, + "grad_norm": 3.8483338356018066, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.887496292591095, + "num_tokens": 419244240.0, + "step": 10984 + }, + { + "epoch": 1.3974049103167536, + "ewc_loss": 0.007830441929399967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.830442336853594e-05, + "grad_norm": 3.877563238143921, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8818408250808716, + "num_tokens": 419274902.0, + "step": 10985 + }, + { + "epoch": 1.3975321205953442, + "ewc_loss": 0.00784597359597683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84597359597683e-05, + "grad_norm": 3.8144187927246094, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8862694501876831, + "num_tokens": 419312295.0, + "step": 10986 + }, + { + "epoch": 1.3976593308739347, + "ewc_loss": 0.007791584357619286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.791584357619286e-05, + "grad_norm": 3.870107650756836, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8558782339096069, + "num_tokens": 419350999.0, + "step": 10987 + }, + { + "epoch": 1.397786541152525, + "ewc_loss": 0.007837314158678055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.837313751224428e-05, + "grad_norm": 3.8602182865142822, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8753559589385986, + "num_tokens": 419386739.0, + "step": 10988 + }, + { + "epoch": 1.3979137514311155, + "ewc_loss": 0.007810765877366066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.810765964677557e-05, + "grad_norm": 3.7805516719818115, + "learning_rate": 1e-06, + "loss": 0.344, + 
"mean_token_accuracy": 0.8798960447311401, + "num_tokens": 419430353.0, + "step": 10989 + }, + { + "epoch": 1.398040961709706, + "ewc_loss": 0.007773112040013075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.773112156428397e-05, + "grad_norm": 3.9080471992492676, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8705374002456665, + "num_tokens": 419462910.0, + "step": 10990 + }, + { + "epoch": 1.3981681719882966, + "ewc_loss": 0.007874189876019955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.874189759604633e-05, + "grad_norm": 3.8418943881988525, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8840835094451904, + "num_tokens": 419498727.0, + "step": 10991 + }, + { + "epoch": 1.398295382266887, + "ewc_loss": 0.007782974746078253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.782974716974422e-05, + "grad_norm": 3.898209571838379, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8660088181495667, + "num_tokens": 419534496.0, + "step": 10992 + }, + { + "epoch": 1.3984225925454776, + "ewc_loss": 0.00784390326589346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.843902858439833e-05, + "grad_norm": 3.824639081954956, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8701196908950806, + "num_tokens": 419575760.0, + "step": 10993 + }, + { + "epoch": 1.3985498028240682, + "ewc_loss": 0.007779140956699848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.779141014907509e-05, + "grad_norm": 3.835033655166626, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8792551755905151, + "num_tokens": 419616822.0, + "step": 10994 + }, + { + "epoch": 1.3986770131026587, + "ewc_loss": 0.007797844707965851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.797844591550529e-05, + "grad_norm": 3.8508613109588623, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.879422664642334, + "num_tokens": 419650425.0, + "step": 10995 + }, + { + "epoch": 
1.3988042233812492, + "ewc_loss": 0.007805342320352793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.805342465871945e-05, + "grad_norm": 3.8429813385009766, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8829396367073059, + "num_tokens": 419685501.0, + "step": 10996 + }, + { + "epoch": 1.3989314336598397, + "ewc_loss": 0.007798750419169664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.798750448273495e-05, + "grad_norm": 3.8337416648864746, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8761321306228638, + "num_tokens": 419724471.0, + "step": 10997 + }, + { + "epoch": 1.3990586439384303, + "ewc_loss": 0.007785348687320948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78534886194393e-05, + "grad_norm": 3.8190653324127197, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8815600275993347, + "num_tokens": 419763683.0, + "step": 10998 + }, + { + "epoch": 1.3991858542170208, + "ewc_loss": 0.00777440657839179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.77440654928796e-05, + "grad_norm": 3.827920913696289, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8786928653717041, + "num_tokens": 419801238.0, + "step": 10999 + }, + { + "epoch": 1.3993130644956113, + "ewc_loss": 0.00778457336127758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.784573244862258e-05, + "grad_norm": 3.955537796020508, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8695574402809143, + "num_tokens": 419832787.0, + "step": 11000 + }, + { + "epoch": 1.3994402747742019, + "ewc_loss": 0.007859562523663044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.859562174417078e-05, + "grad_norm": 3.9070863723754883, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8645037412643433, + "num_tokens": 419864294.0, + "step": 11001 + }, + { + "epoch": 1.3995674850527924, + "ewc_loss": 0.007784718181937933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.784718036418781e-05, + "grad_norm": 3.8335483074188232, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8757368326187134, + "num_tokens": 419900493.0, + "step": 11002 + }, + { + "epoch": 1.399694695331383, + "ewc_loss": 0.007795128971338272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.795129204168916e-05, + "grad_norm": 3.821791887283325, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8898707628250122, + "num_tokens": 419938917.0, + "step": 11003 + }, + { + "epoch": 1.3998219056099732, + "ewc_loss": 0.007801597472280264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.801597530487925e-05, + "grad_norm": 3.7494232654571533, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8885433673858643, + "num_tokens": 419978771.0, + "step": 11004 + }, + { + "epoch": 1.3999491158885637, + "ewc_loss": 0.007793762255460024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.793762051733211e-05, + "grad_norm": 3.90106463432312, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8718630075454712, + "num_tokens": 420011836.0, + "step": 11005 + }, + { + "epoch": 1.4000763261671543, + "ewc_loss": 0.007903581485152245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.903581717982888e-05, + "grad_norm": 3.8479087352752686, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8534141182899475, + "num_tokens": 420051618.0, + "step": 11006 + }, + { + "epoch": 1.4002035364457448, + "ewc_loss": 0.007813036441802979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.813036791048944e-05, + "grad_norm": 3.8443503379821777, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8611658215522766, + "num_tokens": 420089119.0, + "step": 11007 + }, + { + "epoch": 1.4003307467243353, + "ewc_loss": 0.007833409123122692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.833408744772896e-05, + "grad_norm": 3.864938497543335, + "learning_rate": 1e-06, + "loss": 0.3277, + 
"mean_token_accuracy": 0.8849930763244629, + "num_tokens": 420123706.0, + "step": 11008 + }, + { + "epoch": 1.4004579570029259, + "ewc_loss": 0.007842876017093658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.842876220820472e-05, + "grad_norm": 3.9138550758361816, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.871153712272644, + "num_tokens": 420156723.0, + "step": 11009 + }, + { + "epoch": 1.4005851672815164, + "ewc_loss": 0.007850603200495243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.850603287806734e-05, + "grad_norm": 3.79557728767395, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8741337060928345, + "num_tokens": 420198519.0, + "step": 11010 + }, + { + "epoch": 1.400712377560107, + "ewc_loss": 0.007775059901177883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.775059930281714e-05, + "grad_norm": 3.868182897567749, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8862992525100708, + "num_tokens": 420231722.0, + "step": 11011 + }, + { + "epoch": 1.4008395878386974, + "ewc_loss": 0.00785570964217186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.855709554860368e-05, + "grad_norm": 3.8045814037323, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8767862319946289, + "num_tokens": 420272059.0, + "step": 11012 + }, + { + "epoch": 1.4009667981172877, + "ewc_loss": 0.007805857807397842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.805857603671029e-05, + "grad_norm": 3.859915018081665, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8815954923629761, + "num_tokens": 420307695.0, + "step": 11013 + }, + { + "epoch": 1.4010940083958783, + "ewc_loss": 0.007861913181841373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.861913036322221e-05, + "grad_norm": 3.844376802444458, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.882556676864624, + "num_tokens": 420345989.0, + "step": 11014 + }, + { + "epoch": 
1.4012212186744688, + "ewc_loss": 0.007821924053132534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.821924373274669e-05, + "grad_norm": 3.8283917903900146, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8801187872886658, + "num_tokens": 420387194.0, + "step": 11015 + }, + { + "epoch": 1.4013484289530593, + "ewc_loss": 0.007817097939550877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.817098230589181e-05, + "grad_norm": 3.8033430576324463, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8897137641906738, + "num_tokens": 420427608.0, + "step": 11016 + }, + { + "epoch": 1.4014756392316499, + "ewc_loss": 0.00778124900534749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.781248859828338e-05, + "grad_norm": 3.8804430961608887, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.879731297492981, + "num_tokens": 420461363.0, + "step": 11017 + }, + { + "epoch": 1.4016028495102404, + "ewc_loss": 0.007844772189855576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.844772335374728e-05, + "grad_norm": 3.865452289581299, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8568428754806519, + "num_tokens": 420503577.0, + "step": 11018 + }, + { + "epoch": 1.401730059788831, + "ewc_loss": 0.007798151113092899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.798150909366086e-05, + "grad_norm": 3.8592491149902344, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8856652975082397, + "num_tokens": 420538852.0, + "step": 11019 + }, + { + "epoch": 1.4018572700674214, + "ewc_loss": 0.00781258288770914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.812582771293819e-05, + "grad_norm": 3.86444091796875, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8712514638900757, + "num_tokens": 420577495.0, + "step": 11020 + }, + { + "epoch": 1.401984480346012, + "ewc_loss": 0.007797944359481335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.797944272169843e-05, + "grad_norm": 3.7966816425323486, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8747786283493042, + "num_tokens": 420616123.0, + "step": 11021 + }, + { + "epoch": 1.4021116906246025, + "ewc_loss": 0.007769809570163488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.769809599267319e-05, + "grad_norm": 3.816110372543335, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8832367658615112, + "num_tokens": 420653904.0, + "step": 11022 + }, + { + "epoch": 1.402238900903193, + "ewc_loss": 0.0078117395751178265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.811739487806335e-05, + "grad_norm": 3.791720151901245, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8968015313148499, + "num_tokens": 420693539.0, + "step": 11023 + }, + { + "epoch": 1.4023661111817836, + "ewc_loss": 0.00776375038549304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.763750181766227e-05, + "grad_norm": 3.8369295597076416, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8770691752433777, + "num_tokens": 420729046.0, + "step": 11024 + }, + { + "epoch": 1.402493321460374, + "ewc_loss": 0.007821185514330864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.82118586357683e-05, + "grad_norm": 3.8926901817321777, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8787740468978882, + "num_tokens": 420766143.0, + "step": 11025 + }, + { + "epoch": 1.4026205317389646, + "ewc_loss": 0.0078101386316120625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.810138777131215e-05, + "grad_norm": 3.79746413230896, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8710649609565735, + "num_tokens": 420806367.0, + "step": 11026 + }, + { + "epoch": 1.4027477420175551, + "ewc_loss": 0.007740972097963095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.740972068859264e-05, + "grad_norm": 3.8009917736053467, + "learning_rate": 1e-06, + "loss": 0.3388, + 
"mean_token_accuracy": 0.8807640075683594, + "num_tokens": 420845081.0, + "step": 11027 + }, + { + "epoch": 1.4028749522961454, + "ewc_loss": 0.007785139139741659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.785139314364642e-05, + "grad_norm": 3.814037561416626, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8810504674911499, + "num_tokens": 420885833.0, + "step": 11028 + }, + { + "epoch": 1.403002162574736, + "ewc_loss": 0.007761427201330662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.761426968500018e-05, + "grad_norm": 3.928271532058716, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8799487948417664, + "num_tokens": 420916313.0, + "step": 11029 + }, + { + "epoch": 1.4031293728533265, + "ewc_loss": 0.007829863578081131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.829863898223266e-05, + "grad_norm": 3.848025321960449, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8573974370956421, + "num_tokens": 420957011.0, + "step": 11030 + }, + { + "epoch": 1.403256583131917, + "ewc_loss": 0.007732181344181299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.732181256869808e-05, + "grad_norm": 3.79419207572937, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8783641457557678, + "num_tokens": 420994815.0, + "step": 11031 + }, + { + "epoch": 1.4033837934105076, + "ewc_loss": 0.007745884824544191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.745884795440361e-05, + "grad_norm": 3.823385238647461, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.881258487701416, + "num_tokens": 421033977.0, + "step": 11032 + }, + { + "epoch": 1.403511003689098, + "ewc_loss": 0.007770091760903597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.770091906422749e-05, + "grad_norm": 3.825725555419922, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8792890906333923, + "num_tokens": 421068871.0, + "step": 11033 + }, + { + "epoch": 
1.4036382139676886, + "ewc_loss": 0.007769836578518152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.769836520310491e-05, + "grad_norm": 3.8575570583343506, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8757359981536865, + "num_tokens": 421105680.0, + "step": 11034 + }, + { + "epoch": 1.4037654242462791, + "ewc_loss": 0.007768484763801098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.768484647385776e-05, + "grad_norm": 3.7614216804504395, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8869988322257996, + "num_tokens": 421149519.0, + "step": 11035 + }, + { + "epoch": 1.4038926345248697, + "ewc_loss": 0.007716882973909378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716882828390226e-05, + "grad_norm": 3.9007568359375, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8761380910873413, + "num_tokens": 421190195.0, + "step": 11036 + }, + { + "epoch": 1.40401984480346, + "ewc_loss": 0.007808792870491743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.808792724972591e-05, + "grad_norm": 3.899444341659546, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8717641234397888, + "num_tokens": 421218637.0, + "step": 11037 + }, + { + "epoch": 1.4041470550820505, + "ewc_loss": 0.007749113254249096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.749113137833774e-05, + "grad_norm": 3.8059539794921875, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8802176713943481, + "num_tokens": 421261300.0, + "step": 11038 + }, + { + "epoch": 1.404274265360641, + "ewc_loss": 0.007713041268289089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.713041122769937e-05, + "grad_norm": 3.8296682834625244, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8714324235916138, + "num_tokens": 421297250.0, + "step": 11039 + }, + { + "epoch": 1.4044014756392316, + "ewc_loss": 0.007776570972055197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.77657114667818e-05, + "grad_norm": 3.8281257152557373, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8694474101066589, + "num_tokens": 421334213.0, + "step": 11040 + }, + { + "epoch": 1.404528685917822, + "ewc_loss": 0.007745393551886082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.745393668301404e-05, + "grad_norm": 3.847731828689575, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8820374608039856, + "num_tokens": 421368341.0, + "step": 11041 + }, + { + "epoch": 1.4046558961964126, + "ewc_loss": 0.00775476498529315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.754765101708472e-05, + "grad_norm": 3.8209333419799805, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8643555641174316, + "num_tokens": 421408278.0, + "step": 11042 + }, + { + "epoch": 1.4047831064750032, + "ewc_loss": 0.007766715716570616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.766715862089768e-05, + "grad_norm": 3.8878841400146484, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8676907420158386, + "num_tokens": 421442559.0, + "step": 11043 + }, + { + "epoch": 1.4049103167535937, + "ewc_loss": 0.007803286425769329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.803286280250177e-05, + "grad_norm": 3.863048553466797, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8751324415206909, + "num_tokens": 421474014.0, + "step": 11044 + }, + { + "epoch": 1.4050375270321842, + "ewc_loss": 0.007774199824780226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.774199912091717e-05, + "grad_norm": 3.8791353702545166, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.850355863571167, + "num_tokens": 421511193.0, + "step": 11045 + }, + { + "epoch": 1.4051647373107747, + "ewc_loss": 0.00780095811933279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.800957973813638e-05, + "grad_norm": 3.801669120788574, + "learning_rate": 1e-06, + "loss": 0.3567, + 
"mean_token_accuracy": 0.8755983114242554, + "num_tokens": 421547895.0, + "step": 11046 + }, + { + "epoch": 1.4052919475893653, + "ewc_loss": 0.00777885876595974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.778858707752079e-05, + "grad_norm": 3.8825759887695312, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8747150897979736, + "num_tokens": 421582883.0, + "step": 11047 + }, + { + "epoch": 1.4054191578679558, + "ewc_loss": 0.007852012291550636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.85201191320084e-05, + "grad_norm": 3.7959117889404297, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8773605823516846, + "num_tokens": 421619838.0, + "step": 11048 + }, + { + "epoch": 1.4055463681465463, + "ewc_loss": 0.007761651184409857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.761651067994535e-05, + "grad_norm": 3.850101947784424, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8842886686325073, + "num_tokens": 421659451.0, + "step": 11049 + }, + { + "epoch": 1.4056735784251368, + "ewc_loss": 0.007853270508348942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.853270653868094e-05, + "grad_norm": 3.8110764026641846, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8726053237915039, + "num_tokens": 421704826.0, + "step": 11050 + }, + { + "epoch": 1.4058007887037274, + "ewc_loss": 0.0077986884862184525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.798688602633774e-05, + "grad_norm": 3.8126275539398193, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8712620735168457, + "num_tokens": 421748953.0, + "step": 11051 + }, + { + "epoch": 1.405927998982318, + "ewc_loss": 0.007799448445439339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.799448212608695e-05, + "grad_norm": 3.860084295272827, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8577092885971069, + "num_tokens": 421786614.0, + "step": 11052 + }, + { + "epoch": 
1.4060552092609082, + "ewc_loss": 0.00784641969949007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.846419612178579e-05, + "grad_norm": 3.921764850616455, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8703566193580627, + "num_tokens": 421817291.0, + "step": 11053 + }, + { + "epoch": 1.4061824195394987, + "ewc_loss": 0.007838764227926731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.838763849576935e-05, + "grad_norm": 3.848132610321045, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8720974922180176, + "num_tokens": 421858200.0, + "step": 11054 + }, + { + "epoch": 1.4063096298180893, + "ewc_loss": 0.00778609374538064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.786093920003623e-05, + "grad_norm": 3.816434621810913, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8741937875747681, + "num_tokens": 421903343.0, + "step": 11055 + }, + { + "epoch": 1.4064368400966798, + "ewc_loss": 0.007799983024597168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.799982995493338e-05, + "grad_norm": 3.837237596511841, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8902310132980347, + "num_tokens": 421941426.0, + "step": 11056 + }, + { + "epoch": 1.4065640503752703, + "ewc_loss": 0.007804990280419588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.804990309523419e-05, + "grad_norm": 3.8241615295410156, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.871494710445404, + "num_tokens": 421983992.0, + "step": 11057 + }, + { + "epoch": 1.4066912606538609, + "ewc_loss": 0.007791050244122744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.791050302330405e-05, + "grad_norm": 3.881976842880249, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8836860656738281, + "num_tokens": 422015066.0, + "step": 11058 + }, + { + "epoch": 1.4068184709324514, + "ewc_loss": 0.007832800038158894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.83280047471635e-05, + "grad_norm": 3.8337368965148926, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8760440349578857, + "num_tokens": 422052392.0, + "step": 11059 + }, + { + "epoch": 1.406945681211042, + "ewc_loss": 0.007764027453958988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.764027395751327e-05, + "grad_norm": 3.913908004760742, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8699873685836792, + "num_tokens": 422091453.0, + "step": 11060 + }, + { + "epoch": 1.4070728914896324, + "ewc_loss": 0.007821128703653812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.821129111107439e-05, + "grad_norm": 3.870105504989624, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8590256571769714, + "num_tokens": 422131019.0, + "step": 11061 + }, + { + "epoch": 1.4072001017682227, + "ewc_loss": 0.007766854017972946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.766854105284438e-05, + "grad_norm": 3.8180978298187256, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8691670298576355, + "num_tokens": 422172139.0, + "step": 11062 + }, + { + "epoch": 1.4073273120468133, + "ewc_loss": 0.0077593266032636166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.759326399536803e-05, + "grad_norm": 3.824826717376709, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8792359232902527, + "num_tokens": 422209708.0, + "step": 11063 + }, + { + "epoch": 1.4074545223254038, + "ewc_loss": 0.0077660297974944115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76602973928675e-05, + "grad_norm": 3.8523552417755127, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8758739233016968, + "num_tokens": 422245091.0, + "step": 11064 + }, + { + "epoch": 1.4075817326039943, + "ewc_loss": 0.007785458583384752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.785458728903905e-05, + "grad_norm": 3.9026172161102295, + "learning_rate": 1e-06, + "loss": 0.3793, + 
"mean_token_accuracy": 0.8733581304550171, + "num_tokens": 422280054.0, + "step": 11065 + }, + { + "epoch": 1.4077089428825849, + "ewc_loss": 0.007798288948833942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.798289152560756e-05, + "grad_norm": 3.916329860687256, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8800815343856812, + "num_tokens": 422310036.0, + "step": 11066 + }, + { + "epoch": 1.4078361531611754, + "ewc_loss": 0.007789939176291227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.789939263602719e-05, + "grad_norm": 3.8811886310577393, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8583402633666992, + "num_tokens": 422347914.0, + "step": 11067 + }, + { + "epoch": 1.407963363439766, + "ewc_loss": 0.007786037400364876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.786037167534232e-05, + "grad_norm": 3.8706376552581787, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8778581619262695, + "num_tokens": 422383207.0, + "step": 11068 + }, + { + "epoch": 1.4080905737183564, + "ewc_loss": 0.007805700413882732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.805700442986563e-05, + "grad_norm": 3.8582451343536377, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8790603876113892, + "num_tokens": 422414675.0, + "step": 11069 + }, + { + "epoch": 1.408217783996947, + "ewc_loss": 0.007793409749865532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.793409895384684e-05, + "grad_norm": 3.852102518081665, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8640433549880981, + "num_tokens": 422449746.0, + "step": 11070 + }, + { + "epoch": 1.4083449942755375, + "ewc_loss": 0.00779390474781394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.793904660502449e-05, + "grad_norm": 3.8430933952331543, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8686574101448059, + "num_tokens": 422487789.0, + "step": 11071 + }, + { + "epoch": 
1.408472204554128, + "ewc_loss": 0.007811977993696928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.81197813921608e-05, + "grad_norm": 3.8293263912200928, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8808306455612183, + "num_tokens": 422521750.0, + "step": 11072 + }, + { + "epoch": 1.4085994148327186, + "ewc_loss": 0.0078118029050529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.811802788637578e-05, + "grad_norm": 3.8009579181671143, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8833141922950745, + "num_tokens": 422562923.0, + "step": 11073 + }, + { + "epoch": 1.408726625111309, + "ewc_loss": 0.007786495611071587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.786495552863926e-05, + "grad_norm": 3.825216054916382, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8646520376205444, + "num_tokens": 422606832.0, + "step": 11074 + }, + { + "epoch": 1.4088538353898996, + "ewc_loss": 0.00781701784580946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.817017467459664e-05, + "grad_norm": 3.862722396850586, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8788802623748779, + "num_tokens": 422644465.0, + "step": 11075 + }, + { + "epoch": 1.4089810456684901, + "ewc_loss": 0.007819806225597858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.81980634201318e-05, + "grad_norm": 3.8394083976745605, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8836408257484436, + "num_tokens": 422681204.0, + "step": 11076 + }, + { + "epoch": 1.4091082559470804, + "ewc_loss": 0.007803113665431738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.80311384005472e-05, + "grad_norm": 3.892371892929077, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8640921115875244, + "num_tokens": 422716325.0, + "step": 11077 + }, + { + "epoch": 1.409235466225671, + "ewc_loss": 0.007851764559745789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.851764530641958e-05, + "grad_norm": 3.8444511890411377, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8651747107505798, + "num_tokens": 422753644.0, + "step": 11078 + }, + { + "epoch": 1.4093626765042615, + "ewc_loss": 0.007788727059960365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.788727089064196e-05, + "grad_norm": 3.9224870204925537, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8739495277404785, + "num_tokens": 422790893.0, + "step": 11079 + }, + { + "epoch": 1.409489886782852, + "ewc_loss": 0.007859127596020699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.859127799747512e-05, + "grad_norm": 3.848825216293335, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.866378903388977, + "num_tokens": 422829400.0, + "step": 11080 + }, + { + "epoch": 1.4096170970614426, + "ewc_loss": 0.007796364836394787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.796364661771804e-05, + "grad_norm": 3.836536407470703, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8834430575370789, + "num_tokens": 422867386.0, + "step": 11081 + }, + { + "epoch": 1.409744307340033, + "ewc_loss": 0.007808671798557043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.808671944076195e-05, + "grad_norm": 3.7848031520843506, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8756641149520874, + "num_tokens": 422910736.0, + "step": 11082 + }, + { + "epoch": 1.4098715176186236, + "ewc_loss": 0.007797693368047476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.797693251632154e-05, + "grad_norm": 3.8361377716064453, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8805454969406128, + "num_tokens": 422950302.0, + "step": 11083 + }, + { + "epoch": 1.4099987278972141, + "ewc_loss": 0.007845012471079826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.845012441975996e-05, + "grad_norm": 3.8874030113220215, + "learning_rate": 1e-06, + "loss": 0.3742, + 
"mean_token_accuracy": 0.874384880065918, + "num_tokens": 422985892.0, + "step": 11084 + }, + { + "epoch": 1.4101259381758047, + "ewc_loss": 0.007840179838240147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.840179750928655e-05, + "grad_norm": 3.8632287979125977, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8672629594802856, + "num_tokens": 423020075.0, + "step": 11085 + }, + { + "epoch": 1.410253148454395, + "ewc_loss": 0.007825399748980999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825400098226964e-05, + "grad_norm": 3.863837242126465, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8771933913230896, + "num_tokens": 423058260.0, + "step": 11086 + }, + { + "epoch": 1.4103803587329855, + "ewc_loss": 0.007820849306881428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.820849714335054e-05, + "grad_norm": 3.8175501823425293, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8966149687767029, + "num_tokens": 423092158.0, + "step": 11087 + }, + { + "epoch": 1.410507569011576, + "ewc_loss": 0.0077970800921320915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.797079888405278e-05, + "grad_norm": 3.9233641624450684, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8728662729263306, + "num_tokens": 423123999.0, + "step": 11088 + }, + { + "epoch": 1.4106347792901666, + "ewc_loss": 0.007871406152844429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.871405978221446e-05, + "grad_norm": 3.8264167308807373, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8712418675422668, + "num_tokens": 423167111.0, + "step": 11089 + }, + { + "epoch": 1.410761989568757, + "ewc_loss": 0.007784626912325621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.784627086948603e-05, + "grad_norm": 3.8816800117492676, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8676812052726746, + "num_tokens": 423202060.0, + "step": 11090 + }, + { + "epoch": 
1.4108891998473476, + "ewc_loss": 0.007854803465306759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.854803698137403e-05, + "grad_norm": 3.866379499435425, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8670393228530884, + "num_tokens": 423241200.0, + "step": 11091 + }, + { + "epoch": 1.4110164101259381, + "ewc_loss": 0.007815010845661163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.815010758349672e-05, + "grad_norm": 3.7688381671905518, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8829198479652405, + "num_tokens": 423284713.0, + "step": 11092 + }, + { + "epoch": 1.4111436204045287, + "ewc_loss": 0.0077935256995260715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.79352558311075e-05, + "grad_norm": 3.8896710872650146, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8770701885223389, + "num_tokens": 423322524.0, + "step": 11093 + }, + { + "epoch": 1.4112708306831192, + "ewc_loss": 0.007877054624259472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.877055031713098e-05, + "grad_norm": 3.8165762424468994, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8760230541229248, + "num_tokens": 423360551.0, + "step": 11094 + }, + { + "epoch": 1.4113980409617097, + "ewc_loss": 0.00778276426717639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.782764441799372e-05, + "grad_norm": 3.8496110439300537, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.884958028793335, + "num_tokens": 423395227.0, + "step": 11095 + }, + { + "epoch": 1.4115252512403003, + "ewc_loss": 0.007833009585738182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.833010022295639e-05, + "grad_norm": 3.8725874423980713, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8608357906341553, + "num_tokens": 423434757.0, + "step": 11096 + }, + { + "epoch": 1.4116524615188908, + "ewc_loss": 0.007817014120519161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.817014557076618e-05, + "grad_norm": 3.9250669479370117, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8911725878715515, + "num_tokens": 423464599.0, + "step": 11097 + }, + { + "epoch": 1.4117796717974813, + "ewc_loss": 0.007850701920688152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.850701513234526e-05, + "grad_norm": 3.8849527835845947, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8694842457771301, + "num_tokens": 423504327.0, + "step": 11098 + }, + { + "epoch": 1.4119068820760718, + "ewc_loss": 0.007814697921276093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.814697892172262e-05, + "grad_norm": 3.895359516143799, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8548365831375122, + "num_tokens": 423543114.0, + "step": 11099 + }, + { + "epoch": 1.4120340923546624, + "ewc_loss": 0.00784026924520731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84026924520731e-05, + "grad_norm": 3.834235668182373, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8613846898078918, + "num_tokens": 423581727.0, + "step": 11100 + }, + { + "epoch": 1.412161302633253, + "ewc_loss": 0.007795909885317087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.795909914420918e-05, + "grad_norm": 3.872577428817749, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8773163557052612, + "num_tokens": 423617073.0, + "step": 11101 + }, + { + "epoch": 1.4122885129118432, + "ewc_loss": 0.007866491563618183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.866491796448827e-05, + "grad_norm": 3.753157377243042, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8767924904823303, + "num_tokens": 423664922.0, + "step": 11102 + }, + { + "epoch": 1.4124157231904337, + "ewc_loss": 0.0077544464729726315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.75444641476497e-05, + "grad_norm": 3.8342714309692383, + "learning_rate": 1e-06, + "loss": 0.3846, + 
"mean_token_accuracy": 0.8670094609260559, + "num_tokens": 423706990.0, + "step": 11103 + }, + { + "epoch": 1.4125429334690243, + "ewc_loss": 0.007870538160204887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.870537956478074e-05, + "grad_norm": 3.899285078048706, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.85798180103302, + "num_tokens": 423745945.0, + "step": 11104 + }, + { + "epoch": 1.4126701437476148, + "ewc_loss": 0.00785770732909441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.857707532821223e-05, + "grad_norm": 3.8623247146606445, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8806403279304504, + "num_tokens": 423780824.0, + "step": 11105 + }, + { + "epoch": 1.4127973540262053, + "ewc_loss": 0.007800467777997255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.800467574270442e-05, + "grad_norm": 3.800708770751953, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8846659660339355, + "num_tokens": 423818640.0, + "step": 11106 + }, + { + "epoch": 1.4129245643047958, + "ewc_loss": 0.007781392894685268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.781392923789099e-05, + "grad_norm": 3.8764266967773438, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.884371280670166, + "num_tokens": 423856657.0, + "step": 11107 + }, + { + "epoch": 1.4130517745833864, + "ewc_loss": 0.007839635014533997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.839634781703353e-05, + "grad_norm": 3.763282060623169, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8874067068099976, + "num_tokens": 423900288.0, + "step": 11108 + }, + { + "epoch": 1.413178984861977, + "ewc_loss": 0.007741331588476896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.741331501165405e-05, + "grad_norm": 3.8504371643066406, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.882355809211731, + "num_tokens": 423935929.0, + "step": 11109 + }, + { + "epoch": 
1.4133061951405674, + "ewc_loss": 0.007827632129192352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.827632362022996e-05, + "grad_norm": 3.9014761447906494, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8720124363899231, + "num_tokens": 423970719.0, + "step": 11110 + }, + { + "epoch": 1.4134334054191577, + "ewc_loss": 0.0078073181211948395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.807317888364196e-05, + "grad_norm": 3.8726613521575928, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.875510573387146, + "num_tokens": 424012338.0, + "step": 11111 + }, + { + "epoch": 1.4135606156977483, + "ewc_loss": 0.007786491885781288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.786491914885119e-05, + "grad_norm": 3.8549439907073975, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.858392596244812, + "num_tokens": 424050112.0, + "step": 11112 + }, + { + "epoch": 1.4136878259763388, + "ewc_loss": 0.00777745945379138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.777459541102871e-05, + "grad_norm": 3.826824188232422, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8758069276809692, + "num_tokens": 424092688.0, + "step": 11113 + }, + { + "epoch": 1.4138150362549293, + "ewc_loss": 0.0077599165961146355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.759916479699314e-05, + "grad_norm": 3.8641300201416016, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8723956942558289, + "num_tokens": 424131355.0, + "step": 11114 + }, + { + "epoch": 1.4139422465335199, + "ewc_loss": 0.007799155544489622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.799155719112605e-05, + "grad_norm": 3.79284930229187, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8653038740158081, + "num_tokens": 424177768.0, + "step": 11115 + }, + { + "epoch": 1.4140694568121104, + "ewc_loss": 0.007730240002274513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.730240031378344e-05, + "grad_norm": 3.8700451850891113, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8862867951393127, + "num_tokens": 424215834.0, + "step": 11116 + }, + { + "epoch": 1.414196667090701, + "ewc_loss": 0.007793426513671875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.793426630087197e-05, + "grad_norm": 3.834841012954712, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8777149319648743, + "num_tokens": 424260002.0, + "step": 11117 + }, + { + "epoch": 1.4143238773692914, + "ewc_loss": 0.007733487524092197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.733487291261554e-05, + "grad_norm": 3.860180616378784, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8747507333755493, + "num_tokens": 424299343.0, + "step": 11118 + }, + { + "epoch": 1.414451087647882, + "ewc_loss": 0.007755116559565067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.755116530461237e-05, + "grad_norm": 3.788045883178711, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8694849610328674, + "num_tokens": 424340312.0, + "step": 11119 + }, + { + "epoch": 1.4145782979264725, + "ewc_loss": 0.007719846908003092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.719847053522244e-05, + "grad_norm": 3.8565027713775635, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.881564736366272, + "num_tokens": 424376658.0, + "step": 11120 + }, + { + "epoch": 1.414705508205063, + "ewc_loss": 0.007772859651595354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.772859680699185e-05, + "grad_norm": 3.839970588684082, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8857187032699585, + "num_tokens": 424415499.0, + "step": 11121 + }, + { + "epoch": 1.4148327184836536, + "ewc_loss": 0.007724462077021599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.72446219343692e-05, + "grad_norm": 3.8545517921447754, + "learning_rate": 1e-06, + "loss": 0.3606, + 
"mean_token_accuracy": 0.8764554858207703, + "num_tokens": 424453401.0, + "step": 11122 + }, + { + "epoch": 1.414959928762244, + "ewc_loss": 0.007752298377454281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.752298552077264e-05, + "grad_norm": 3.862122058868408, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8786278963088989, + "num_tokens": 424490654.0, + "step": 11123 + }, + { + "epoch": 1.4150871390408346, + "ewc_loss": 0.007737161125987768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.737160922260955e-05, + "grad_norm": 3.8092031478881836, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8829610347747803, + "num_tokens": 424534469.0, + "step": 11124 + }, + { + "epoch": 1.4152143493194251, + "ewc_loss": 0.007698277477174997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.698277477174997e-05, + "grad_norm": 3.910783529281616, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8661304712295532, + "num_tokens": 424570187.0, + "step": 11125 + }, + { + "epoch": 1.4153415595980154, + "ewc_loss": 0.007789625786244869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.789625669829547e-05, + "grad_norm": 3.885483980178833, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8579508662223816, + "num_tokens": 424607898.0, + "step": 11126 + }, + { + "epoch": 1.415468769876606, + "ewc_loss": 0.007726844400167465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.726844341959804e-05, + "grad_norm": 3.819131851196289, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8763703107833862, + "num_tokens": 424652156.0, + "step": 11127 + }, + { + "epoch": 1.4155959801551965, + "ewc_loss": 0.007717417553067207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.717417611274868e-05, + "grad_norm": 3.8752822875976562, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8666179776191711, + "num_tokens": 424688735.0, + "step": 11128 + }, + { + "epoch": 
1.415723190433787, + "ewc_loss": 0.00777036277577281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.770362572045997e-05, + "grad_norm": 3.8213372230529785, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8771133422851562, + "num_tokens": 424732859.0, + "step": 11129 + }, + { + "epoch": 1.4158504007123776, + "ewc_loss": 0.007696453481912613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.696453394601122e-05, + "grad_norm": 3.8305652141571045, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8675751686096191, + "num_tokens": 424769381.0, + "step": 11130 + }, + { + "epoch": 1.415977610990968, + "ewc_loss": 0.00774416932836175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.744169124634936e-05, + "grad_norm": 3.8783721923828125, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8768830895423889, + "num_tokens": 424802362.0, + "step": 11131 + }, + { + "epoch": 1.4161048212695586, + "ewc_loss": 0.0077694314531981945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.769431249471381e-05, + "grad_norm": 3.863286256790161, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8730109930038452, + "num_tokens": 424837897.0, + "step": 11132 + }, + { + "epoch": 1.4162320315481491, + "ewc_loss": 0.0077427104115486145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.742710295133293e-05, + "grad_norm": 3.80306077003479, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.886353611946106, + "num_tokens": 424878154.0, + "step": 11133 + }, + { + "epoch": 1.4163592418267397, + "ewc_loss": 0.007722153328359127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.72215353208594e-05, + "grad_norm": 3.8345706462860107, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8805935382843018, + "num_tokens": 424915063.0, + "step": 11134 + }, + { + "epoch": 1.41648645210533, + "ewc_loss": 0.00777527317404747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.775273115839809e-05, + "grad_norm": 3.849015474319458, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.873525857925415, + "num_tokens": 424955159.0, + "step": 11135 + }, + { + "epoch": 1.4166136623839205, + "ewc_loss": 0.007774850353598595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.774850382702425e-05, + "grad_norm": 3.8798015117645264, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8658874034881592, + "num_tokens": 424992573.0, + "step": 11136 + }, + { + "epoch": 1.416740872662511, + "ewc_loss": 0.007787201553583145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.787201320752501e-05, + "grad_norm": 3.867058753967285, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8791320323944092, + "num_tokens": 425030452.0, + "step": 11137 + }, + { + "epoch": 1.4168680829411016, + "ewc_loss": 0.007767853792756796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.767853821860626e-05, + "grad_norm": 3.819876194000244, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8879899978637695, + "num_tokens": 425067118.0, + "step": 11138 + }, + { + "epoch": 1.416995293219692, + "ewc_loss": 0.007761303335428238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.761303277220577e-05, + "grad_norm": 3.9135115146636963, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8662692308425903, + "num_tokens": 425101684.0, + "step": 11139 + }, + { + "epoch": 1.4171225034982826, + "ewc_loss": 0.007829580456018448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.829580863472074e-05, + "grad_norm": 3.8188791275024414, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.884323239326477, + "num_tokens": 425143127.0, + "step": 11140 + }, + { + "epoch": 1.4172497137768731, + "ewc_loss": 0.007741798646748066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.741798617644235e-05, + "grad_norm": 3.9301536083221436, + "learning_rate": 1e-06, + "loss": 0.3602, + 
"mean_token_accuracy": 0.8769103288650513, + "num_tokens": 425173912.0, + "step": 11141 + }, + { + "epoch": 1.4173769240554637, + "ewc_loss": 0.007829667069017887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.829667447367683e-05, + "grad_norm": 3.8742897510528564, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8794578313827515, + "num_tokens": 425210146.0, + "step": 11142 + }, + { + "epoch": 1.4175041343340542, + "ewc_loss": 0.00776672875508666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.766728958813474e-05, + "grad_norm": 3.819891929626465, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.887942910194397, + "num_tokens": 425246082.0, + "step": 11143 + }, + { + "epoch": 1.4176313446126447, + "ewc_loss": 0.007780448533594608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.780448504490778e-05, + "grad_norm": 3.826235771179199, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8723443746566772, + "num_tokens": 425287470.0, + "step": 11144 + }, + { + "epoch": 1.4177585548912353, + "ewc_loss": 0.0078077418729662895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.807742076693103e-05, + "grad_norm": 3.829054355621338, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8720253109931946, + "num_tokens": 425326553.0, + "step": 11145 + }, + { + "epoch": 1.4178857651698258, + "ewc_loss": 0.007798684295266867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.798684237059206e-05, + "grad_norm": 3.896115303039551, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8686546087265015, + "num_tokens": 425363105.0, + "step": 11146 + }, + { + "epoch": 1.4180129754484163, + "ewc_loss": 0.007832804694771767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.832804840290919e-05, + "grad_norm": 3.8210439682006836, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8685046434402466, + "num_tokens": 425402513.0, + "step": 11147 + }, + { + "epoch": 
1.4181401857270068, + "ewc_loss": 0.007784595247358084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.784595072735101e-05, + "grad_norm": 3.8370707035064697, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.875174880027771, + "num_tokens": 425440132.0, + "step": 11148 + }, + { + "epoch": 1.4182673960055974, + "ewc_loss": 0.007825225591659546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825225475244224e-05, + "grad_norm": 3.8077423572540283, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8735312223434448, + "num_tokens": 425481738.0, + "step": 11149 + }, + { + "epoch": 1.418394606284188, + "ewc_loss": 0.0078041586093604565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.804158667568117e-05, + "grad_norm": 3.9085962772369385, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8752150535583496, + "num_tokens": 425516916.0, + "step": 11150 + }, + { + "epoch": 1.4185218165627782, + "ewc_loss": 0.007857246324419975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.857246237108484e-05, + "grad_norm": 3.838653802871704, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8738815784454346, + "num_tokens": 425555951.0, + "step": 11151 + }, + { + "epoch": 1.4186490268413687, + "ewc_loss": 0.007779885549098253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.77988534537144e-05, + "grad_norm": 3.8621559143066406, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8765814304351807, + "num_tokens": 425595302.0, + "step": 11152 + }, + { + "epoch": 1.4187762371199593, + "ewc_loss": 0.007839115336537361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8391152783297e-05, + "grad_norm": 3.939572811126709, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8789148330688477, + "num_tokens": 425623281.0, + "step": 11153 + }, + { + "epoch": 1.4189034473985498, + "ewc_loss": 0.007863836362957954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.86383607191965e-05, + "grad_norm": 3.831109046936035, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8828498125076294, + "num_tokens": 425659124.0, + "step": 11154 + }, + { + "epoch": 1.4190306576771403, + "ewc_loss": 0.007778776343911886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.778776489431038e-05, + "grad_norm": 3.8369603157043457, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8627139329910278, + "num_tokens": 425695809.0, + "step": 11155 + }, + { + "epoch": 1.4191578679557308, + "ewc_loss": 0.007847198285162449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.847198139643297e-05, + "grad_norm": 3.8577775955200195, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8653578758239746, + "num_tokens": 425731890.0, + "step": 11156 + }, + { + "epoch": 1.4192850782343214, + "ewc_loss": 0.007826479151844978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.826479122741148e-05, + "grad_norm": 3.8346118927001953, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.882805585861206, + "num_tokens": 425770899.0, + "step": 11157 + }, + { + "epoch": 1.419412288512912, + "ewc_loss": 0.007817192934453487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.817192818038166e-05, + "grad_norm": 3.9606809616088867, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8851876258850098, + "num_tokens": 425807812.0, + "step": 11158 + }, + { + "epoch": 1.4195394987915024, + "ewc_loss": 0.007901504635810852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.901504432084039e-05, + "grad_norm": 3.816455841064453, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8731290102005005, + "num_tokens": 425848997.0, + "step": 11159 + }, + { + "epoch": 1.4196667090700927, + "ewc_loss": 0.00777221005409956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.772209937684238e-05, + "grad_norm": 3.765333890914917, + "learning_rate": 1e-06, + "loss": 0.3308, + 
"mean_token_accuracy": 0.8851572275161743, + "num_tokens": 425887816.0, + "step": 11160 + }, + { + "epoch": 1.4197939193486833, + "ewc_loss": 0.007795214187353849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.795214332873002e-05, + "grad_norm": 3.873073101043701, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8662782311439514, + "num_tokens": 425925488.0, + "step": 11161 + }, + { + "epoch": 1.4199211296272738, + "ewc_loss": 0.007875005714595318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.875005394453183e-05, + "grad_norm": 3.818495035171509, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8820984959602356, + "num_tokens": 425965616.0, + "step": 11162 + }, + { + "epoch": 1.4200483399058643, + "ewc_loss": 0.007781929802149534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.781929889461026e-05, + "grad_norm": 3.8368258476257324, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8688137531280518, + "num_tokens": 426005308.0, + "step": 11163 + }, + { + "epoch": 1.4201755501844548, + "ewc_loss": 0.007814813405275345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.814813579898328e-05, + "grad_norm": 3.872941255569458, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8726253509521484, + "num_tokens": 426042339.0, + "step": 11164 + }, + { + "epoch": 1.4203027604630454, + "ewc_loss": 0.00782072450965643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.82072456786409e-05, + "grad_norm": 3.8410537242889404, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8741142749786377, + "num_tokens": 426081925.0, + "step": 11165 + }, + { + "epoch": 1.420429970741636, + "ewc_loss": 0.00780066242441535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.800662569934502e-05, + "grad_norm": 3.811182737350464, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8690440654754639, + "num_tokens": 426120918.0, + "step": 11166 + }, + { + "epoch": 
1.4205571810202264, + "ewc_loss": 0.007785037625581026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.785037450958043e-05, + "grad_norm": 3.814295530319214, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8638825416564941, + "num_tokens": 426158711.0, + "step": 11167 + }, + { + "epoch": 1.420684391298817, + "ewc_loss": 0.007811243180185556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.811243267497048e-05, + "grad_norm": 3.86067271232605, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8700840473175049, + "num_tokens": 426197034.0, + "step": 11168 + }, + { + "epoch": 1.4208116015774075, + "ewc_loss": 0.007841374725103378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.841374463168904e-05, + "grad_norm": 3.7955567836761475, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8675851225852966, + "num_tokens": 426239390.0, + "step": 11169 + }, + { + "epoch": 1.420938811855998, + "ewc_loss": 0.007784891873598099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78489193180576e-05, + "grad_norm": 3.7993385791778564, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8665245771408081, + "num_tokens": 426285488.0, + "step": 11170 + }, + { + "epoch": 1.4210660221345885, + "ewc_loss": 0.007802675012499094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.802675099810585e-05, + "grad_norm": 3.8574798107147217, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8616631031036377, + "num_tokens": 426323955.0, + "step": 11171 + }, + { + "epoch": 1.421193232413179, + "ewc_loss": 0.007826379500329494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.826379442121834e-05, + "grad_norm": 3.8682961463928223, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8924444317817688, + "num_tokens": 426359460.0, + "step": 11172 + }, + { + "epoch": 1.4213204426917696, + "ewc_loss": 0.007796617690473795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.796617865096778e-05, + "grad_norm": 3.8630897998809814, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8665282130241394, + "num_tokens": 426397104.0, + "step": 11173 + }, + { + "epoch": 1.4214476529703601, + "ewc_loss": 0.007803412154316902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.803412154316902e-05, + "grad_norm": 3.7869651317596436, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8839347958564758, + "num_tokens": 426441349.0, + "step": 11174 + }, + { + "epoch": 1.4215748632489504, + "ewc_loss": 0.00775762228295207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.757622370263562e-05, + "grad_norm": 3.8151400089263916, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8822709321975708, + "num_tokens": 426478410.0, + "step": 11175 + }, + { + "epoch": 1.421702073527541, + "ewc_loss": 0.007794942241162062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.794942212058231e-05, + "grad_norm": 3.7814414501190186, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8736051321029663, + "num_tokens": 426519874.0, + "step": 11176 + }, + { + "epoch": 1.4218292838061315, + "ewc_loss": 0.007778620813041925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.778620783938095e-05, + "grad_norm": 3.8641529083251953, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8637514114379883, + "num_tokens": 426559970.0, + "step": 11177 + }, + { + "epoch": 1.421956494084722, + "ewc_loss": 0.0078070336021482944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.807033398421481e-05, + "grad_norm": 3.858473539352417, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8763008117675781, + "num_tokens": 426593897.0, + "step": 11178 + }, + { + "epoch": 1.4220837043633126, + "ewc_loss": 0.007791619282215834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.791619282215834e-05, + "grad_norm": 3.834789752960205, + "learning_rate": 1e-06, + "loss": 0.3713, + 
"mean_token_accuracy": 0.8753427863121033, + "num_tokens": 426633712.0, + "step": 11179 + }, + { + "epoch": 1.422210914641903, + "ewc_loss": 0.007772814482450485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.772814569761977e-05, + "grad_norm": 3.8393502235412598, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8881092667579651, + "num_tokens": 426669085.0, + "step": 11180 + }, + { + "epoch": 1.4223381249204936, + "ewc_loss": 0.0077915070578455925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.791507232468575e-05, + "grad_norm": 3.820873975753784, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8861579895019531, + "num_tokens": 426709260.0, + "step": 11181 + }, + { + "epoch": 1.4224653351990841, + "ewc_loss": 0.007766122929751873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.766122871544212e-05, + "grad_norm": 3.8478620052337646, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8635064363479614, + "num_tokens": 426747747.0, + "step": 11182 + }, + { + "epoch": 1.4225925454776747, + "ewc_loss": 0.0077920216135680676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.792021642671898e-05, + "grad_norm": 3.8758785724639893, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8615964651107788, + "num_tokens": 426788185.0, + "step": 11183 + }, + { + "epoch": 1.422719755756265, + "ewc_loss": 0.007792547810822725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.792547694407403e-05, + "grad_norm": 3.826864004135132, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8759437799453735, + "num_tokens": 426828663.0, + "step": 11184 + }, + { + "epoch": 1.4228469660348555, + "ewc_loss": 0.007742561865597963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.742561865597963e-05, + "grad_norm": 3.8446059226989746, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8658746480941772, + "num_tokens": 426869205.0, + "step": 11185 + }, + { + "epoch": 
1.422974176313446, + "ewc_loss": 0.007771878037601709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.771878154017031e-05, + "grad_norm": 3.8670530319213867, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8894345164299011, + "num_tokens": 426900561.0, + "step": 11186 + }, + { + "epoch": 1.4231013865920366, + "ewc_loss": 0.007773956749588251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.773956895107403e-05, + "grad_norm": 3.7763397693634033, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8895919322967529, + "num_tokens": 426940618.0, + "step": 11187 + }, + { + "epoch": 1.423228596870627, + "ewc_loss": 0.007712698541581631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.712698425166309e-05, + "grad_norm": 3.8860514163970947, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8667551279067993, + "num_tokens": 426975509.0, + "step": 11188 + }, + { + "epoch": 1.4233558071492176, + "ewc_loss": 0.007812484633177519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.812484545866027e-05, + "grad_norm": 3.8325960636138916, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8838778734207153, + "num_tokens": 427011647.0, + "step": 11189 + }, + { + "epoch": 1.4234830174278081, + "ewc_loss": 0.007742049638181925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.742049638181925e-05, + "grad_norm": 3.838813543319702, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8716809749603271, + "num_tokens": 427046631.0, + "step": 11190 + }, + { + "epoch": 1.4236102277063987, + "ewc_loss": 0.007765413261950016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.765413465676829e-05, + "grad_norm": 3.853165864944458, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.871290922164917, + "num_tokens": 427081364.0, + "step": 11191 + }, + { + "epoch": 1.4237374379849892, + "ewc_loss": 0.007769473362714052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.769473450025544e-05, + "grad_norm": 3.894500732421875, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8748636841773987, + "num_tokens": 427118667.0, + "step": 11192 + }, + { + "epoch": 1.4238646482635797, + "ewc_loss": 0.007801807951182127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.801807805662975e-05, + "grad_norm": 3.8236441612243652, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8802108764648438, + "num_tokens": 427155017.0, + "step": 11193 + }, + { + "epoch": 1.4239918585421703, + "ewc_loss": 0.00773377250880003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.73377250880003e-05, + "grad_norm": 3.847374439239502, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8682053089141846, + "num_tokens": 427191741.0, + "step": 11194 + }, + { + "epoch": 1.4241190688207608, + "ewc_loss": 0.007777463179081678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.777463179081678e-05, + "grad_norm": 3.8904974460601807, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8788416981697083, + "num_tokens": 427226210.0, + "step": 11195 + }, + { + "epoch": 1.4242462790993513, + "ewc_loss": 0.007809748873114586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.809748785803095e-05, + "grad_norm": 3.8648104667663574, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8722416758537292, + "num_tokens": 427261699.0, + "step": 11196 + }, + { + "epoch": 1.4243734893779418, + "ewc_loss": 0.00777843501418829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.778435247018933e-05, + "grad_norm": 3.880403518676758, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8644386529922485, + "num_tokens": 427297158.0, + "step": 11197 + }, + { + "epoch": 1.4245006996565324, + "ewc_loss": 0.007812745869159698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.812745752744377e-05, + "grad_norm": 3.7891759872436523, + "learning_rate": 1e-06, + "loss": 0.4076, + 
"mean_token_accuracy": 0.8599128723144531, + "num_tokens": 427338234.0, + "step": 11198 + }, + { + "epoch": 1.424627909935123, + "ewc_loss": 0.007787215057760477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.787215145071968e-05, + "grad_norm": 3.8791208267211914, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8678508996963501, + "num_tokens": 427376727.0, + "step": 11199 + }, + { + "epoch": 1.4247551202137132, + "ewc_loss": 0.007862299680709839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.862300117267296e-05, + "grad_norm": 3.874950408935547, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8866053819656372, + "num_tokens": 427417095.0, + "step": 11200 + }, + { + "epoch": 1.4248823304923037, + "ewc_loss": 0.007797053083777428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.797052967362106e-05, + "grad_norm": 3.8433828353881836, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8582149744033813, + "num_tokens": 427453243.0, + "step": 11201 + }, + { + "epoch": 1.4250095407708943, + "ewc_loss": 0.007822360843420029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.82236093073152e-05, + "grad_norm": 3.8370766639709473, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.878010630607605, + "num_tokens": 427490961.0, + "step": 11202 + }, + { + "epoch": 1.4251367510494848, + "ewc_loss": 0.00783019233494997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.830192771507427e-05, + "grad_norm": 3.857327461242676, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.883247971534729, + "num_tokens": 427521487.0, + "step": 11203 + }, + { + "epoch": 1.4252639613280753, + "ewc_loss": 0.007838616147637367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.838616147637367e-05, + "grad_norm": 3.8130149841308594, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8717827796936035, + "num_tokens": 427556607.0, + "step": 11204 + }, + { + "epoch": 
1.4253911716066658, + "ewc_loss": 0.007825266569852829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825266220606863e-05, + "grad_norm": 3.791865110397339, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8754779696464539, + "num_tokens": 427598940.0, + "step": 11205 + }, + { + "epoch": 1.4255183818852564, + "ewc_loss": 0.007818800397217274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.818800077075139e-05, + "grad_norm": 3.832106351852417, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8648878335952759, + "num_tokens": 427641337.0, + "step": 11206 + }, + { + "epoch": 1.425645592163847, + "ewc_loss": 0.007856309413909912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.856309821363539e-05, + "grad_norm": 3.8236193656921387, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8836020231246948, + "num_tokens": 427680235.0, + "step": 11207 + }, + { + "epoch": 1.4257728024424374, + "ewc_loss": 0.007825812324881554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825811917427927e-05, + "grad_norm": 3.8776447772979736, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8751681447029114, + "num_tokens": 427719918.0, + "step": 11208 + }, + { + "epoch": 1.4259000127210277, + "ewc_loss": 0.007866792380809784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.866792293498293e-05, + "grad_norm": 3.8280341625213623, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.87750643491745, + "num_tokens": 427758042.0, + "step": 11209 + }, + { + "epoch": 1.4260272229996183, + "ewc_loss": 0.00783477257937193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.834772986825556e-05, + "grad_norm": 3.8513238430023193, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8757097125053406, + "num_tokens": 427794591.0, + "step": 11210 + }, + { + "epoch": 1.4261544332782088, + "ewc_loss": 0.007836106233298779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.836106669856235e-05, + "grad_norm": 3.8738045692443848, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8837449550628662, + "num_tokens": 427828190.0, + "step": 11211 + }, + { + "epoch": 1.4262816435567993, + "ewc_loss": 0.007833621464669704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.83362120273523e-05, + "grad_norm": 3.8196611404418945, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8674947023391724, + "num_tokens": 427870363.0, + "step": 11212 + }, + { + "epoch": 1.4264088538353898, + "ewc_loss": 0.007814155891537666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.814155833330005e-05, + "grad_norm": 3.8313281536102295, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8889888525009155, + "num_tokens": 427910037.0, + "step": 11213 + }, + { + "epoch": 1.4265360641139804, + "ewc_loss": 0.007826424203813076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.826424553059042e-05, + "grad_norm": 3.876621723175049, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8736351728439331, + "num_tokens": 427945782.0, + "step": 11214 + }, + { + "epoch": 1.426663274392571, + "ewc_loss": 0.007850266993045807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.850267138564959e-05, + "grad_norm": 3.819108247756958, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8821085691452026, + "num_tokens": 427984635.0, + "step": 11215 + }, + { + "epoch": 1.4267904846711614, + "ewc_loss": 0.007783246226608753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.783246110193431e-05, + "grad_norm": 3.7625701427459717, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8905252814292908, + "num_tokens": 428024289.0, + "step": 11216 + }, + { + "epoch": 1.426917694949752, + "ewc_loss": 0.007782481610774994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.782481407048181e-05, + "grad_norm": 3.820657730102539, + "learning_rate": 1e-06, + "loss": 0.3878, + 
"mean_token_accuracy": 0.8665391206741333, + "num_tokens": 428067119.0, + "step": 11217 + }, + { + "epoch": 1.4270449052283425, + "ewc_loss": 0.007831200957298279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.831200491636992e-05, + "grad_norm": 3.8188507556915283, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8736345171928406, + "num_tokens": 428108150.0, + "step": 11218 + }, + { + "epoch": 1.427172115506933, + "ewc_loss": 0.007779115345329046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.77911554905586e-05, + "grad_norm": 3.7814745903015137, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8863078951835632, + "num_tokens": 428150839.0, + "step": 11219 + }, + { + "epoch": 1.4272993257855235, + "ewc_loss": 0.0077618444338440895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.761844608467072e-05, + "grad_norm": 3.8221487998962402, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8635225296020508, + "num_tokens": 428194325.0, + "step": 11220 + }, + { + "epoch": 1.427426536064114, + "ewc_loss": 0.007772570941597223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.772570825181901e-05, + "grad_norm": 3.8531908988952637, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8747567534446716, + "num_tokens": 428230879.0, + "step": 11221 + }, + { + "epoch": 1.4275537463427046, + "ewc_loss": 0.007779560983181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.779560837661847e-05, + "grad_norm": 3.8885715007781982, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8854876756668091, + "num_tokens": 428265795.0, + "step": 11222 + }, + { + "epoch": 1.4276809566212951, + "ewc_loss": 0.00778665766119957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.786657806718722e-05, + "grad_norm": 3.8516745567321777, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8818726539611816, + "num_tokens": 428300413.0, + "step": 11223 + }, + { + "epoch": 
1.4278081668998854, + "ewc_loss": 0.007746225222945213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.746225310256705e-05, + "grad_norm": 3.8210179805755615, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8952029347419739, + "num_tokens": 428339293.0, + "step": 11224 + }, + { + "epoch": 1.427935377178476, + "ewc_loss": 0.007742893882095814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.74289364926517e-05, + "grad_norm": 3.8486621379852295, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8884124159812927, + "num_tokens": 428374158.0, + "step": 11225 + }, + { + "epoch": 1.4280625874570665, + "ewc_loss": 0.0077557992190122604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.755799015285447e-05, + "grad_norm": 3.868711471557617, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8766188621520996, + "num_tokens": 428411715.0, + "step": 11226 + }, + { + "epoch": 1.428189797735657, + "ewc_loss": 0.007753416895866394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.753416866762564e-05, + "grad_norm": 3.860854387283325, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8638713955879211, + "num_tokens": 428450000.0, + "step": 11227 + }, + { + "epoch": 1.4283170080142475, + "ewc_loss": 0.007747163064777851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.747163181193173e-05, + "grad_norm": 3.8497118949890137, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8824207782745361, + "num_tokens": 428484546.0, + "step": 11228 + }, + { + "epoch": 1.428444218292838, + "ewc_loss": 0.007740688975900412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.740689034108073e-05, + "grad_norm": 3.8045103549957275, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8768242597579956, + "num_tokens": 428527519.0, + "step": 11229 + }, + { + "epoch": 1.4285714285714286, + "ewc_loss": 0.007709899451583624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.709899364272133e-05, + "grad_norm": 3.8647873401641846, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8712072372436523, + "num_tokens": 428568744.0, + "step": 11230 + }, + { + "epoch": 1.4286986388500191, + "ewc_loss": 0.007776093203574419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.776093116262928e-05, + "grad_norm": 3.7882330417633057, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8721616268157959, + "num_tokens": 428613050.0, + "step": 11231 + }, + { + "epoch": 1.4288258491286097, + "ewc_loss": 0.007699673529714346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.699673733441159e-05, + "grad_norm": 3.857928991317749, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8723112940788269, + "num_tokens": 428647334.0, + "step": 11232 + }, + { + "epoch": 1.4289530594072, + "ewc_loss": 0.007778781000524759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.778780855005607e-05, + "grad_norm": 3.875685453414917, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8817142844200134, + "num_tokens": 428681015.0, + "step": 11233 + }, + { + "epoch": 1.4290802696857905, + "ewc_loss": 0.0077592856250703335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.759285654174164e-05, + "grad_norm": 3.864764451980591, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8676215410232544, + "num_tokens": 428718903.0, + "step": 11234 + }, + { + "epoch": 1.429207479964381, + "ewc_loss": 0.007763294503092766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.763294706819579e-05, + "grad_norm": 3.8137993812561035, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8847509026527405, + "num_tokens": 428756038.0, + "step": 11235 + }, + { + "epoch": 1.4293346902429716, + "ewc_loss": 0.0077385855838656425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.738585554761812e-05, + "grad_norm": 3.829437494277954, + "learning_rate": 1e-06, + "loss": 0.3136, + 
"mean_token_accuracy": 0.8903844952583313, + "num_tokens": 428793908.0, + "step": 11236 + }, + { + "epoch": 1.429461900521562, + "ewc_loss": 0.007776875980198383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.776876009302214e-05, + "grad_norm": 3.8477444648742676, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8696086406707764, + "num_tokens": 428830924.0, + "step": 11237 + }, + { + "epoch": 1.4295891108001526, + "ewc_loss": 0.0077814869582653046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.781486783642322e-05, + "grad_norm": 3.8474552631378174, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8723464608192444, + "num_tokens": 428869083.0, + "step": 11238 + }, + { + "epoch": 1.4297163210787431, + "ewc_loss": 0.007761363871395588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.761363667668775e-05, + "grad_norm": 3.9155237674713135, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8494597673416138, + "num_tokens": 428903970.0, + "step": 11239 + }, + { + "epoch": 1.4298435313573337, + "ewc_loss": 0.007787779904901981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.787779759382829e-05, + "grad_norm": 3.8279383182525635, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8801390528678894, + "num_tokens": 428940710.0, + "step": 11240 + }, + { + "epoch": 1.4299707416359242, + "ewc_loss": 0.007715348154306412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.715348328929394e-05, + "grad_norm": 3.792588233947754, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8787519931793213, + "num_tokens": 428981160.0, + "step": 11241 + }, + { + "epoch": 1.4300979519145147, + "ewc_loss": 0.007746743969619274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.746744086034596e-05, + "grad_norm": 3.8375613689422607, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8773993253707886, + "num_tokens": 429022464.0, + "step": 11242 + }, + { + "epoch": 
1.4302251621931052, + "ewc_loss": 0.007776642218232155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.776642451062799e-05, + "grad_norm": 3.8550820350646973, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8761833310127258, + "num_tokens": 429060964.0, + "step": 11243 + }, + { + "epoch": 1.4303523724716958, + "ewc_loss": 0.007760222069919109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76022206991911e-05, + "grad_norm": 3.825486660003662, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8770997524261475, + "num_tokens": 429101389.0, + "step": 11244 + }, + { + "epoch": 1.4304795827502863, + "ewc_loss": 0.00773804122582078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.738041313132271e-05, + "grad_norm": 3.9028775691986084, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8733485341072083, + "num_tokens": 429135547.0, + "step": 11245 + }, + { + "epoch": 1.4306067930288768, + "ewc_loss": 0.007796280086040497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.79628026066348e-05, + "grad_norm": 3.8035101890563965, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8773004412651062, + "num_tokens": 429177504.0, + "step": 11246 + }, + { + "epoch": 1.4307340033074674, + "ewc_loss": 0.007700115442276001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70011538406834e-05, + "grad_norm": 3.8037447929382324, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8794459104537964, + "num_tokens": 429217923.0, + "step": 11247 + }, + { + "epoch": 1.430861213586058, + "ewc_loss": 0.00774437515065074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.744375034235418e-05, + "grad_norm": 3.8924148082733154, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8778625726699829, + "num_tokens": 429251514.0, + "step": 11248 + }, + { + "epoch": 1.4309884238646482, + "ewc_loss": 0.007783091627061367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.783091859892011e-05, + "grad_norm": 3.8675906658172607, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.861186146736145, + "num_tokens": 429291747.0, + "step": 11249 + }, + { + "epoch": 1.4311156341432387, + "ewc_loss": 0.007722553797066212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.72255370975472e-05, + "grad_norm": 3.814121723175049, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8718863129615784, + "num_tokens": 429333480.0, + "step": 11250 + }, + { + "epoch": 1.4312428444218293, + "ewc_loss": 0.007713327184319496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.713327067904174e-05, + "grad_norm": 3.8637077808380127, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.887104868888855, + "num_tokens": 429366482.0, + "step": 11251 + }, + { + "epoch": 1.4313700547004198, + "ewc_loss": 0.007758854888379574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.758854917483404e-05, + "grad_norm": 3.8594651222229004, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8675704598426819, + "num_tokens": 429400704.0, + "step": 11252 + }, + { + "epoch": 1.4314972649790103, + "ewc_loss": 0.00775801669806242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.75801672716625e-05, + "grad_norm": 3.8950212001800537, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8624071478843689, + "num_tokens": 429437463.0, + "step": 11253 + }, + { + "epoch": 1.4316244752576008, + "ewc_loss": 0.007762220688164234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.762220775475726e-05, + "grad_norm": 3.868999719619751, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8652440309524536, + "num_tokens": 429474496.0, + "step": 11254 + }, + { + "epoch": 1.4317516855361914, + "ewc_loss": 0.0077616809867322445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.761680899420753e-05, + "grad_norm": 3.847139358520508, + "learning_rate": 1e-06, + "loss": 0.3991, + 
"mean_token_accuracy": 0.8636831641197205, + "num_tokens": 429514641.0, + "step": 11255 + }, + { + "epoch": 1.431878895814782, + "ewc_loss": 0.007744817528873682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.74481741245836e-05, + "grad_norm": 3.8100979328155518, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8760828971862793, + "num_tokens": 429554054.0, + "step": 11256 + }, + { + "epoch": 1.4320061060933724, + "ewc_loss": 0.007756448816508055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.756448758300394e-05, + "grad_norm": 3.8475658893585205, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.87110435962677, + "num_tokens": 429594004.0, + "step": 11257 + }, + { + "epoch": 1.4321333163719627, + "ewc_loss": 0.007791090290993452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.791090320097283e-05, + "grad_norm": 3.8246943950653076, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8821908831596375, + "num_tokens": 429633019.0, + "step": 11258 + }, + { + "epoch": 1.4322605266505533, + "ewc_loss": 0.007772402837872505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.772402750561014e-05, + "grad_norm": 3.8965630531311035, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.876983642578125, + "num_tokens": 429666869.0, + "step": 11259 + }, + { + "epoch": 1.4323877369291438, + "ewc_loss": 0.007825205102562904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825205102562904e-05, + "grad_norm": 3.88781476020813, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8846785426139832, + "num_tokens": 429700049.0, + "step": 11260 + }, + { + "epoch": 1.4325149472077343, + "ewc_loss": 0.007806794717907906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.806794747011736e-05, + "grad_norm": 3.836750030517578, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8747384548187256, + "num_tokens": 429737517.0, + "step": 11261 + }, + { + "epoch": 
1.4326421574863248, + "ewc_loss": 0.007788376417011023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.788376387907192e-05, + "grad_norm": 3.803670644760132, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8871501684188843, + "num_tokens": 429778697.0, + "step": 11262 + }, + { + "epoch": 1.4327693677649154, + "ewc_loss": 0.007787843234837055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.787843060214072e-05, + "grad_norm": 3.918609142303467, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8742939233779907, + "num_tokens": 429811883.0, + "step": 11263 + }, + { + "epoch": 1.432896578043506, + "ewc_loss": 0.007871250621974468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.871251000324264e-05, + "grad_norm": 3.808332920074463, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.880323588848114, + "num_tokens": 429851748.0, + "step": 11264 + }, + { + "epoch": 1.4330237883220964, + "ewc_loss": 0.007771058939397335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.771058881189674e-05, + "grad_norm": 3.876521110534668, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8646500706672668, + "num_tokens": 429891033.0, + "step": 11265 + }, + { + "epoch": 1.433150998600687, + "ewc_loss": 0.007849960587918758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.849960820749402e-05, + "grad_norm": 3.775846481323242, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8783595561981201, + "num_tokens": 429935980.0, + "step": 11266 + }, + { + "epoch": 1.4332782088792775, + "ewc_loss": 0.007758153602480888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.758153515169397e-05, + "grad_norm": 3.9442298412323, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8649364709854126, + "num_tokens": 429970874.0, + "step": 11267 + }, + { + "epoch": 1.433405419157868, + "ewc_loss": 0.007896439172327518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.896438910393044e-05, + "grad_norm": 3.8145439624786377, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8617061376571655, + "num_tokens": 430017909.0, + "step": 11268 + }, + { + "epoch": 1.4335326294364585, + "ewc_loss": 0.007752280216664076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.752280362183228e-05, + "grad_norm": 3.932488441467285, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8499938249588013, + "num_tokens": 430050641.0, + "step": 11269 + }, + { + "epoch": 1.433659839715049, + "ewc_loss": 0.007885199040174484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.885199011070654e-05, + "grad_norm": 3.859842538833618, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8791505694389343, + "num_tokens": 430082985.0, + "step": 11270 + }, + { + "epoch": 1.4337870499936396, + "ewc_loss": 0.007814554497599602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.814554555807263e-05, + "grad_norm": 3.8123981952667236, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8762049674987793, + "num_tokens": 430122340.0, + "step": 11271 + }, + { + "epoch": 1.4339142602722301, + "ewc_loss": 0.007820682600140572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.820682367309928e-05, + "grad_norm": 3.831861734390259, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8630043268203735, + "num_tokens": 430162140.0, + "step": 11272 + }, + { + "epoch": 1.4340414705508204, + "ewc_loss": 0.007858645170927048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.858645403757691e-05, + "grad_norm": 3.8422584533691406, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.885566771030426, + "num_tokens": 430201672.0, + "step": 11273 + }, + { + "epoch": 1.434168680829411, + "ewc_loss": 0.007859291508793831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.859291508793831e-05, + "grad_norm": 3.8668556213378906, + "learning_rate": 1e-06, + "loss": 0.405, + 
"mean_token_accuracy": 0.8652390837669373, + "num_tokens": 430239760.0, + "step": 11274 + }, + { + "epoch": 1.4342958911080015, + "ewc_loss": 0.007861228659749031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.861229096306488e-05, + "grad_norm": 3.803497076034546, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8573848605155945, + "num_tokens": 430280527.0, + "step": 11275 + }, + { + "epoch": 1.434423101386592, + "ewc_loss": 0.007812867872416973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.812867988832295e-05, + "grad_norm": 3.9233648777008057, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8716340065002441, + "num_tokens": 430312788.0, + "step": 11276 + }, + { + "epoch": 1.4345503116651825, + "ewc_loss": 0.007922169752418995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.922169606899843e-05, + "grad_norm": 3.8597161769866943, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.882479190826416, + "num_tokens": 430348044.0, + "step": 11277 + }, + { + "epoch": 1.434677521943773, + "ewc_loss": 0.007836990989744663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.83699142630212e-05, + "grad_norm": 3.8227336406707764, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8902081251144409, + "num_tokens": 430385676.0, + "step": 11278 + }, + { + "epoch": 1.4348047322223636, + "ewc_loss": 0.00785429123789072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.854291470721364e-05, + "grad_norm": 3.9144856929779053, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8719945549964905, + "num_tokens": 430416976.0, + "step": 11279 + }, + { + "epoch": 1.4349319425009541, + "ewc_loss": 0.007912776432931423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.912776345619932e-05, + "grad_norm": 3.798842668533325, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8850831389427185, + "num_tokens": 430460002.0, + "step": 11280 + }, + { + "epoch": 
1.4350591527795447, + "ewc_loss": 0.00781953614205122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.819536403985694e-05, + "grad_norm": 3.8204123973846436, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8542065024375916, + "num_tokens": 430503172.0, + "step": 11281 + }, + { + "epoch": 1.435186363058135, + "ewc_loss": 0.007883277721703053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.88327815826051e-05, + "grad_norm": 3.8722591400146484, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.862852156162262, + "num_tokens": 430541221.0, + "step": 11282 + }, + { + "epoch": 1.4353135733367255, + "ewc_loss": 0.007905157282948494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.905156962806359e-05, + "grad_norm": 3.8365938663482666, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8691181540489197, + "num_tokens": 430579527.0, + "step": 11283 + }, + { + "epoch": 1.435440783615316, + "ewc_loss": 0.00783286988735199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.832870323909447e-05, + "grad_norm": 3.865468740463257, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8784744739532471, + "num_tokens": 430617263.0, + "step": 11284 + }, + { + "epoch": 1.4355679938939065, + "ewc_loss": 0.007876073010265827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.876072777435184e-05, + "grad_norm": 3.863433599472046, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8873404860496521, + "num_tokens": 430649961.0, + "step": 11285 + }, + { + "epoch": 1.435695204172497, + "ewc_loss": 0.007875882089138031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.875882147345692e-05, + "grad_norm": 3.9561214447021484, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8690674304962158, + "num_tokens": 430682599.0, + "step": 11286 + }, + { + "epoch": 1.4358224144510876, + "ewc_loss": 0.007915603928267956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.915603782748803e-05, + "grad_norm": 3.8794665336608887, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8468738794326782, + "num_tokens": 430721014.0, + "step": 11287 + }, + { + "epoch": 1.4359496247296781, + "ewc_loss": 0.00784924253821373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.849242683732882e-05, + "grad_norm": 3.8815038204193115, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8719213008880615, + "num_tokens": 430757014.0, + "step": 11288 + }, + { + "epoch": 1.4360768350082687, + "ewc_loss": 0.007873499765992165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.873499998822808e-05, + "grad_norm": 3.822323799133301, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8749691247940063, + "num_tokens": 430796169.0, + "step": 11289 + }, + { + "epoch": 1.4362040452868592, + "ewc_loss": 0.007850738242268562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.850738620618358e-05, + "grad_norm": 3.8789331912994385, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8731640577316284, + "num_tokens": 430832726.0, + "step": 11290 + }, + { + "epoch": 1.4363312555654497, + "ewc_loss": 0.007898092269897461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.898092007962987e-05, + "grad_norm": 3.903810501098633, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8633294105529785, + "num_tokens": 430866343.0, + "step": 11291 + }, + { + "epoch": 1.4364584658440402, + "ewc_loss": 0.00788626354187727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.886263483669609e-05, + "grad_norm": 3.8174142837524414, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8795904517173767, + "num_tokens": 430901622.0, + "step": 11292 + }, + { + "epoch": 1.4365856761226308, + "ewc_loss": 0.007844691164791584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.844690844649449e-05, + "grad_norm": 3.8546359539031982, + "learning_rate": 1e-06, + "loss": 0.3374, + 
"mean_token_accuracy": 0.8882591724395752, + "num_tokens": 430937499.0, + "step": 11293 + }, + { + "epoch": 1.4367128864012213, + "ewc_loss": 0.007917922921478748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.917922630440444e-05, + "grad_norm": 3.860477924346924, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8626670837402344, + "num_tokens": 430974048.0, + "step": 11294 + }, + { + "epoch": 1.4368400966798118, + "ewc_loss": 0.007866933010518551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.866932719480246e-05, + "grad_norm": 3.8199000358581543, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.883470356464386, + "num_tokens": 431010927.0, + "step": 11295 + }, + { + "epoch": 1.4369673069584024, + "ewc_loss": 0.007868918590247631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868918328313157e-05, + "grad_norm": 3.8883471488952637, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8581288456916809, + "num_tokens": 431051731.0, + "step": 11296 + }, + { + "epoch": 1.4370945172369929, + "ewc_loss": 0.007921339012682438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.921339420136064e-05, + "grad_norm": 3.838062286376953, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8883202075958252, + "num_tokens": 431087876.0, + "step": 11297 + }, + { + "epoch": 1.4372217275155832, + "ewc_loss": 0.007851417176425457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.851417467463762e-05, + "grad_norm": 3.839536428451538, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8678584694862366, + "num_tokens": 431126217.0, + "step": 11298 + }, + { + "epoch": 1.4373489377941737, + "ewc_loss": 0.007868167012929916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868166721891612e-05, + "grad_norm": 3.8111047744750977, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8751199841499329, + "num_tokens": 431168049.0, + "step": 11299 + }, + { + "epoch": 
1.4374761480727642, + "ewc_loss": 0.00785670056939125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.85670054028742e-05, + "grad_norm": 3.821096658706665, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.872880220413208, + "num_tokens": 431211143.0, + "step": 11300 + }, + { + "epoch": 1.4376033583513548, + "ewc_loss": 0.007852902635931969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.852902490412816e-05, + "grad_norm": 3.791668653488159, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.880725622177124, + "num_tokens": 431255747.0, + "step": 11301 + }, + { + "epoch": 1.4377305686299453, + "ewc_loss": 0.00780663825571537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.806638313923031e-05, + "grad_norm": 3.8216445446014404, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8780741691589355, + "num_tokens": 431296618.0, + "step": 11302 + }, + { + "epoch": 1.4378577789085358, + "ewc_loss": 0.007831323891878128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.831323455320671e-05, + "grad_norm": 3.85758638381958, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8752061128616333, + "num_tokens": 431336493.0, + "step": 11303 + }, + { + "epoch": 1.4379849891871264, + "ewc_loss": 0.007838175632059574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.838175224605948e-05, + "grad_norm": 3.8675544261932373, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8731651902198792, + "num_tokens": 431376200.0, + "step": 11304 + }, + { + "epoch": 1.438112199465717, + "ewc_loss": 0.007794607430696487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.794607518007979e-05, + "grad_norm": 3.949284076690674, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8531526327133179, + "num_tokens": 431414015.0, + "step": 11305 + }, + { + "epoch": 1.4382394097443074, + "ewc_loss": 0.007835090160369873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.835090218577534e-05, + "grad_norm": 3.82832932472229, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8701788783073425, + "num_tokens": 431453498.0, + "step": 11306 + }, + { + "epoch": 1.4383666200228977, + "ewc_loss": 0.007739969529211521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.73996944190003e-05, + "grad_norm": 3.8617465496063232, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8734849691390991, + "num_tokens": 431491126.0, + "step": 11307 + }, + { + "epoch": 1.4384938303014883, + "ewc_loss": 0.007790607400238514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.790607196511701e-05, + "grad_norm": 3.801105260848999, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8968254327774048, + "num_tokens": 431528549.0, + "step": 11308 + }, + { + "epoch": 1.4386210405800788, + "ewc_loss": 0.0077270627953112125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.72706262068823e-05, + "grad_norm": 3.8025355339050293, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8782872557640076, + "num_tokens": 431569435.0, + "step": 11309 + }, + { + "epoch": 1.4387482508586693, + "ewc_loss": 0.007739607244729996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.739607099210843e-05, + "grad_norm": 3.8665716648101807, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8801168203353882, + "num_tokens": 431606125.0, + "step": 11310 + }, + { + "epoch": 1.4388754611372598, + "ewc_loss": 0.007769434712827206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.769434887450188e-05, + "grad_norm": 3.85866117477417, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8685898184776306, + "num_tokens": 431643517.0, + "step": 11311 + }, + { + "epoch": 1.4390026714158504, + "ewc_loss": 0.007732439786195755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.732439553365111e-05, + "grad_norm": 3.817310094833374, + "learning_rate": 1e-06, + "loss": 0.3527, + 
"mean_token_accuracy": 0.8832697868347168, + "num_tokens": 431682058.0, + "step": 11312 + }, + { + "epoch": 1.439129881694441, + "ewc_loss": 0.007724660914391279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.724660827079788e-05, + "grad_norm": 3.7979390621185303, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8851110339164734, + "num_tokens": 431721241.0, + "step": 11313 + }, + { + "epoch": 1.4392570919730314, + "ewc_loss": 0.007742045447230339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.742045272607356e-05, + "grad_norm": 3.8562450408935547, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8863678574562073, + "num_tokens": 431757809.0, + "step": 11314 + }, + { + "epoch": 1.439384302251622, + "ewc_loss": 0.007754217367619276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.754217222100124e-05, + "grad_norm": 3.8549892902374268, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8652151823043823, + "num_tokens": 431795931.0, + "step": 11315 + }, + { + "epoch": 1.4395115125302125, + "ewc_loss": 0.007750118616968393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.750118675176054e-05, + "grad_norm": 3.871609687805176, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8838287591934204, + "num_tokens": 431830517.0, + "step": 11316 + }, + { + "epoch": 1.439638722808803, + "ewc_loss": 0.007757626939564943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.75762673583813e-05, + "grad_norm": 3.865000009536743, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8773841857910156, + "num_tokens": 431866047.0, + "step": 11317 + }, + { + "epoch": 1.4397659330873935, + "ewc_loss": 0.00775443110615015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.754431135253981e-05, + "grad_norm": 3.880614995956421, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8638846278190613, + "num_tokens": 431901602.0, + "step": 11318 + }, + { + "epoch": 
1.439893143365984, + "ewc_loss": 0.0077492427080869675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.749242649879307e-05, + "grad_norm": 3.945209503173828, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8692251443862915, + "num_tokens": 431934989.0, + "step": 11319 + }, + { + "epoch": 1.4400203536445746, + "ewc_loss": 0.007804335560649633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.804335473338142e-05, + "grad_norm": 3.8674468994140625, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8788067102432251, + "num_tokens": 431968664.0, + "step": 11320 + }, + { + "epoch": 1.4401475639231651, + "ewc_loss": 0.007765759713947773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.765759801259264e-05, + "grad_norm": 3.821669101715088, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8844306468963623, + "num_tokens": 432008025.0, + "step": 11321 + }, + { + "epoch": 1.4402747742017554, + "ewc_loss": 0.007758636027574539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.758635911159217e-05, + "grad_norm": 3.8385305404663086, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8777400255203247, + "num_tokens": 432043526.0, + "step": 11322 + }, + { + "epoch": 1.440401984480346, + "ewc_loss": 0.007795160636305809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.795160490786657e-05, + "grad_norm": 3.8452906608581543, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8737328052520752, + "num_tokens": 432085175.0, + "step": 11323 + }, + { + "epoch": 1.4405291947589365, + "ewc_loss": 0.007790171541273594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.790171366650611e-05, + "grad_norm": 3.893535852432251, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8662593364715576, + "num_tokens": 432122379.0, + "step": 11324 + }, + { + "epoch": 1.440656405037527, + "ewc_loss": 0.007819193415343761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.819192978786305e-05, + "grad_norm": 3.88439679145813, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8761051893234253, + "num_tokens": 432157690.0, + "step": 11325 + }, + { + "epoch": 1.4407836153161175, + "ewc_loss": 0.007798332255333662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.798332080710679e-05, + "grad_norm": 3.923093318939209, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8817490339279175, + "num_tokens": 432188885.0, + "step": 11326 + }, + { + "epoch": 1.440910825594708, + "ewc_loss": 0.007830977439880371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.830977119738236e-05, + "grad_norm": 3.830796241760254, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8742888569831848, + "num_tokens": 432228804.0, + "step": 11327 + }, + { + "epoch": 1.4410380358732986, + "ewc_loss": 0.0077711958438158035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.771195669192821e-05, + "grad_norm": 3.9138097763061523, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8578013181686401, + "num_tokens": 432264150.0, + "step": 11328 + }, + { + "epoch": 1.4411652461518891, + "ewc_loss": 0.007867218926548958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.867219392210245e-05, + "grad_norm": 3.8462071418762207, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8688097596168518, + "num_tokens": 432306759.0, + "step": 11329 + }, + { + "epoch": 1.4412924564304797, + "ewc_loss": 0.0077950237318873405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.79502370278351e-05, + "grad_norm": 3.818347930908203, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8849680423736572, + "num_tokens": 432346928.0, + "step": 11330 + }, + { + "epoch": 1.44141966670907, + "ewc_loss": 0.007804299239069223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.804299093550071e-05, + "grad_norm": 3.8025121688842773, + "learning_rate": 1e-06, + "loss": 0.3438, + 
"mean_token_accuracy": 0.8815215229988098, + "num_tokens": 432386630.0, + "step": 11331 + }, + { + "epoch": 1.4415468769876605, + "ewc_loss": 0.007795204874128103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.795204874128103e-05, + "grad_norm": 3.846642255783081, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8628291487693787, + "num_tokens": 432427331.0, + "step": 11332 + }, + { + "epoch": 1.441674087266251, + "ewc_loss": 0.007839377038180828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.839377212803811e-05, + "grad_norm": 3.9022927284240723, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8668556213378906, + "num_tokens": 432462901.0, + "step": 11333 + }, + { + "epoch": 1.4418012975448415, + "ewc_loss": 0.007831096649169922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.831096445443109e-05, + "grad_norm": 3.9086084365844727, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8713981509208679, + "num_tokens": 432494232.0, + "step": 11334 + }, + { + "epoch": 1.441928507823432, + "ewc_loss": 0.007836136035621166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.836135773686692e-05, + "grad_norm": 3.827049493789673, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8858885765075684, + "num_tokens": 432534276.0, + "step": 11335 + }, + { + "epoch": 1.4420557181020226, + "ewc_loss": 0.007792249321937561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.792249380145222e-05, + "grad_norm": 3.846034288406372, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8666819334030151, + "num_tokens": 432577337.0, + "step": 11336 + }, + { + "epoch": 1.4421829283806131, + "ewc_loss": 0.007825830951333046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825830834917724e-05, + "grad_norm": 3.8346803188323975, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.866889476776123, + "num_tokens": 432614290.0, + "step": 11337 + }, + { + "epoch": 
1.4423101386592037, + "ewc_loss": 0.007831713184714317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.831713446648791e-05, + "grad_norm": 3.886228561401367, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8758343458175659, + "num_tokens": 432648558.0, + "step": 11338 + }, + { + "epoch": 1.4424373489377942, + "ewc_loss": 0.007838589139282703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.838589226594195e-05, + "grad_norm": 3.9041595458984375, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8497394323348999, + "num_tokens": 432684243.0, + "step": 11339 + }, + { + "epoch": 1.4425645592163847, + "ewc_loss": 0.007849401794373989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.849402027204633e-05, + "grad_norm": 3.8993611335754395, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.873498797416687, + "num_tokens": 432719355.0, + "step": 11340 + }, + { + "epoch": 1.4426917694949752, + "ewc_loss": 0.007853295654058456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.853296119719744e-05, + "grad_norm": 3.8964900970458984, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8744939565658569, + "num_tokens": 432758675.0, + "step": 11341 + }, + { + "epoch": 1.4428189797735658, + "ewc_loss": 0.007850144989788532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.850144902477041e-05, + "grad_norm": 3.8501107692718506, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8704842925071716, + "num_tokens": 432801677.0, + "step": 11342 + }, + { + "epoch": 1.4429461900521563, + "ewc_loss": 0.007822413928806782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.822414045222104e-05, + "grad_norm": 3.9214627742767334, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8727943897247314, + "num_tokens": 432836299.0, + "step": 11343 + }, + { + "epoch": 1.4430734003307468, + "ewc_loss": 0.007885140366852283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.885140075813979e-05, + "grad_norm": 3.9284164905548096, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8724073171615601, + "num_tokens": 432868838.0, + "step": 11344 + }, + { + "epoch": 1.4432006106093374, + "ewc_loss": 0.007862684316933155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.862684287829325e-05, + "grad_norm": 3.863863945007324, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.874555230140686, + "num_tokens": 432904895.0, + "step": 11345 + }, + { + "epoch": 1.4433278208879279, + "ewc_loss": 0.007810328621417284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.810328679624945e-05, + "grad_norm": 3.8588364124298096, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8699632883071899, + "num_tokens": 432944785.0, + "step": 11346 + }, + { + "epoch": 1.4434550311665182, + "ewc_loss": 0.007838481105864048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.838480814825743e-05, + "grad_norm": 3.9135308265686035, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8650079965591431, + "num_tokens": 432975645.0, + "step": 11347 + }, + { + "epoch": 1.4435822414451087, + "ewc_loss": 0.007880852557718754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.88085235399194e-05, + "grad_norm": 3.854693651199341, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8704863786697388, + "num_tokens": 433016887.0, + "step": 11348 + }, + { + "epoch": 1.4437094517236992, + "ewc_loss": 0.00782308354973793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.823083433322608e-05, + "grad_norm": 3.8637146949768066, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.876350998878479, + "num_tokens": 433053666.0, + "step": 11349 + }, + { + "epoch": 1.4438366620022898, + "ewc_loss": 0.007856513373553753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.856513548176736e-05, + "grad_norm": 3.8239564895629883, + "learning_rate": 1e-06, + "loss": 0.3767, + 
"mean_token_accuracy": 0.8695411086082458, + "num_tokens": 433101131.0, + "step": 11350 + }, + { + "epoch": 1.4439638722808803, + "ewc_loss": 0.007836984470486641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.836984877940267e-05, + "grad_norm": 3.908693552017212, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8818738460540771, + "num_tokens": 433132535.0, + "step": 11351 + }, + { + "epoch": 1.4440910825594708, + "ewc_loss": 0.007885649800300598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.885650120442733e-05, + "grad_norm": 3.9062459468841553, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8599765300750732, + "num_tokens": 433168229.0, + "step": 11352 + }, + { + "epoch": 1.4442182928380614, + "ewc_loss": 0.007838823832571507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.838823512429371e-05, + "grad_norm": 3.861724615097046, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8791884779930115, + "num_tokens": 433206094.0, + "step": 11353 + }, + { + "epoch": 1.4443455031166519, + "ewc_loss": 0.007818630896508694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.818630547262728e-05, + "grad_norm": 3.9093759059906006, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8627640008926392, + "num_tokens": 433241038.0, + "step": 11354 + }, + { + "epoch": 1.4444727133952424, + "ewc_loss": 0.007853314280509949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.853314309613779e-05, + "grad_norm": 3.9233758449554443, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8668027520179749, + "num_tokens": 433274674.0, + "step": 11355 + }, + { + "epoch": 1.4445999236738327, + "ewc_loss": 0.00785736832767725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.857368473196402e-05, + "grad_norm": 3.8666725158691406, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8757518529891968, + "num_tokens": 433314511.0, + "step": 11356 + }, + { + "epoch": 
1.4447271339524232, + "ewc_loss": 0.007809313014149666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.809312955942005e-05, + "grad_norm": 3.882319450378418, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.876689076423645, + "num_tokens": 433349338.0, + "step": 11357 + }, + { + "epoch": 1.4448543442310138, + "ewc_loss": 0.007853375747799873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8533761552535e-05, + "grad_norm": 3.8751778602600098, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8849011659622192, + "num_tokens": 433382609.0, + "step": 11358 + }, + { + "epoch": 1.4449815545096043, + "ewc_loss": 0.007844128645956516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.844128413125873e-05, + "grad_norm": 3.9253716468811035, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.88714200258255, + "num_tokens": 433411553.0, + "step": 11359 + }, + { + "epoch": 1.4451087647881948, + "ewc_loss": 0.007879842072725296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.879842451075092e-05, + "grad_norm": 3.8583405017852783, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8736916780471802, + "num_tokens": 433450571.0, + "step": 11360 + }, + { + "epoch": 1.4452359750667854, + "ewc_loss": 0.007827797904610634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8277982538566e-05, + "grad_norm": 3.8866357803344727, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8695634603500366, + "num_tokens": 433486994.0, + "step": 11361 + }, + { + "epoch": 1.445363185345376, + "ewc_loss": 0.007866960018873215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.866959640523419e-05, + "grad_norm": 3.8572137355804443, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8871387243270874, + "num_tokens": 433521407.0, + "step": 11362 + }, + { + "epoch": 1.4454903956239664, + "ewc_loss": 0.007837616838514805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.837616431061178e-05, + "grad_norm": 3.8110010623931885, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8852862119674683, + "num_tokens": 433561338.0, + "step": 11363 + }, + { + "epoch": 1.445617605902557, + "ewc_loss": 0.007832355797290802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.832355913706124e-05, + "grad_norm": 3.886899471282959, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.881521463394165, + "num_tokens": 433598892.0, + "step": 11364 + }, + { + "epoch": 1.4457448161811475, + "ewc_loss": 0.00788122694939375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.881227065809071e-05, + "grad_norm": 3.8432445526123047, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8637409210205078, + "num_tokens": 433641423.0, + "step": 11365 + }, + { + "epoch": 1.445872026459738, + "ewc_loss": 0.007840503938496113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.840503531042486e-05, + "grad_norm": 3.847418785095215, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8759916424751282, + "num_tokens": 433677385.0, + "step": 11366 + }, + { + "epoch": 1.4459992367383285, + "ewc_loss": 0.00785380881279707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.853809074731544e-05, + "grad_norm": 3.849616050720215, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8704288005828857, + "num_tokens": 433716901.0, + "step": 11367 + }, + { + "epoch": 1.446126447016919, + "ewc_loss": 0.007846581749618053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.846581866033375e-05, + "grad_norm": 3.848936080932617, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8716916441917419, + "num_tokens": 433755099.0, + "step": 11368 + }, + { + "epoch": 1.4462536572955096, + "ewc_loss": 0.007849151268601418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.849151006666943e-05, + "grad_norm": 3.851794719696045, + "learning_rate": 1e-06, + "loss": 0.386, + 
"mean_token_accuracy": 0.8698010444641113, + "num_tokens": 433794012.0, + "step": 11369 + }, + { + "epoch": 1.4463808675741001, + "ewc_loss": 0.007851455360651016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.851455302443355e-05, + "grad_norm": 3.881253957748413, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8785390257835388, + "num_tokens": 433831965.0, + "step": 11370 + }, + { + "epoch": 1.4465080778526904, + "ewc_loss": 0.007846841588616371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.846841617720202e-05, + "grad_norm": 3.83508038520813, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8744747638702393, + "num_tokens": 433870819.0, + "step": 11371 + }, + { + "epoch": 1.446635288131281, + "ewc_loss": 0.007818130776286125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.818130688974634e-05, + "grad_norm": 3.8632309436798096, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8678895831108093, + "num_tokens": 433909197.0, + "step": 11372 + }, + { + "epoch": 1.4467624984098715, + "ewc_loss": 0.00784022081643343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.840221223887056e-05, + "grad_norm": 3.8850600719451904, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8796333074569702, + "num_tokens": 433946026.0, + "step": 11373 + }, + { + "epoch": 1.446889708688462, + "ewc_loss": 0.007841314189136028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.841314072720706e-05, + "grad_norm": 3.8137924671173096, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8945878744125366, + "num_tokens": 433986239.0, + "step": 11374 + }, + { + "epoch": 1.4470169189670525, + "ewc_loss": 0.0077939387410879135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.793938857503235e-05, + "grad_norm": 3.834719181060791, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8782140016555786, + "num_tokens": 434022557.0, + "step": 11375 + }, + { + "epoch": 
1.447144129245643, + "ewc_loss": 0.007816910743713379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.816910510882735e-05, + "grad_norm": 3.865391254425049, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8772058486938477, + "num_tokens": 434059258.0, + "step": 11376 + }, + { + "epoch": 1.4472713395242336, + "ewc_loss": 0.007830290123820305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.830290269339457e-05, + "grad_norm": 3.8529765605926514, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.863083004951477, + "num_tokens": 434094872.0, + "step": 11377 + }, + { + "epoch": 1.4473985498028241, + "ewc_loss": 0.007793922908604145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.793922850396484e-05, + "grad_norm": 3.8874714374542236, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8517885208129883, + "num_tokens": 434134842.0, + "step": 11378 + }, + { + "epoch": 1.4475257600814146, + "ewc_loss": 0.007842942140996456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.842941704438999e-05, + "grad_norm": 3.875474452972412, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8867743015289307, + "num_tokens": 434173152.0, + "step": 11379 + }, + { + "epoch": 1.447652970360005, + "ewc_loss": 0.007788016460835934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78801622800529e-05, + "grad_norm": 3.8719723224639893, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8763231635093689, + "num_tokens": 434208318.0, + "step": 11380 + }, + { + "epoch": 1.4477801806385955, + "ewc_loss": 0.00780999381095171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.809993985574692e-05, + "grad_norm": 3.8486814498901367, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8728641867637634, + "num_tokens": 434243693.0, + "step": 11381 + }, + { + "epoch": 1.447907390917186, + "ewc_loss": 0.007803017273545265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.803017069818452e-05, + "grad_norm": 3.8719725608825684, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8625746369361877, + "num_tokens": 434283172.0, + "step": 11382 + }, + { + "epoch": 1.4480346011957765, + "ewc_loss": 0.007821237668395042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.82123752287589e-05, + "grad_norm": 3.8627266883850098, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8753942251205444, + "num_tokens": 434318789.0, + "step": 11383 + }, + { + "epoch": 1.448161811474367, + "ewc_loss": 0.007791424170136452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.791424286551774e-05, + "grad_norm": 3.837449789047241, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.865534245967865, + "num_tokens": 434358943.0, + "step": 11384 + }, + { + "epoch": 1.4482890217529576, + "ewc_loss": 0.007790369912981987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.790370000293478e-05, + "grad_norm": 3.8334298133850098, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8754440546035767, + "num_tokens": 434398264.0, + "step": 11385 + }, + { + "epoch": 1.4484162320315481, + "ewc_loss": 0.0077922469936311245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.792247197357938e-05, + "grad_norm": 3.9042959213256836, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8838026523590088, + "num_tokens": 434433602.0, + "step": 11386 + }, + { + "epoch": 1.4485434423101387, + "ewc_loss": 0.00783450249582529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.834502321202308e-05, + "grad_norm": 3.8190176486968994, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8667542934417725, + "num_tokens": 434480001.0, + "step": 11387 + }, + { + "epoch": 1.4486706525887292, + "ewc_loss": 0.007746129296720028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.746129267616197e-05, + "grad_norm": 3.8289945125579834, + "learning_rate": 1e-06, + "loss": 0.3188, + 
"mean_token_accuracy": 0.8894342184066772, + "num_tokens": 434519285.0, + "step": 11388 + }, + { + "epoch": 1.4487978628673197, + "ewc_loss": 0.007789146155118942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.789146184222773e-05, + "grad_norm": 3.921466588973999, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8732569217681885, + "num_tokens": 434556118.0, + "step": 11389 + }, + { + "epoch": 1.4489250731459102, + "ewc_loss": 0.007834968157112598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.834967982489616e-05, + "grad_norm": 3.855691909790039, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8593918085098267, + "num_tokens": 434595350.0, + "step": 11390 + }, + { + "epoch": 1.4490522834245008, + "ewc_loss": 0.007747202645987272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.74720247136429e-05, + "grad_norm": 3.8297879695892334, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8712160587310791, + "num_tokens": 434632988.0, + "step": 11391 + }, + { + "epoch": 1.4491794937030913, + "ewc_loss": 0.007783013861626387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.783014007145539e-05, + "grad_norm": 3.8394527435302734, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8804964423179626, + "num_tokens": 434673040.0, + "step": 11392 + }, + { + "epoch": 1.4493067039816818, + "ewc_loss": 0.007764710113406181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.764709880575538e-05, + "grad_norm": 3.841794013977051, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8888641595840454, + "num_tokens": 434712201.0, + "step": 11393 + }, + { + "epoch": 1.4494339142602723, + "ewc_loss": 0.007771367207169533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.771367381792516e-05, + "grad_norm": 3.8213727474212646, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.887920618057251, + "num_tokens": 434756676.0, + "step": 11394 + }, + { + "epoch": 
1.4495611245388629, + "ewc_loss": 0.007742009125649929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.742008892819285e-05, + "grad_norm": 3.8338823318481445, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.878964900970459, + "num_tokens": 434797890.0, + "step": 11395 + }, + { + "epoch": 1.4496883348174532, + "ewc_loss": 0.00774706806987524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.747067866148427e-05, + "grad_norm": 3.8394625186920166, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.887885570526123, + "num_tokens": 434836933.0, + "step": 11396 + }, + { + "epoch": 1.4498155450960437, + "ewc_loss": 0.007741290610283613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.741290755802765e-05, + "grad_norm": 3.944953680038452, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8881906867027283, + "num_tokens": 434868098.0, + "step": 11397 + }, + { + "epoch": 1.4499427553746342, + "ewc_loss": 0.007795709650963545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.795709825586528e-05, + "grad_norm": 3.8249094486236572, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8719779253005981, + "num_tokens": 434907239.0, + "step": 11398 + }, + { + "epoch": 1.4500699656532248, + "ewc_loss": 0.007691692095249891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.6916920079384e-05, + "grad_norm": 3.841146230697632, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8813774585723877, + "num_tokens": 434945672.0, + "step": 11399 + }, + { + "epoch": 1.4501971759318153, + "ewc_loss": 0.007755006663501263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.755006663501263e-05, + "grad_norm": 3.8942697048187256, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8659963607788086, + "num_tokens": 434981693.0, + "step": 11400 + }, + { + "epoch": 1.4503243862104058, + "ewc_loss": 0.007770598400384188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.770598313072696e-05, + "grad_norm": 3.867191791534424, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8950650691986084, + "num_tokens": 435012452.0, + "step": 11401 + }, + { + "epoch": 1.4504515964889964, + "ewc_loss": 0.007752563338726759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.75256339693442e-05, + "grad_norm": 3.874462604522705, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8691842555999756, + "num_tokens": 435052711.0, + "step": 11402 + }, + { + "epoch": 1.4505788067675869, + "ewc_loss": 0.007775511592626572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.775511767249554e-05, + "grad_norm": 3.8175508975982666, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8716447353363037, + "num_tokens": 435092766.0, + "step": 11403 + }, + { + "epoch": 1.4507060170461774, + "ewc_loss": 0.007763307075947523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.763307075947523e-05, + "grad_norm": 3.8852462768554688, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8600447177886963, + "num_tokens": 435132093.0, + "step": 11404 + }, + { + "epoch": 1.4508332273247677, + "ewc_loss": 0.007813376374542713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.813376578269526e-05, + "grad_norm": 3.858097791671753, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8719202280044556, + "num_tokens": 435169071.0, + "step": 11405 + }, + { + "epoch": 1.4509604376033582, + "ewc_loss": 0.007761212531477213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.7612123277504e-05, + "grad_norm": 3.970647096633911, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8851065039634705, + "num_tokens": 435202665.0, + "step": 11406 + }, + { + "epoch": 1.4510876478819488, + "ewc_loss": 0.007863597013056278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.863596692914143e-05, + "grad_norm": 3.8741137981414795, + "learning_rate": 1e-06, + "loss": 0.3265, + 
"mean_token_accuracy": 0.8889390230178833, + "num_tokens": 435234719.0, + "step": 11407 + }, + { + "epoch": 1.4512148581605393, + "ewc_loss": 0.007776112761348486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.776112761348486e-05, + "grad_norm": 3.8639278411865234, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8893682360649109, + "num_tokens": 435267090.0, + "step": 11408 + }, + { + "epoch": 1.4513420684391298, + "ewc_loss": 0.007820573635399342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.820573227945715e-05, + "grad_norm": 3.78564190864563, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8836747407913208, + "num_tokens": 435307243.0, + "step": 11409 + }, + { + "epoch": 1.4514692787177204, + "ewc_loss": 0.007790973410010338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.790973177179694e-05, + "grad_norm": 3.8341829776763916, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8825622797012329, + "num_tokens": 435347507.0, + "step": 11410 + }, + { + "epoch": 1.4515964889963109, + "ewc_loss": 0.007856080308556557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.85607990110293e-05, + "grad_norm": 3.853695869445801, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8672314882278442, + "num_tokens": 435383499.0, + "step": 11411 + }, + { + "epoch": 1.4517236992749014, + "ewc_loss": 0.007849290035665035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.849289977457374e-05, + "grad_norm": 3.8617234230041504, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8749915957450867, + "num_tokens": 435423500.0, + "step": 11412 + }, + { + "epoch": 1.451850909553492, + "ewc_loss": 0.007832234725356102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.832234405213967e-05, + "grad_norm": 3.8826849460601807, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8787818551063538, + "num_tokens": 435457075.0, + "step": 11413 + }, + { + "epoch": 
1.4519781198320825, + "ewc_loss": 0.007855636067688465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.855636067688465e-05, + "grad_norm": 3.8430864810943604, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8725652098655701, + "num_tokens": 435498933.0, + "step": 11414 + }, + { + "epoch": 1.452105330110673, + "ewc_loss": 0.007825366221368313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825366628821939e-05, + "grad_norm": 3.8897593021392822, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8665102124214172, + "num_tokens": 435535949.0, + "step": 11415 + }, + { + "epoch": 1.4522325403892635, + "ewc_loss": 0.007878163829445839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8781638876535e-05, + "grad_norm": 3.960158109664917, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8691000938415527, + "num_tokens": 435565252.0, + "step": 11416 + }, + { + "epoch": 1.452359750667854, + "ewc_loss": 0.007884455844759941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.884455408202484e-05, + "grad_norm": 3.834465742111206, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8569886684417725, + "num_tokens": 435604663.0, + "step": 11417 + }, + { + "epoch": 1.4524869609464446, + "ewc_loss": 0.007818777114152908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.818776794010773e-05, + "grad_norm": 3.837218761444092, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8715111017227173, + "num_tokens": 435640883.0, + "step": 11418 + }, + { + "epoch": 1.4526141712250351, + "ewc_loss": 0.007881502620875835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.881502824602649e-05, + "grad_norm": 3.8008298873901367, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8773162961006165, + "num_tokens": 435682787.0, + "step": 11419 + }, + { + "epoch": 1.4527413815036254, + "ewc_loss": 0.007853858172893524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.853857823647559e-05, + "grad_norm": 3.85087251663208, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8726235628128052, + "num_tokens": 435719982.0, + "step": 11420 + }, + { + "epoch": 1.452868591782216, + "ewc_loss": 0.007898305542767048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.898305193521082e-05, + "grad_norm": 3.9627161026000977, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8573200702667236, + "num_tokens": 435755153.0, + "step": 11421 + }, + { + "epoch": 1.4529958020608065, + "ewc_loss": 0.007953339256346226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953339081723243e-05, + "grad_norm": 3.789519786834717, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8831659555435181, + "num_tokens": 435797513.0, + "step": 11422 + }, + { + "epoch": 1.453123012339397, + "ewc_loss": 0.007806296460330486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.806296343915164e-05, + "grad_norm": 3.837064027786255, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8753910064697266, + "num_tokens": 435836835.0, + "step": 11423 + }, + { + "epoch": 1.4532502226179875, + "ewc_loss": 0.007909068837761879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.909068517619744e-05, + "grad_norm": 3.855628490447998, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8656427264213562, + "num_tokens": 435875124.0, + "step": 11424 + }, + { + "epoch": 1.453377432896578, + "ewc_loss": 0.007866527885198593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.866527448641136e-05, + "grad_norm": 3.7915117740631104, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8864827156066895, + "num_tokens": 435914993.0, + "step": 11425 + }, + { + "epoch": 1.4535046431751686, + "ewc_loss": 0.00782980676740408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.829807145753875e-05, + "grad_norm": 3.848696708679199, + "learning_rate": 1e-06, + "loss": 0.345, + 
"mean_token_accuracy": 0.8812228441238403, + "num_tokens": 435950916.0, + "step": 11426 + }, + { + "epoch": 1.4536318534537591, + "ewc_loss": 0.007897098548710346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.897098839748651e-05, + "grad_norm": 3.864924430847168, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8693156242370605, + "num_tokens": 435991986.0, + "step": 11427 + }, + { + "epoch": 1.4537590637323496, + "ewc_loss": 0.007857021875679493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.857022137613967e-05, + "grad_norm": 3.9347760677337646, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.883265495300293, + "num_tokens": 436024603.0, + "step": 11428 + }, + { + "epoch": 1.45388627401094, + "ewc_loss": 0.007902927696704865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.902927609393373e-05, + "grad_norm": 3.8070640563964844, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8705406188964844, + "num_tokens": 436065540.0, + "step": 11429 + }, + { + "epoch": 1.4540134842895305, + "ewc_loss": 0.007804073393344879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.804073538864031e-05, + "grad_norm": 3.8911972045898438, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8600986003875732, + "num_tokens": 436104394.0, + "step": 11430 + }, + { + "epoch": 1.454140694568121, + "ewc_loss": 0.007893174886703491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.893174915807322e-05, + "grad_norm": 3.9094150066375732, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.861383855342865, + "num_tokens": 436139977.0, + "step": 11431 + }, + { + "epoch": 1.4542679048467115, + "ewc_loss": 0.007865501567721367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.865501538617536e-05, + "grad_norm": 3.830322742462158, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8890597224235535, + "num_tokens": 436178047.0, + "step": 11432 + }, + { + "epoch": 
1.454395115125302, + "ewc_loss": 0.007811348885297775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.811348768882453e-05, + "grad_norm": 3.8227312564849854, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8737319111824036, + "num_tokens": 436218331.0, + "step": 11433 + }, + { + "epoch": 1.4545223254038926, + "ewc_loss": 0.007839781232178211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.839781028451398e-05, + "grad_norm": 3.866917133331299, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8715833425521851, + "num_tokens": 436255663.0, + "step": 11434 + }, + { + "epoch": 1.4546495356824831, + "ewc_loss": 0.00785911362618208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.859113247832283e-05, + "grad_norm": 3.8274550437927246, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.875423789024353, + "num_tokens": 436301328.0, + "step": 11435 + }, + { + "epoch": 1.4547767459610736, + "ewc_loss": 0.007800288498401642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.800288585713133e-05, + "grad_norm": 3.885615348815918, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8841364979743958, + "num_tokens": 436336487.0, + "step": 11436 + }, + { + "epoch": 1.4549039562396642, + "ewc_loss": 0.007839557714760303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.839557656552643e-05, + "grad_norm": 3.8309240341186523, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8776142597198486, + "num_tokens": 436377629.0, + "step": 11437 + }, + { + "epoch": 1.4550311665182547, + "ewc_loss": 0.007785847410559654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.785847265040502e-05, + "grad_norm": 3.8153645992279053, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8871994614601135, + "num_tokens": 436421590.0, + "step": 11438 + }, + { + "epoch": 1.4551583767968452, + "ewc_loss": 0.007766570430248976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.766570342937484e-05, + "grad_norm": 3.8454339504241943, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8683165311813354, + "num_tokens": 436461172.0, + "step": 11439 + }, + { + "epoch": 1.4552855870754358, + "ewc_loss": 0.007790581788867712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.790581730660051e-05, + "grad_norm": 3.848520278930664, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8836790323257446, + "num_tokens": 436497536.0, + "step": 11440 + }, + { + "epoch": 1.4554127973540263, + "ewc_loss": 0.0077689532190561295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76895321905613e-05, + "grad_norm": 3.798935651779175, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8720811605453491, + "num_tokens": 436543906.0, + "step": 11441 + }, + { + "epoch": 1.4555400076326168, + "ewc_loss": 0.007729553151875734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729553180979565e-05, + "grad_norm": 3.907151460647583, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8771897554397583, + "num_tokens": 436579462.0, + "step": 11442 + }, + { + "epoch": 1.4556672179112073, + "ewc_loss": 0.007823886349797249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.823886699043214e-05, + "grad_norm": 3.8903002738952637, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8669172525405884, + "num_tokens": 436618840.0, + "step": 11443 + }, + { + "epoch": 1.4557944281897979, + "ewc_loss": 0.007741186767816544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.741186709608883e-05, + "grad_norm": 3.8821640014648438, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.862868070602417, + "num_tokens": 436654176.0, + "step": 11444 + }, + { + "epoch": 1.4559216384683882, + "ewc_loss": 0.007766605354845524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.766605267534032e-05, + "grad_norm": 3.9107205867767334, + "learning_rate": 1e-06, + "loss": 0.3206, + 
"mean_token_accuracy": 0.8852978944778442, + "num_tokens": 436685507.0, + "step": 11445 + }, + { + "epoch": 1.4560488487469787, + "ewc_loss": 0.007789685856550932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.789686060277745e-05, + "grad_norm": 3.8079891204833984, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8832992911338806, + "num_tokens": 436729881.0, + "step": 11446 + }, + { + "epoch": 1.4561760590255692, + "ewc_loss": 0.0077124182134866714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.712418300798163e-05, + "grad_norm": 3.851320743560791, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8737084865570068, + "num_tokens": 436772355.0, + "step": 11447 + }, + { + "epoch": 1.4563032693041598, + "ewc_loss": 0.007782158441841602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.782158354530111e-05, + "grad_norm": 3.867231845855713, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8691232204437256, + "num_tokens": 436808701.0, + "step": 11448 + }, + { + "epoch": 1.4564304795827503, + "ewc_loss": 0.00776960514485836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76960514485836e-05, + "grad_norm": 3.820497512817383, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8849290609359741, + "num_tokens": 436845441.0, + "step": 11449 + }, + { + "epoch": 1.4565576898613408, + "ewc_loss": 0.007755689788609743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.755689875921234e-05, + "grad_norm": 3.836564779281616, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8766872882843018, + "num_tokens": 436885316.0, + "step": 11450 + }, + { + "epoch": 1.4566849001399313, + "ewc_loss": 0.007788955699652433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.788955554133281e-05, + "grad_norm": 3.896134614944458, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8704453110694885, + "num_tokens": 436918315.0, + "step": 11451 + }, + { + "epoch": 
1.4568121104185219, + "ewc_loss": 0.007819008082151413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.819008169462904e-05, + "grad_norm": 3.9057607650756836, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8670941591262817, + "num_tokens": 436956039.0, + "step": 11452 + }, + { + "epoch": 1.4569393206971124, + "ewc_loss": 0.00780204264447093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.802042819093913e-05, + "grad_norm": 3.901698589324951, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8583762049674988, + "num_tokens": 436991274.0, + "step": 11453 + }, + { + "epoch": 1.4570665309757027, + "ewc_loss": 0.007810764014720917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.810763781890273e-05, + "grad_norm": 3.812601327896118, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8857128620147705, + "num_tokens": 437029963.0, + "step": 11454 + }, + { + "epoch": 1.4571937412542932, + "ewc_loss": 0.007764329202473164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.764329347992316e-05, + "grad_norm": 3.8831307888031006, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8675164580345154, + "num_tokens": 437067030.0, + "step": 11455 + }, + { + "epoch": 1.4573209515328838, + "ewc_loss": 0.007840396836400032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.840396574465558e-05, + "grad_norm": 3.871324300765991, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8822433352470398, + "num_tokens": 437103363.0, + "step": 11456 + }, + { + "epoch": 1.4574481618114743, + "ewc_loss": 0.007809349335730076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.809349335730076e-05, + "grad_norm": 3.9226081371307373, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8809577822685242, + "num_tokens": 437131574.0, + "step": 11457 + }, + { + "epoch": 1.4575753720900648, + "ewc_loss": 0.00786573626101017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.865736552048475e-05, + "grad_norm": 3.8724327087402344, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.867078959941864, + "num_tokens": 437169537.0, + "step": 11458 + }, + { + "epoch": 1.4577025823686554, + "ewc_loss": 0.007827220484614372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.827220542822033e-05, + "grad_norm": 3.8184823989868164, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8789210319519043, + "num_tokens": 437208025.0, + "step": 11459 + }, + { + "epoch": 1.4578297926472459, + "ewc_loss": 0.007816907949745655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.816908328095451e-05, + "grad_norm": 3.8303980827331543, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8844974040985107, + "num_tokens": 437246642.0, + "step": 11460 + }, + { + "epoch": 1.4579570029258364, + "ewc_loss": 0.00784229300916195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.842292689019814e-05, + "grad_norm": 3.8364312648773193, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8731977343559265, + "num_tokens": 437285274.0, + "step": 11461 + }, + { + "epoch": 1.458084213204427, + "ewc_loss": 0.007826929911971092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.826929504517466e-05, + "grad_norm": 3.775803327560425, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8911205530166626, + "num_tokens": 437325865.0, + "step": 11462 + }, + { + "epoch": 1.4582114234830175, + "ewc_loss": 0.0077875214628875256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.787521462887526e-05, + "grad_norm": 3.8683359622955322, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.879487156867981, + "num_tokens": 437364093.0, + "step": 11463 + }, + { + "epoch": 1.458338633761608, + "ewc_loss": 0.007848716340959072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.848716631997377e-05, + "grad_norm": 3.8653576374053955, + "learning_rate": 1e-06, + "loss": 0.4053, + 
"mean_token_accuracy": 0.8632428646087646, + "num_tokens": 437402413.0, + "step": 11464 + }, + { + "epoch": 1.4584658440401985, + "ewc_loss": 0.007812024559825659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.812024705344811e-05, + "grad_norm": 3.9036388397216797, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8607767224311829, + "num_tokens": 437442583.0, + "step": 11465 + }, + { + "epoch": 1.458593054318789, + "ewc_loss": 0.007835942320525646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.835942233214155e-05, + "grad_norm": 3.8414957523345947, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8749663233757019, + "num_tokens": 437485682.0, + "step": 11466 + }, + { + "epoch": 1.4587202645973796, + "ewc_loss": 0.007785255089402199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.785255002090707e-05, + "grad_norm": 3.8840768337249756, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8648198843002319, + "num_tokens": 437524859.0, + "step": 11467 + }, + { + "epoch": 1.45884747487597, + "ewc_loss": 0.00782108772546053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.821087638149038e-05, + "grad_norm": 3.8500564098358154, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8671201467514038, + "num_tokens": 437563590.0, + "step": 11468 + }, + { + "epoch": 1.4589746851545604, + "ewc_loss": 0.007786886766552925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.786886999383569e-05, + "grad_norm": 3.8381330966949463, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8901901245117188, + "num_tokens": 437601429.0, + "step": 11469 + }, + { + "epoch": 1.459101895433151, + "ewc_loss": 0.0077911545522511005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.791154348524287e-05, + "grad_norm": 3.8816258907318115, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8620656132698059, + "num_tokens": 437639144.0, + "step": 11470 + }, + { + "epoch": 
1.4592291057117415, + "ewc_loss": 0.007815038785338402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.815039134584367e-05, + "grad_norm": 3.8182806968688965, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8822005987167358, + "num_tokens": 437680846.0, + "step": 11471 + }, + { + "epoch": 1.459356315990332, + "ewc_loss": 0.0077457851730287075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.745785114821047e-05, + "grad_norm": 3.8562676906585693, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.878165602684021, + "num_tokens": 437722156.0, + "step": 11472 + }, + { + "epoch": 1.4594835262689225, + "ewc_loss": 0.007803623098880053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.803623157087713e-05, + "grad_norm": 3.848614454269409, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.851425290107727, + "num_tokens": 437766905.0, + "step": 11473 + }, + { + "epoch": 1.459610736547513, + "ewc_loss": 0.007769617717713118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.769617513986304e-05, + "grad_norm": 3.8931684494018555, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8742762804031372, + "num_tokens": 437801425.0, + "step": 11474 + }, + { + "epoch": 1.4597379468261036, + "ewc_loss": 0.007804491091519594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.804491178831086e-05, + "grad_norm": 3.880307197570801, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.865708589553833, + "num_tokens": 437838126.0, + "step": 11475 + }, + { + "epoch": 1.4598651571046941, + "ewc_loss": 0.0077864606864750385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.786460628267378e-05, + "grad_norm": 3.854280471801758, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8703621029853821, + "num_tokens": 437876597.0, + "step": 11476 + }, + { + "epoch": 1.4599923673832846, + "ewc_loss": 0.00778046902269125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.780468877172098e-05, + "grad_norm": 3.83772349357605, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8854414224624634, + "num_tokens": 437911852.0, + "step": 11477 + }, + { + "epoch": 1.460119577661875, + "ewc_loss": 0.0077769882045686245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.776988059049472e-05, + "grad_norm": 3.847280502319336, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8754618167877197, + "num_tokens": 437953418.0, + "step": 11478 + }, + { + "epoch": 1.4602467879404655, + "ewc_loss": 0.007795469835400581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.79546971898526e-05, + "grad_norm": 3.878511905670166, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8793203830718994, + "num_tokens": 437989561.0, + "step": 11479 + }, + { + "epoch": 1.460373998219056, + "ewc_loss": 0.007800146006047726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.800145976943895e-05, + "grad_norm": 3.8466033935546875, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8823500871658325, + "num_tokens": 438033464.0, + "step": 11480 + }, + { + "epoch": 1.4605012084976465, + "ewc_loss": 0.007768556475639343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.768556679366156e-05, + "grad_norm": 3.8756072521209717, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8791010975837708, + "num_tokens": 438068676.0, + "step": 11481 + }, + { + "epoch": 1.460628418776237, + "ewc_loss": 0.007794421166181564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.794421253493056e-05, + "grad_norm": 3.8644168376922607, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8575494289398193, + "num_tokens": 438107867.0, + "step": 11482 + }, + { + "epoch": 1.4607556290548276, + "ewc_loss": 0.007796332705765963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.796332647558302e-05, + "grad_norm": 3.8387186527252197, + "learning_rate": 1e-06, + "loss": 0.3202, + 
"mean_token_accuracy": 0.8902703523635864, + "num_tokens": 438144256.0, + "step": 11483 + }, + { + "epoch": 1.4608828393334181, + "ewc_loss": 0.0077870432287454605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.787043432472274e-05, + "grad_norm": 3.8499135971069336, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8831894397735596, + "num_tokens": 438183458.0, + "step": 11484 + }, + { + "epoch": 1.4610100496120086, + "ewc_loss": 0.0077844541519880295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.784453919157386e-05, + "grad_norm": 3.966032028198242, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8641186952590942, + "num_tokens": 438216025.0, + "step": 11485 + }, + { + "epoch": 1.4611372598905992, + "ewc_loss": 0.007868167012929916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868167449487373e-05, + "grad_norm": 3.813666820526123, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8836939334869385, + "num_tokens": 438258185.0, + "step": 11486 + }, + { + "epoch": 1.4612644701691897, + "ewc_loss": 0.007725789211690426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.725789328105748e-05, + "grad_norm": 3.8515384197235107, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.870104193687439, + "num_tokens": 438298189.0, + "step": 11487 + }, + { + "epoch": 1.4613916804477802, + "ewc_loss": 0.007812732830643654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.812732656020671e-05, + "grad_norm": 3.8537330627441406, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8634299635887146, + "num_tokens": 438336593.0, + "step": 11488 + }, + { + "epoch": 1.4615188907263708, + "ewc_loss": 0.007795816753059626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.795816782163456e-05, + "grad_norm": 3.8289458751678467, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8805962800979614, + "num_tokens": 438374110.0, + "step": 11489 + }, + { + "epoch": 
1.4616461010049613, + "ewc_loss": 0.007788532879203558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.788532820995897e-05, + "grad_norm": 3.877189874649048, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8602979183197021, + "num_tokens": 438413062.0, + "step": 11490 + }, + { + "epoch": 1.4617733112835518, + "ewc_loss": 0.007813713513314724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.813713455107063e-05, + "grad_norm": 3.88185453414917, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8772684931755066, + "num_tokens": 438452055.0, + "step": 11491 + }, + { + "epoch": 1.4619005215621423, + "ewc_loss": 0.007814341224730015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.814341370249167e-05, + "grad_norm": 3.8196544647216797, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8758036494255066, + "num_tokens": 438494224.0, + "step": 11492 + }, + { + "epoch": 1.4620277318407329, + "ewc_loss": 0.007779762148857117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.77976238168776e-05, + "grad_norm": 3.886397361755371, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8711663484573364, + "num_tokens": 438529790.0, + "step": 11493 + }, + { + "epoch": 1.4621549421193232, + "ewc_loss": 0.007836133241653442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.836133590899408e-05, + "grad_norm": 3.843947410583496, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8772423267364502, + "num_tokens": 438571371.0, + "step": 11494 + }, + { + "epoch": 1.4622821523979137, + "ewc_loss": 0.007782320491969585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.782320608384907e-05, + "grad_norm": 3.8876657485961914, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8616472482681274, + "num_tokens": 438612383.0, + "step": 11495 + }, + { + "epoch": 1.4624093626765042, + "ewc_loss": 0.007817324250936508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.817323785275221e-05, + "grad_norm": 3.868870496749878, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8647368550300598, + "num_tokens": 438653443.0, + "step": 11496 + }, + { + "epoch": 1.4625365729550948, + "ewc_loss": 0.007792424876242876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.792424730723724e-05, + "grad_norm": 3.912323474884033, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.863943338394165, + "num_tokens": 438689565.0, + "step": 11497 + }, + { + "epoch": 1.4626637832336853, + "ewc_loss": 0.00781519990414381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.815199933247641e-05, + "grad_norm": 3.844877243041992, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8756952881813049, + "num_tokens": 438727753.0, + "step": 11498 + }, + { + "epoch": 1.4627909935122758, + "ewc_loss": 0.007780100684612989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78010071371682e-05, + "grad_norm": 3.8951659202575684, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8734996318817139, + "num_tokens": 438764667.0, + "step": 11499 + }, + { + "epoch": 1.4629182037908663, + "ewc_loss": 0.00781618058681488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.816180732334033e-05, + "grad_norm": 3.844599485397339, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8811694383621216, + "num_tokens": 438801739.0, + "step": 11500 + }, + { + "epoch": 1.4630454140694569, + "ewc_loss": 0.007783115841448307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.783115870552137e-05, + "grad_norm": 3.8602805137634277, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.876946210861206, + "num_tokens": 438840028.0, + "step": 11501 + }, + { + "epoch": 1.4631726243480474, + "ewc_loss": 0.007812896743416786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.812897092662752e-05, + "grad_norm": 3.853940725326538, + "learning_rate": 1e-06, + "loss": 0.3794, + 
"mean_token_accuracy": 0.8715074062347412, + "num_tokens": 438880632.0, + "step": 11502 + }, + { + "epoch": 1.4632998346266377, + "ewc_loss": 0.007778836879879236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.778836879879236e-05, + "grad_norm": 3.833907127380371, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8721065521240234, + "num_tokens": 438923040.0, + "step": 11503 + }, + { + "epoch": 1.4634270449052282, + "ewc_loss": 0.007762158289551735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.762158202240244e-05, + "grad_norm": 3.8914852142333984, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.874263346195221, + "num_tokens": 438958597.0, + "step": 11504 + }, + { + "epoch": 1.4635542551838188, + "ewc_loss": 0.00779377669095993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.793776603648439e-05, + "grad_norm": 3.882111072540283, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8641096949577332, + "num_tokens": 438997909.0, + "step": 11505 + }, + { + "epoch": 1.4636814654624093, + "ewc_loss": 0.00778868468478322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.788684888510033e-05, + "grad_norm": 3.9126498699188232, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8828360438346863, + "num_tokens": 439030084.0, + "step": 11506 + }, + { + "epoch": 1.4638086757409998, + "ewc_loss": 0.007809258997440338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.80925911385566e-05, + "grad_norm": 3.886751174926758, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8774985074996948, + "num_tokens": 439067770.0, + "step": 11507 + }, + { + "epoch": 1.4639358860195903, + "ewc_loss": 0.007784790825098753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.784790795994923e-05, + "grad_norm": 3.875796318054199, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8771424293518066, + "num_tokens": 439102476.0, + "step": 11508 + }, + { + "epoch": 
1.4640630962981809, + "ewc_loss": 0.0078005860559642315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.800586172379553e-05, + "grad_norm": 3.8329010009765625, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8840309381484985, + "num_tokens": 439138660.0, + "step": 11509 + }, + { + "epoch": 1.4641903065767714, + "ewc_loss": 0.007764337584376335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.764337351545691e-05, + "grad_norm": 3.8092494010925293, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8803874254226685, + "num_tokens": 439181690.0, + "step": 11510 + }, + { + "epoch": 1.464317516855362, + "ewc_loss": 0.007785947062075138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.785946945659816e-05, + "grad_norm": 3.895296573638916, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8829730749130249, + "num_tokens": 439218825.0, + "step": 11511 + }, + { + "epoch": 1.4644447271339525, + "ewc_loss": 0.007839377969503403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.839377940399572e-05, + "grad_norm": 3.8733818531036377, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8794804215431213, + "num_tokens": 439253505.0, + "step": 11512 + }, + { + "epoch": 1.464571937412543, + "ewc_loss": 0.007781135383993387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.781135354889557e-05, + "grad_norm": 3.874873161315918, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8703644871711731, + "num_tokens": 439292764.0, + "step": 11513 + }, + { + "epoch": 1.4646991476911335, + "ewc_loss": 0.007808118127286434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.808118243701756e-05, + "grad_norm": 3.830827474594116, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8831751346588135, + "num_tokens": 439335269.0, + "step": 11514 + }, + { + "epoch": 1.464826357969724, + "ewc_loss": 0.007793413009494543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.79341280576773e-05, + "grad_norm": 3.891396999359131, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.875711977481842, + "num_tokens": 439371977.0, + "step": 11515 + }, + { + "epoch": 1.4649535682483146, + "ewc_loss": 0.007821857929229736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.82185816206038e-05, + "grad_norm": 3.8559017181396484, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8741906881332397, + "num_tokens": 439412710.0, + "step": 11516 + }, + { + "epoch": 1.465080778526905, + "ewc_loss": 0.007806956302374601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.806956273270771e-05, + "grad_norm": 3.928405523300171, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.873927891254425, + "num_tokens": 439449637.0, + "step": 11517 + }, + { + "epoch": 1.4652079888054954, + "ewc_loss": 0.007837029173970222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.837029261281714e-05, + "grad_norm": 3.8319125175476074, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.869249165058136, + "num_tokens": 439495143.0, + "step": 11518 + }, + { + "epoch": 1.465335199084086, + "ewc_loss": 0.007768129464238882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.768129580654204e-05, + "grad_norm": 3.9443724155426025, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8634951114654541, + "num_tokens": 439532721.0, + "step": 11519 + }, + { + "epoch": 1.4654624093626765, + "ewc_loss": 0.007859060540795326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.859060860937461e-05, + "grad_norm": 3.88875412940979, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8831580877304077, + "num_tokens": 439567938.0, + "step": 11520 + }, + { + "epoch": 1.465589619641267, + "ewc_loss": 0.007781708613038063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.781708700349554e-05, + "grad_norm": 4.0418314933776855, + "learning_rate": 1e-06, + "loss": 0.3359, + 
"mean_token_accuracy": 0.8835119605064392, + "num_tokens": 439595720.0, + "step": 11521 + }, + { + "epoch": 1.4657168299198575, + "ewc_loss": 0.007902064360678196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.902064680820331e-05, + "grad_norm": 3.871248722076416, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8755998611450195, + "num_tokens": 439634091.0, + "step": 11522 + }, + { + "epoch": 1.465844040198448, + "ewc_loss": 0.007761762477457523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.761762390146032e-05, + "grad_norm": 3.8311469554901123, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8651865720748901, + "num_tokens": 439671978.0, + "step": 11523 + }, + { + "epoch": 1.4659712504770386, + "ewc_loss": 0.0078113931231200695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8113931522239e-05, + "grad_norm": 3.8915019035339355, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.871415376663208, + "num_tokens": 439707748.0, + "step": 11524 + }, + { + "epoch": 1.466098460755629, + "ewc_loss": 0.007851455360651016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.851455302443355e-05, + "grad_norm": 3.8557963371276855, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8809542655944824, + "num_tokens": 439744060.0, + "step": 11525 + }, + { + "epoch": 1.4662256710342196, + "ewc_loss": 0.007820051163434982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.820050814189017e-05, + "grad_norm": 3.829479932785034, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8675519824028015, + "num_tokens": 439789277.0, + "step": 11526 + }, + { + "epoch": 1.46635288131281, + "ewc_loss": 0.007807645481079817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.807645306456834e-05, + "grad_norm": 3.8273067474365234, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8904993534088135, + "num_tokens": 439827524.0, + "step": 11527 + }, + { + "epoch": 
1.4664800915914005, + "ewc_loss": 0.007819206453859806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.81920607551001e-05, + "grad_norm": 3.8811748027801514, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8733484148979187, + "num_tokens": 439865181.0, + "step": 11528 + }, + { + "epoch": 1.466607301869991, + "ewc_loss": 0.007840459235012531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84045914770104e-05, + "grad_norm": 3.875180244445801, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.868069052696228, + "num_tokens": 439900071.0, + "step": 11529 + }, + { + "epoch": 1.4667345121485815, + "ewc_loss": 0.007818418554961681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.818418816896155e-05, + "grad_norm": 3.85166072845459, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8742662668228149, + "num_tokens": 439940839.0, + "step": 11530 + }, + { + "epoch": 1.466861722427172, + "ewc_loss": 0.007815036922693253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.815036951797083e-05, + "grad_norm": 3.8619706630706787, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8708926439285278, + "num_tokens": 439978908.0, + "step": 11531 + }, + { + "epoch": 1.4669889327057626, + "ewc_loss": 0.007839002646505833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.83900250098668e-05, + "grad_norm": 3.931886672973633, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8747782707214355, + "num_tokens": 440013448.0, + "step": 11532 + }, + { + "epoch": 1.467116142984353, + "ewc_loss": 0.007856895215809345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.856894808355719e-05, + "grad_norm": 3.8949551582336426, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8784871101379395, + "num_tokens": 440048377.0, + "step": 11533 + }, + { + "epoch": 1.4672433532629436, + "ewc_loss": 0.007839486934244633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.839487079763785e-05, + "grad_norm": 3.851323127746582, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8668944835662842, + "num_tokens": 440089806.0, + "step": 11534 + }, + { + "epoch": 1.4673705635415342, + "ewc_loss": 0.00781828910112381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.818289304850623e-05, + "grad_norm": 3.961392879486084, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8803956508636475, + "num_tokens": 440124209.0, + "step": 11535 + }, + { + "epoch": 1.4674977738201247, + "ewc_loss": 0.007876860909163952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8768607636448e-05, + "grad_norm": 3.9391520023345947, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8747581243515015, + "num_tokens": 440158349.0, + "step": 11536 + }, + { + "epoch": 1.4676249840987152, + "ewc_loss": 0.007832991890609264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.832991832401603e-05, + "grad_norm": 3.8627963066101074, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8771703243255615, + "num_tokens": 440195220.0, + "step": 11537 + }, + { + "epoch": 1.4677521943773058, + "ewc_loss": 0.007806774228811264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.806774374330416e-05, + "grad_norm": 3.8504910469055176, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8751305937767029, + "num_tokens": 440235158.0, + "step": 11538 + }, + { + "epoch": 1.4678794046558963, + "ewc_loss": 0.00782095454633236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.820954488124698e-05, + "grad_norm": 3.8638980388641357, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8708513975143433, + "num_tokens": 440271879.0, + "step": 11539 + }, + { + "epoch": 1.4680066149344868, + "ewc_loss": 0.007846896536648273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.846896187402308e-05, + "grad_norm": 3.880843162536621, + "learning_rate": 1e-06, + "loss": 0.3713, + 
"mean_token_accuracy": 0.8734942078590393, + "num_tokens": 440306454.0, + "step": 11540 + }, + { + "epoch": 1.4681338252130773, + "ewc_loss": 0.00785072986036539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.850729889469221e-05, + "grad_norm": 3.908597707748413, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8619106411933899, + "num_tokens": 440341185.0, + "step": 11541 + }, + { + "epoch": 1.4682610354916679, + "ewc_loss": 0.00786975584924221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86975579103455e-05, + "grad_norm": 3.859881639480591, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8738768100738525, + "num_tokens": 440379168.0, + "step": 11542 + }, + { + "epoch": 1.4683882457702582, + "ewc_loss": 0.007832081988453865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.83208233769983e-05, + "grad_norm": 3.8501665592193604, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8766137957572937, + "num_tokens": 440417945.0, + "step": 11543 + }, + { + "epoch": 1.4685154560488487, + "ewc_loss": 0.007848142646253109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.848142558941618e-05, + "grad_norm": 3.899871349334717, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8592157363891602, + "num_tokens": 440457864.0, + "step": 11544 + }, + { + "epoch": 1.4686426663274392, + "ewc_loss": 0.007879002019762993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.879002077970654e-05, + "grad_norm": 3.8840138912200928, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8694130182266235, + "num_tokens": 440491709.0, + "step": 11545 + }, + { + "epoch": 1.4687698766060298, + "ewc_loss": 0.007868736982345581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868737156968564e-05, + "grad_norm": 3.809791088104248, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.881525456905365, + "num_tokens": 440533570.0, + "step": 11546 + }, + { + "epoch": 
1.4688970868846203, + "ewc_loss": 0.007824969477951527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.824969361536205e-05, + "grad_norm": 3.8800997734069824, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8816970586776733, + "num_tokens": 440568360.0, + "step": 11547 + }, + { + "epoch": 1.4690242971632108, + "ewc_loss": 0.00789360050112009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.893600559327751e-05, + "grad_norm": 3.8264710903167725, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8843915462493896, + "num_tokens": 440606848.0, + "step": 11548 + }, + { + "epoch": 1.4691515074418013, + "ewc_loss": 0.007837780751287937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.837780867703259e-05, + "grad_norm": 3.9121804237365723, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8868207931518555, + "num_tokens": 440638254.0, + "step": 11549 + }, + { + "epoch": 1.4692787177203919, + "ewc_loss": 0.007898914627730846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.898914918769151e-05, + "grad_norm": 3.8686985969543457, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8806450366973877, + "num_tokens": 440674122.0, + "step": 11550 + }, + { + "epoch": 1.4694059279989824, + "ewc_loss": 0.007851014845073223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.851015107007697e-05, + "grad_norm": 3.784663677215576, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8883248567581177, + "num_tokens": 440718362.0, + "step": 11551 + }, + { + "epoch": 1.4695331382775727, + "ewc_loss": 0.007802295498549938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.802295294823125e-05, + "grad_norm": 3.838387966156006, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8810302019119263, + "num_tokens": 440755971.0, + "step": 11552 + }, + { + "epoch": 1.4696603485561632, + "ewc_loss": 0.007863664999604225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.863665086915717e-05, + "grad_norm": 3.8816051483154297, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8718799352645874, + "num_tokens": 440794328.0, + "step": 11553 + }, + { + "epoch": 1.4697875588347538, + "ewc_loss": 0.007843606173992157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.843605999369174e-05, + "grad_norm": 3.865455389022827, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8612072467803955, + "num_tokens": 440833383.0, + "step": 11554 + }, + { + "epoch": 1.4699147691133443, + "ewc_loss": 0.007827814668416977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.82781426096335e-05, + "grad_norm": 3.7944672107696533, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8816729187965393, + "num_tokens": 440875502.0, + "step": 11555 + }, + { + "epoch": 1.4700419793919348, + "ewc_loss": 0.007801415398716927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.80141563154757e-05, + "grad_norm": 3.836317777633667, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8765193223953247, + "num_tokens": 440915259.0, + "step": 11556 + }, + { + "epoch": 1.4701691896705253, + "ewc_loss": 0.007849817164242268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.849817484384403e-05, + "grad_norm": 3.8663363456726074, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8719790577888489, + "num_tokens": 440951660.0, + "step": 11557 + }, + { + "epoch": 1.4702963999491159, + "ewc_loss": 0.00783421378582716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.834214193280786e-05, + "grad_norm": 3.832533597946167, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8735215663909912, + "num_tokens": 440992533.0, + "step": 11558 + }, + { + "epoch": 1.4704236102277064, + "ewc_loss": 0.007792504969984293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.79250476625748e-05, + "grad_norm": 3.8993351459503174, + "learning_rate": 1e-06, + "loss": 0.3933, + 
"mean_token_accuracy": 0.8626894950866699, + "num_tokens": 441027360.0, + "step": 11559 + }, + { + "epoch": 1.470550820506297, + "ewc_loss": 0.007860518991947174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.860518962843344e-05, + "grad_norm": 3.8226492404937744, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8795171976089478, + "num_tokens": 441071746.0, + "step": 11560 + }, + { + "epoch": 1.4706780307848875, + "ewc_loss": 0.00776015967130661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.760159496683627e-05, + "grad_norm": 3.894028425216675, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8736793398857117, + "num_tokens": 441105378.0, + "step": 11561 + }, + { + "epoch": 1.470805241063478, + "ewc_loss": 0.007843985222280025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.843985076760873e-05, + "grad_norm": 3.8285505771636963, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8749791383743286, + "num_tokens": 441150000.0, + "step": 11562 + }, + { + "epoch": 1.4709324513420685, + "ewc_loss": 0.007765850052237511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76585002313368e-05, + "grad_norm": 3.8588223457336426, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8674334287643433, + "num_tokens": 441188426.0, + "step": 11563 + }, + { + "epoch": 1.471059661620659, + "ewc_loss": 0.007811978925019503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.811978866811842e-05, + "grad_norm": 3.8704957962036133, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8800764083862305, + "num_tokens": 441226028.0, + "step": 11564 + }, + { + "epoch": 1.4711868718992496, + "ewc_loss": 0.007781571708619595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.781571912346408e-05, + "grad_norm": 3.898185968399048, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8756084442138672, + "num_tokens": 441265788.0, + "step": 11565 + }, + { + "epoch": 
1.47131408217784, + "ewc_loss": 0.007800048682838678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.800048479111865e-05, + "grad_norm": 3.9000182151794434, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8850432634353638, + "num_tokens": 441297280.0, + "step": 11566 + }, + { + "epoch": 1.4714412924564304, + "ewc_loss": 0.007794805336743593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.794805424055085e-05, + "grad_norm": 3.8826956748962402, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8737962245941162, + "num_tokens": 441331229.0, + "step": 11567 + }, + { + "epoch": 1.471568502735021, + "ewc_loss": 0.007783414330333471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.783414184814319e-05, + "grad_norm": 3.842996120452881, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8787916898727417, + "num_tokens": 441371064.0, + "step": 11568 + }, + { + "epoch": 1.4716957130136115, + "ewc_loss": 0.007756928913295269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.75692897150293e-05, + "grad_norm": 3.9109091758728027, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8580043315887451, + "num_tokens": 441405706.0, + "step": 11569 + }, + { + "epoch": 1.471822923292202, + "ewc_loss": 0.007828805595636368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.828805246390402e-05, + "grad_norm": 3.8435301780700684, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8668680191040039, + "num_tokens": 441445966.0, + "step": 11570 + }, + { + "epoch": 1.4719501335707925, + "ewc_loss": 0.007766995113343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.766995258862153e-05, + "grad_norm": 3.9631030559539795, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8581615686416626, + "num_tokens": 441478916.0, + "step": 11571 + }, + { + "epoch": 1.472077343849383, + "ewc_loss": 0.007870722562074661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.870722765801474e-05, + "grad_norm": 3.8329312801361084, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8722509145736694, + "num_tokens": 441519245.0, + "step": 11572 + }, + { + "epoch": 1.4722045541279736, + "ewc_loss": 0.007746163289994001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.746163464616984e-05, + "grad_norm": 3.8644754886627197, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8671774864196777, + "num_tokens": 441557485.0, + "step": 11573 + }, + { + "epoch": 1.472331764406564, + "ewc_loss": 0.00784274935722351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.842749619157985e-05, + "grad_norm": 3.8472440242767334, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8846718072891235, + "num_tokens": 441596042.0, + "step": 11574 + }, + { + "epoch": 1.4724589746851546, + "ewc_loss": 0.007805889472365379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.805889617884532e-05, + "grad_norm": 3.8730547428131104, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.884411096572876, + "num_tokens": 441634391.0, + "step": 11575 + }, + { + "epoch": 1.472586184963745, + "ewc_loss": 0.007826513610780239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.826514047337696e-05, + "grad_norm": 3.8986668586730957, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8662236928939819, + "num_tokens": 441672201.0, + "step": 11576 + }, + { + "epoch": 1.4727133952423355, + "ewc_loss": 0.0078425919637084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.842591730877757e-05, + "grad_norm": 3.8438849449157715, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.872834324836731, + "num_tokens": 441713350.0, + "step": 11577 + }, + { + "epoch": 1.472840605520926, + "ewc_loss": 0.007796166930347681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.796166755724698e-05, + "grad_norm": 3.842421293258667, + "learning_rate": 1e-06, + "loss": 0.3693, + 
"mean_token_accuracy": 0.8733260035514832, + "num_tokens": 441754677.0, + "step": 11578 + }, + { + "epoch": 1.4729678157995165, + "ewc_loss": 0.007840137928724289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.840137550374493e-05, + "grad_norm": 3.8355534076690674, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8657735586166382, + "num_tokens": 441797037.0, + "step": 11579 + }, + { + "epoch": 1.473095026078107, + "ewc_loss": 0.007816062308847904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.816062134224921e-05, + "grad_norm": 3.941462278366089, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8736550807952881, + "num_tokens": 441831661.0, + "step": 11580 + }, + { + "epoch": 1.4732222363566976, + "ewc_loss": 0.007865935564041138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.865935913287103e-05, + "grad_norm": 3.778439521789551, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8784589767456055, + "num_tokens": 441876933.0, + "step": 11581 + }, + { + "epoch": 1.473349446635288, + "ewc_loss": 0.007752403151243925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.752403325866908e-05, + "grad_norm": 3.8444905281066895, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8962377905845642, + "num_tokens": 441918015.0, + "step": 11582 + }, + { + "epoch": 1.4734766569138786, + "ewc_loss": 0.007843595929443836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.843595813028514e-05, + "grad_norm": 3.874020576477051, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8714528679847717, + "num_tokens": 441956573.0, + "step": 11583 + }, + { + "epoch": 1.4736038671924692, + "ewc_loss": 0.00782599113881588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825990905985236e-05, + "grad_norm": 3.810988664627075, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8871957659721375, + "num_tokens": 442002064.0, + "step": 11584 + }, + { + "epoch": 
1.4737310774710597, + "ewc_loss": 0.007753394544124603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.75339431129396e-05, + "grad_norm": 3.873795747756958, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8811503052711487, + "num_tokens": 442040122.0, + "step": 11585 + }, + { + "epoch": 1.4738582877496502, + "ewc_loss": 0.007825011387467384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825011562090367e-05, + "grad_norm": 3.864504098892212, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8780019283294678, + "num_tokens": 442079698.0, + "step": 11586 + }, + { + "epoch": 1.4739854980282407, + "ewc_loss": 0.007783047389239073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.783047476550564e-05, + "grad_norm": 3.86063289642334, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8872854709625244, + "num_tokens": 442118252.0, + "step": 11587 + }, + { + "epoch": 1.4741127083068313, + "ewc_loss": 0.007763662841171026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.763662870274857e-05, + "grad_norm": 3.889603853225708, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8782151937484741, + "num_tokens": 442154675.0, + "step": 11588 + }, + { + "epoch": 1.4742399185854218, + "ewc_loss": 0.0077723050490021706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.772305252728984e-05, + "grad_norm": 3.9030814170837402, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.882108211517334, + "num_tokens": 442189449.0, + "step": 11589 + }, + { + "epoch": 1.4743671288640123, + "ewc_loss": 0.007767043076455593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.767043280182406e-05, + "grad_norm": 3.9345035552978516, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8761075735092163, + "num_tokens": 442222844.0, + "step": 11590 + }, + { + "epoch": 1.4744943391426026, + "ewc_loss": 0.007769849617034197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.769849617034197e-05, + "grad_norm": 3.877183437347412, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8827636241912842, + "num_tokens": 442258599.0, + "step": 11591 + }, + { + "epoch": 1.4746215494211932, + "ewc_loss": 0.0077364942990243435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.736494444543496e-05, + "grad_norm": 3.8542075157165527, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8816133737564087, + "num_tokens": 442294775.0, + "step": 11592 + }, + { + "epoch": 1.4747487596997837, + "ewc_loss": 0.0077537172473967075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.75371736381203e-05, + "grad_norm": 3.836949586868286, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8847053050994873, + "num_tokens": 442330313.0, + "step": 11593 + }, + { + "epoch": 1.4748759699783742, + "ewc_loss": 0.007746884599328041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.74688451201655e-05, + "grad_norm": 3.915369749069214, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8739113807678223, + "num_tokens": 442364988.0, + "step": 11594 + }, + { + "epoch": 1.4750031802569648, + "ewc_loss": 0.007805696222931147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.805696077411994e-05, + "grad_norm": 3.843431234359741, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8758545517921448, + "num_tokens": 442406927.0, + "step": 11595 + }, + { + "epoch": 1.4751303905355553, + "ewc_loss": 0.00773491058498621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.734910468570888e-05, + "grad_norm": 3.8920416831970215, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8784627914428711, + "num_tokens": 442442092.0, + "step": 11596 + }, + { + "epoch": 1.4752576008141458, + "ewc_loss": 0.007805758621543646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.805758650647476e-05, + "grad_norm": 3.8289108276367188, + "learning_rate": 1e-06, + "loss": 0.3379, + 
"mean_token_accuracy": 0.8819234371185303, + "num_tokens": 442483013.0, + "step": 11597 + }, + { + "epoch": 1.4753848110927363, + "ewc_loss": 0.0077480473555624485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.748047210043296e-05, + "grad_norm": 3.8156087398529053, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8802766799926758, + "num_tokens": 442525052.0, + "step": 11598 + }, + { + "epoch": 1.4755120213713269, + "ewc_loss": 0.0077780624851584435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.778062717989087e-05, + "grad_norm": 3.8749802112579346, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.867154598236084, + "num_tokens": 442562543.0, + "step": 11599 + }, + { + "epoch": 1.4756392316499174, + "ewc_loss": 0.007819946855306625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.819946767995134e-05, + "grad_norm": 3.881230354309082, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8800286650657654, + "num_tokens": 442604976.0, + "step": 11600 + }, + { + "epoch": 1.4757664419285077, + "ewc_loss": 0.00777463847771287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.774638652335852e-05, + "grad_norm": 3.823093891143799, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8861034512519836, + "num_tokens": 442642271.0, + "step": 11601 + }, + { + "epoch": 1.4758936522070982, + "ewc_loss": 0.007743001449853182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.74300133343786e-05, + "grad_norm": 3.852863311767578, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8733877539634705, + "num_tokens": 442680508.0, + "step": 11602 + }, + { + "epoch": 1.4760208624856888, + "ewc_loss": 0.007791110780090094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.791110692778602e-05, + "grad_norm": 3.878985643386841, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8662165403366089, + "num_tokens": 442718391.0, + "step": 11603 + }, + { + "epoch": 
1.4761480727642793, + "ewc_loss": 0.007782097905874252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.782097964081913e-05, + "grad_norm": 3.8565828800201416, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8857997059822083, + "num_tokens": 442757688.0, + "step": 11604 + }, + { + "epoch": 1.4762752830428698, + "ewc_loss": 0.007755194325000048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.755194383207709e-05, + "grad_norm": 3.841623067855835, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8870087265968323, + "num_tokens": 442796704.0, + "step": 11605 + }, + { + "epoch": 1.4764024933214603, + "ewc_loss": 0.007767407223582268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.767407078063115e-05, + "grad_norm": 3.838115930557251, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8846200704574585, + "num_tokens": 442838791.0, + "step": 11606 + }, + { + "epoch": 1.4765297036000509, + "ewc_loss": 0.007741403765976429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.741403533145785e-05, + "grad_norm": 3.7990665435791016, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8762128949165344, + "num_tokens": 442882368.0, + "step": 11607 + }, + { + "epoch": 1.4766569138786414, + "ewc_loss": 0.007726739626377821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.72673956817016e-05, + "grad_norm": 3.877960205078125, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8751601576805115, + "num_tokens": 442923768.0, + "step": 11608 + }, + { + "epoch": 1.476784124157232, + "ewc_loss": 0.0077740345150232315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.774034747853875e-05, + "grad_norm": 3.893051862716675, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8896089792251587, + "num_tokens": 442955656.0, + "step": 11609 + }, + { + "epoch": 1.4769113344358225, + "ewc_loss": 0.007729858625680208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.72985877119936e-05, + "grad_norm": 3.8558411598205566, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.866156280040741, + "num_tokens": 442997323.0, + "step": 11610 + }, + { + "epoch": 1.477038544714413, + "ewc_loss": 0.007708118762820959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.708118937443942e-05, + "grad_norm": 3.9210407733917236, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8626899719238281, + "num_tokens": 443033454.0, + "step": 11611 + }, + { + "epoch": 1.4771657549930035, + "ewc_loss": 0.007760067004710436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.760067092021927e-05, + "grad_norm": 3.821176767349243, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8755031824111938, + "num_tokens": 443077902.0, + "step": 11612 + }, + { + "epoch": 1.477292965271594, + "ewc_loss": 0.007684711366891861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.684711454203352e-05, + "grad_norm": 3.852099657058716, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8664566874504089, + "num_tokens": 443116515.0, + "step": 11613 + }, + { + "epoch": 1.4774201755501846, + "ewc_loss": 0.007756332866847515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.756333070574328e-05, + "grad_norm": 3.828122854232788, + "learning_rate": 1e-06, + "loss": 0.2819, + "mean_token_accuracy": 0.9008457660675049, + "num_tokens": 443154124.0, + "step": 11614 + }, + { + "epoch": 1.477547385828775, + "ewc_loss": 0.007738182321190834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.738182466709986e-05, + "grad_norm": 3.823099374771118, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.883381724357605, + "num_tokens": 443195852.0, + "step": 11615 + }, + { + "epoch": 1.4776745961073654, + "ewc_loss": 0.0077376216650009155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.737621490377933e-05, + "grad_norm": 3.9181087017059326, + "learning_rate": 1e-06, + "loss": 0.3726, + 
"mean_token_accuracy": 0.8683810830116272, + "num_tokens": 443229850.0, + "step": 11616 + }, + { + "epoch": 1.477801806385956, + "ewc_loss": 0.007805926725268364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.805926725268364e-05, + "grad_norm": 3.873047113418579, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8663710355758667, + "num_tokens": 443274481.0, + "step": 11617 + }, + { + "epoch": 1.4779290166645465, + "ewc_loss": 0.00772780179977417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.727801857981831e-05, + "grad_norm": 3.8295376300811768, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8805029988288879, + "num_tokens": 443311737.0, + "step": 11618 + }, + { + "epoch": 1.478056226943137, + "ewc_loss": 0.007741213776171207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.741213630652055e-05, + "grad_norm": 3.911806344985962, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8680428266525269, + "num_tokens": 443345310.0, + "step": 11619 + }, + { + "epoch": 1.4781834372217275, + "ewc_loss": 0.007811222691088915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.811222894815728e-05, + "grad_norm": 3.821171998977661, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8725100755691528, + "num_tokens": 443392869.0, + "step": 11620 + }, + { + "epoch": 1.478310647500318, + "ewc_loss": 0.007716390769928694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716390973655507e-05, + "grad_norm": 3.845506429672241, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8720359802246094, + "num_tokens": 443432842.0, + "step": 11621 + }, + { + "epoch": 1.4784378577789086, + "ewc_loss": 0.00777996052056551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.779960287734866e-05, + "grad_norm": 3.9138271808624268, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8776808977127075, + "num_tokens": 443465954.0, + "step": 11622 + }, + { + "epoch": 
1.478565068057499, + "ewc_loss": 0.007806351408362389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.806351641193032e-05, + "grad_norm": 3.84973406791687, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8657116889953613, + "num_tokens": 443508506.0, + "step": 11623 + }, + { + "epoch": 1.4786922783360896, + "ewc_loss": 0.007750016171485186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.750016084173694e-05, + "grad_norm": 3.897803783416748, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8745520710945129, + "num_tokens": 443543575.0, + "step": 11624 + }, + { + "epoch": 1.47881948861468, + "ewc_loss": 0.007818406447768211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.818406447768211e-05, + "grad_norm": 3.871805429458618, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8763589262962341, + "num_tokens": 443578504.0, + "step": 11625 + }, + { + "epoch": 1.4789466988932705, + "ewc_loss": 0.007779744919389486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.779744919389486e-05, + "grad_norm": 3.8547160625457764, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8873237371444702, + "num_tokens": 443614647.0, + "step": 11626 + }, + { + "epoch": 1.479073909171861, + "ewc_loss": 0.007798307575285435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.798307342454791e-05, + "grad_norm": 3.8722195625305176, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8713805675506592, + "num_tokens": 443649244.0, + "step": 11627 + }, + { + "epoch": 1.4792011194504515, + "ewc_loss": 0.007807926274836063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.807926158420742e-05, + "grad_norm": 3.847236156463623, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8703498840332031, + "num_tokens": 443688374.0, + "step": 11628 + }, + { + "epoch": 1.479328329729042, + "ewc_loss": 0.007794530596584082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.794530392857268e-05, + "grad_norm": 3.8693106174468994, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.847273051738739, + "num_tokens": 443730929.0, + "step": 11629 + }, + { + "epoch": 1.4794555400076326, + "ewc_loss": 0.00781209534034133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.812095282133669e-05, + "grad_norm": 3.9189324378967285, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8713968396186829, + "num_tokens": 443763067.0, + "step": 11630 + }, + { + "epoch": 1.479582750286223, + "ewc_loss": 0.00785148050636053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.851480768295005e-05, + "grad_norm": 3.8805532455444336, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8642631769180298, + "num_tokens": 443801273.0, + "step": 11631 + }, + { + "epoch": 1.4797099605648136, + "ewc_loss": 0.0078023020178079605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.802301843184978e-05, + "grad_norm": 3.865817070007324, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8625342845916748, + "num_tokens": 443840394.0, + "step": 11632 + }, + { + "epoch": 1.4798371708434042, + "ewc_loss": 0.00782064814120531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.820648170309141e-05, + "grad_norm": 3.874444007873535, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8720431327819824, + "num_tokens": 443877486.0, + "step": 11633 + }, + { + "epoch": 1.4799643811219947, + "ewc_loss": 0.007841957733035088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.841957994969562e-05, + "grad_norm": 3.859829902648926, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8761138916015625, + "num_tokens": 443913763.0, + "step": 11634 + }, + { + "epoch": 1.4800915914005852, + "ewc_loss": 0.007828138768672943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.828138768672943e-05, + "grad_norm": 3.8608102798461914, + "learning_rate": 1e-06, + "loss": 0.3388, + 
"mean_token_accuracy": 0.8825832605361938, + "num_tokens": 443946840.0, + "step": 11635 + }, + { + "epoch": 1.4802188016791757, + "ewc_loss": 0.007845885120332241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.845885556889698e-05, + "grad_norm": 3.8653252124786377, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8597930669784546, + "num_tokens": 443992439.0, + "step": 11636 + }, + { + "epoch": 1.4803460119577663, + "ewc_loss": 0.007851333357393742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.851333793951198e-05, + "grad_norm": 3.838545799255371, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8846991062164307, + "num_tokens": 444029348.0, + "step": 11637 + }, + { + "epoch": 1.4804732222363568, + "ewc_loss": 0.007847240194678307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.847240340197459e-05, + "grad_norm": 3.8399312496185303, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8825298547744751, + "num_tokens": 444068386.0, + "step": 11638 + }, + { + "epoch": 1.4806004325149473, + "ewc_loss": 0.007841787301003933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.841787009965628e-05, + "grad_norm": 3.859642267227173, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8641592264175415, + "num_tokens": 444110307.0, + "step": 11639 + }, + { + "epoch": 1.4807276427935376, + "ewc_loss": 0.007864266633987427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86426680861041e-05, + "grad_norm": 3.8303346633911133, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.877048671245575, + "num_tokens": 444153919.0, + "step": 11640 + }, + { + "epoch": 1.4808548530721282, + "ewc_loss": 0.007829741574823856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.829741662135348e-05, + "grad_norm": 3.879544973373413, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8765877485275269, + "num_tokens": 444190575.0, + "step": 11641 + }, + { + "epoch": 
1.4809820633507187, + "ewc_loss": 0.007865140214562416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.865140651119873e-05, + "grad_norm": 3.8723304271698, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8690228462219238, + "num_tokens": 444226108.0, + "step": 11642 + }, + { + "epoch": 1.4811092736293092, + "ewc_loss": 0.007839279249310493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.83927898737602e-05, + "grad_norm": 3.835944890975952, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8711645603179932, + "num_tokens": 444268162.0, + "step": 11643 + }, + { + "epoch": 1.4812364839078997, + "ewc_loss": 0.007814926095306873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.814926357241347e-05, + "grad_norm": 3.826078414916992, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8602482676506042, + "num_tokens": 444309273.0, + "step": 11644 + }, + { + "epoch": 1.4813636941864903, + "ewc_loss": 0.00783206894993782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.832069240976125e-05, + "grad_norm": 3.952256917953491, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8796831369400024, + "num_tokens": 444338697.0, + "step": 11645 + }, + { + "epoch": 1.4814909044650808, + "ewc_loss": 0.007898773066699505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.898773037595674e-05, + "grad_norm": 3.8431131839752197, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8848066329956055, + "num_tokens": 444377954.0, + "step": 11646 + }, + { + "epoch": 1.4816181147436713, + "ewc_loss": 0.00776674272492528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76674278313294e-05, + "grad_norm": 3.8151354789733887, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8753159046173096, + "num_tokens": 444416996.0, + "step": 11647 + }, + { + "epoch": 1.4817453250222619, + "ewc_loss": 0.007815909571945667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.815909339115024e-05, + "grad_norm": 3.8705949783325195, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8821799755096436, + "num_tokens": 444455550.0, + "step": 11648 + }, + { + "epoch": 1.4818725353008524, + "ewc_loss": 0.007847229018807411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.847228698665276e-05, + "grad_norm": 3.8650827407836914, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8709800243377686, + "num_tokens": 444493979.0, + "step": 11649 + }, + { + "epoch": 1.4819997455794427, + "ewc_loss": 0.007810032460838556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.810032548150048e-05, + "grad_norm": 3.8283674716949463, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8745559453964233, + "num_tokens": 444540622.0, + "step": 11650 + }, + { + "epoch": 1.4821269558580332, + "ewc_loss": 0.007800614461302757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.800614548614249e-05, + "grad_norm": 3.9080140590667725, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.873238205909729, + "num_tokens": 444577772.0, + "step": 11651 + }, + { + "epoch": 1.4822541661366238, + "ewc_loss": 0.007839232683181763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.839232421247289e-05, + "grad_norm": 3.901320457458496, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8615890741348267, + "num_tokens": 444615523.0, + "step": 11652 + }, + { + "epoch": 1.4823813764152143, + "ewc_loss": 0.007826532237231731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.826532237231731e-05, + "grad_norm": 3.82145619392395, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8731545209884644, + "num_tokens": 444659785.0, + "step": 11653 + }, + { + "epoch": 1.4825085866938048, + "ewc_loss": 0.007784141693264246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.784141780575737e-05, + "grad_norm": 3.925734758377075, + "learning_rate": 1e-06, + "loss": 0.3756, + 
"mean_token_accuracy": 0.8715043067932129, + "num_tokens": 444700806.0, + "step": 11654 + }, + { + "epoch": 1.4826357969723953, + "ewc_loss": 0.007870396599173546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.870396802900359e-05, + "grad_norm": 3.909311532974243, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8743084669113159, + "num_tokens": 444734320.0, + "step": 11655 + }, + { + "epoch": 1.4827630072509859, + "ewc_loss": 0.007821092382073402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.821092731319368e-05, + "grad_norm": 3.874894857406616, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8694592714309692, + "num_tokens": 444770215.0, + "step": 11656 + }, + { + "epoch": 1.4828902175295764, + "ewc_loss": 0.007799222599714994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.799222657922655e-05, + "grad_norm": 3.854278326034546, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8724950551986694, + "num_tokens": 444805432.0, + "step": 11657 + }, + { + "epoch": 1.483017427808167, + "ewc_loss": 0.00781919714063406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.819197344360873e-05, + "grad_norm": 3.9467523097991943, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8760385513305664, + "num_tokens": 444832398.0, + "step": 11658 + }, + { + "epoch": 1.4831446380867574, + "ewc_loss": 0.007887711748480797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.887711399234831e-05, + "grad_norm": 3.782933235168457, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8776272535324097, + "num_tokens": 444876128.0, + "step": 11659 + }, + { + "epoch": 1.483271848365348, + "ewc_loss": 0.007785468362271786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.785468187648803e-05, + "grad_norm": 3.8644070625305176, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8713769912719727, + "num_tokens": 444915188.0, + "step": 11660 + }, + { + "epoch": 
1.4833990586439385, + "ewc_loss": 0.00788905844092369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.889058906584978e-05, + "grad_norm": 3.817783832550049, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8793721199035645, + "num_tokens": 444953528.0, + "step": 11661 + }, + { + "epoch": 1.483526268922529, + "ewc_loss": 0.007851094007492065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.851094414945692e-05, + "grad_norm": 3.8554513454437256, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8683192729949951, + "num_tokens": 444996469.0, + "step": 11662 + }, + { + "epoch": 1.4836534792011196, + "ewc_loss": 0.007877110503613949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.877110328990966e-05, + "grad_norm": 3.7961647510528564, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8908272981643677, + "num_tokens": 445034298.0, + "step": 11663 + }, + { + "epoch": 1.48378068947971, + "ewc_loss": 0.007828783243894577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.82878341851756e-05, + "grad_norm": 3.8166747093200684, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8728702068328857, + "num_tokens": 445079010.0, + "step": 11664 + }, + { + "epoch": 1.4839078997583004, + "ewc_loss": 0.007876958698034286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.87695826147683e-05, + "grad_norm": 3.834564208984375, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8870607614517212, + "num_tokens": 445116009.0, + "step": 11665 + }, + { + "epoch": 1.484035110036891, + "ewc_loss": 0.00786297395825386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.862973870942369e-05, + "grad_norm": 3.8393023014068604, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8860560655593872, + "num_tokens": 445156313.0, + "step": 11666 + }, + { + "epoch": 1.4841623203154815, + "ewc_loss": 0.007841039448976517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.84103904152289e-05, + "grad_norm": 3.9236621856689453, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.874814510345459, + "num_tokens": 445187424.0, + "step": 11667 + }, + { + "epoch": 1.484289530594072, + "ewc_loss": 0.007879061624407768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.87906174082309e-05, + "grad_norm": 3.8151681423187256, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8765867948532104, + "num_tokens": 445233448.0, + "step": 11668 + }, + { + "epoch": 1.4844167408726625, + "ewc_loss": 0.007791904732584953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.79190449975431e-05, + "grad_norm": 3.8389620780944824, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8855852484703064, + "num_tokens": 445273663.0, + "step": 11669 + }, + { + "epoch": 1.484543951151253, + "ewc_loss": 0.007835880853235722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.835881115170196e-05, + "grad_norm": 3.8674418926239014, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8808460831642151, + "num_tokens": 445313790.0, + "step": 11670 + }, + { + "epoch": 1.4846711614298436, + "ewc_loss": 0.007811091840267181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.811091927578673e-05, + "grad_norm": 3.868396043777466, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8832446932792664, + "num_tokens": 445351818.0, + "step": 11671 + }, + { + "epoch": 1.484798371708434, + "ewc_loss": 0.007803311571478844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.803311746101826e-05, + "grad_norm": 3.904245376586914, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8518747687339783, + "num_tokens": 445391763.0, + "step": 11672 + }, + { + "epoch": 1.4849255819870246, + "ewc_loss": 0.007815887220203876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.815887511242181e-05, + "grad_norm": 3.9003193378448486, + "learning_rate": 1e-06, + "loss": 0.3803, + 
"mean_token_accuracy": 0.8673131465911865, + "num_tokens": 445426723.0, + "step": 11673 + }, + { + "epoch": 1.485052792265615, + "ewc_loss": 0.007799237966537476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.799237937433645e-05, + "grad_norm": 3.8359382152557373, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.879889965057373, + "num_tokens": 445465635.0, + "step": 11674 + }, + { + "epoch": 1.4851800025442055, + "ewc_loss": 0.0077634030021727085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76340311858803e-05, + "grad_norm": 3.8776466846466064, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8743166923522949, + "num_tokens": 445502819.0, + "step": 11675 + }, + { + "epoch": 1.485307212822796, + "ewc_loss": 0.007820354774594307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.82035494921729e-05, + "grad_norm": 3.864320755004883, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8833739161491394, + "num_tokens": 445541861.0, + "step": 11676 + }, + { + "epoch": 1.4854344231013865, + "ewc_loss": 0.007780618034303188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.780618034303188e-05, + "grad_norm": 3.8685779571533203, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8905189037322998, + "num_tokens": 445578058.0, + "step": 11677 + }, + { + "epoch": 1.485561633379977, + "ewc_loss": 0.007793215569108725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.793215627316386e-05, + "grad_norm": 3.9071996212005615, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8599997162818909, + "num_tokens": 445612896.0, + "step": 11678 + }, + { + "epoch": 1.4856888436585676, + "ewc_loss": 0.00782354548573494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.823545456631109e-05, + "grad_norm": 3.8931257724761963, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8686844110488892, + "num_tokens": 445647849.0, + "step": 11679 + }, + { + "epoch": 
1.485816053937158, + "ewc_loss": 0.007816033437848091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.816033030394465e-05, + "grad_norm": 3.879246711730957, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.861341655254364, + "num_tokens": 445689199.0, + "step": 11680 + }, + { + "epoch": 1.4859432642157486, + "ewc_loss": 0.007810645271092653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.810645183781162e-05, + "grad_norm": 3.874216318130493, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8740014433860779, + "num_tokens": 445727869.0, + "step": 11681 + }, + { + "epoch": 1.4860704744943392, + "ewc_loss": 0.00782002229243517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.820022437954322e-05, + "grad_norm": 3.849365472793579, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8771333694458008, + "num_tokens": 445770105.0, + "step": 11682 + }, + { + "epoch": 1.4861976847729297, + "ewc_loss": 0.007792501710355282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.792501855874434e-05, + "grad_norm": 3.876580238342285, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.856572687625885, + "num_tokens": 445812950.0, + "step": 11683 + }, + { + "epoch": 1.4863248950515202, + "ewc_loss": 0.007820780389010906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.82078059273772e-05, + "grad_norm": 3.8519093990325928, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8730713129043579, + "num_tokens": 445850811.0, + "step": 11684 + }, + { + "epoch": 1.4864521053301107, + "ewc_loss": 0.007790521252900362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.790521340211853e-05, + "grad_norm": 3.8423876762390137, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.877366304397583, + "num_tokens": 445891967.0, + "step": 11685 + }, + { + "epoch": 1.4865793156087013, + "ewc_loss": 0.007815591990947723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.815592107363045e-05, + "grad_norm": 3.893972158432007, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8765629529953003, + "num_tokens": 445927936.0, + "step": 11686 + }, + { + "epoch": 1.4867065258872918, + "ewc_loss": 0.007825098000466824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825098145985976e-05, + "grad_norm": 3.857222080230713, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.886667013168335, + "num_tokens": 445962481.0, + "step": 11687 + }, + { + "epoch": 1.4868337361658823, + "ewc_loss": 0.007785948924720287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.7859491284471e-05, + "grad_norm": 3.8283116817474365, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8883104920387268, + "num_tokens": 446001445.0, + "step": 11688 + }, + { + "epoch": 1.4869609464444726, + "ewc_loss": 0.007775573525577784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.775573612889275e-05, + "grad_norm": 3.842010498046875, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8921878933906555, + "num_tokens": 446037904.0, + "step": 11689 + }, + { + "epoch": 1.4870881567230632, + "ewc_loss": 0.007797162979841232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.79716283432208e-05, + "grad_norm": 3.853414535522461, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8831843137741089, + "num_tokens": 446077583.0, + "step": 11690 + }, + { + "epoch": 1.4872153670016537, + "ewc_loss": 0.007797167170792818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.797167199896649e-05, + "grad_norm": 3.949244499206543, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8747738599777222, + "num_tokens": 446110123.0, + "step": 11691 + }, + { + "epoch": 1.4873425772802442, + "ewc_loss": 0.007845656014978886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84565563662909e-05, + "grad_norm": 3.867887496948242, + "learning_rate": 1e-06, + "loss": 0.374, + 
"mean_token_accuracy": 0.8737800717353821, + "num_tokens": 446149832.0, + "step": 11692 + }, + { + "epoch": 1.4874697875588347, + "ewc_loss": 0.007760065607726574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.760065636830404e-05, + "grad_norm": 3.855944871902466, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8827023506164551, + "num_tokens": 446184211.0, + "step": 11693 + }, + { + "epoch": 1.4875969978374253, + "ewc_loss": 0.00779561884701252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.79561887611635e-05, + "grad_norm": 3.8510537147521973, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8683685064315796, + "num_tokens": 446229620.0, + "step": 11694 + }, + { + "epoch": 1.4877242081160158, + "ewc_loss": 0.007798075210303068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.798075239406899e-05, + "grad_norm": 3.8691465854644775, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8771244287490845, + "num_tokens": 446267134.0, + "step": 11695 + }, + { + "epoch": 1.4878514183946063, + "ewc_loss": 0.007786428555846214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.786428614053875e-05, + "grad_norm": 3.8982772827148438, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8650059700012207, + "num_tokens": 446303001.0, + "step": 11696 + }, + { + "epoch": 1.4879786286731969, + "ewc_loss": 0.007810161914676428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.81016206019558e-05, + "grad_norm": 3.9507381916046143, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8736560344696045, + "num_tokens": 446334286.0, + "step": 11697 + }, + { + "epoch": 1.4881058389517874, + "ewc_loss": 0.00782568659633398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825686770956963e-05, + "grad_norm": 3.8865480422973633, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.891025722026825, + "num_tokens": 446369547.0, + "step": 11698 + }, + { + "epoch": 
1.4882330492303777, + "ewc_loss": 0.007781819440424442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78181929490529e-05, + "grad_norm": 3.891817092895508, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8608995079994202, + "num_tokens": 446408179.0, + "step": 11699 + }, + { + "epoch": 1.4883602595089682, + "ewc_loss": 0.007826069369912148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.82606948632747e-05, + "grad_norm": 3.8222594261169434, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8776596784591675, + "num_tokens": 446448533.0, + "step": 11700 + }, + { + "epoch": 1.4884874697875587, + "ewc_loss": 0.007771845441311598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.771845412207767e-05, + "grad_norm": 3.821376085281372, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8725292682647705, + "num_tokens": 446489445.0, + "step": 11701 + }, + { + "epoch": 1.4886146800661493, + "ewc_loss": 0.00780788017436862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.807880319887772e-05, + "grad_norm": 3.8566884994506836, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8884292840957642, + "num_tokens": 446526530.0, + "step": 11702 + }, + { + "epoch": 1.4887418903447398, + "ewc_loss": 0.007822058163583279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.82205825089477e-05, + "grad_norm": 3.9935646057128906, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8751717209815979, + "num_tokens": 446558315.0, + "step": 11703 + }, + { + "epoch": 1.4888691006233303, + "ewc_loss": 0.00789374578744173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.893745350884274e-05, + "grad_norm": 3.817592144012451, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8782445788383484, + "num_tokens": 446598548.0, + "step": 11704 + }, + { + "epoch": 1.4889963109019209, + "ewc_loss": 0.007754235994070768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.75423613958992e-05, + "grad_norm": 3.8921337127685547, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8713111281394958, + "num_tokens": 446632111.0, + "step": 11705 + }, + { + "epoch": 1.4891235211805114, + "ewc_loss": 0.007871377281844616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.87137687439099e-05, + "grad_norm": 3.872192144393921, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8840970993041992, + "num_tokens": 446669296.0, + "step": 11706 + }, + { + "epoch": 1.489250731459102, + "ewc_loss": 0.00783788226544857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.837882731109858e-05, + "grad_norm": 3.8295743465423584, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8773549795150757, + "num_tokens": 446707891.0, + "step": 11707 + }, + { + "epoch": 1.4893779417376924, + "ewc_loss": 0.007818427868187428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.818427548045292e-05, + "grad_norm": 3.8360352516174316, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8658397793769836, + "num_tokens": 446749455.0, + "step": 11708 + }, + { + "epoch": 1.489505152016283, + "ewc_loss": 0.007844937965273857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84493749961257e-05, + "grad_norm": 3.848832607269287, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8869695067405701, + "num_tokens": 446787001.0, + "step": 11709 + }, + { + "epoch": 1.4896323622948735, + "ewc_loss": 0.00785066932439804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.850669499021024e-05, + "grad_norm": 3.903381824493408, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8755391836166382, + "num_tokens": 446827262.0, + "step": 11710 + }, + { + "epoch": 1.489759572573464, + "ewc_loss": 0.00786192249506712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86192249506712e-05, + "grad_norm": 3.894700288772583, + "learning_rate": 1e-06, + "loss": 0.3169, + 
"mean_token_accuracy": 0.8897046446800232, + "num_tokens": 446860514.0, + "step": 11711 + }, + { + "epoch": 1.4898867828520546, + "ewc_loss": 0.00784269254654646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.842692139092833e-05, + "grad_norm": 3.8714873790740967, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8670129179954529, + "num_tokens": 446902605.0, + "step": 11712 + }, + { + "epoch": 1.490013993130645, + "ewc_loss": 0.007831612601876259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.831613038433716e-05, + "grad_norm": 3.879291534423828, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8810226321220398, + "num_tokens": 446938638.0, + "step": 11713 + }, + { + "epoch": 1.4901412034092354, + "ewc_loss": 0.007853242568671703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.853242277633399e-05, + "grad_norm": 3.8476526737213135, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8811265826225281, + "num_tokens": 446979286.0, + "step": 11714 + }, + { + "epoch": 1.490268413687826, + "ewc_loss": 0.00782561395317316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825614011380821e-05, + "grad_norm": 3.898228168487549, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.860786497592926, + "num_tokens": 447017405.0, + "step": 11715 + }, + { + "epoch": 1.4903956239664164, + "ewc_loss": 0.007860773243010044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86077362135984e-05, + "grad_norm": 3.849865198135376, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8760399222373962, + "num_tokens": 447056733.0, + "step": 11716 + }, + { + "epoch": 1.490522834245007, + "ewc_loss": 0.007827243767678738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.827243825886399e-05, + "grad_norm": 3.849612236022949, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8727807998657227, + "num_tokens": 447094493.0, + "step": 11717 + }, + { + "epoch": 
1.4906500445235975, + "ewc_loss": 0.007842961698770523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.842962077120319e-05, + "grad_norm": 3.859785795211792, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8900289535522461, + "num_tokens": 447134039.0, + "step": 11718 + }, + { + "epoch": 1.490777254802188, + "ewc_loss": 0.007835976779460907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.835976430214942e-05, + "grad_norm": 3.8582680225372314, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8828865885734558, + "num_tokens": 447168320.0, + "step": 11719 + }, + { + "epoch": 1.4909044650807786, + "ewc_loss": 0.007843547500669956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8435470641125e-05, + "grad_norm": 3.917900323867798, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8606698513031006, + "num_tokens": 447205640.0, + "step": 11720 + }, + { + "epoch": 1.491031675359369, + "ewc_loss": 0.00787216518074274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.872164860600606e-05, + "grad_norm": 3.840805768966675, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8889862298965454, + "num_tokens": 447239831.0, + "step": 11721 + }, + { + "epoch": 1.4911588856379596, + "ewc_loss": 0.007819089107215405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.819088932592422e-05, + "grad_norm": 3.875473737716675, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8607312440872192, + "num_tokens": 447281100.0, + "step": 11722 + }, + { + "epoch": 1.49128609591655, + "ewc_loss": 0.007872221060097218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.872220885474235e-05, + "grad_norm": 3.9132795333862305, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8547837734222412, + "num_tokens": 447316137.0, + "step": 11723 + }, + { + "epoch": 1.4914133061951405, + "ewc_loss": 0.00785576831549406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.855768490117043e-05, + "grad_norm": 3.8757219314575195, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8675440549850464, + "num_tokens": 447357922.0, + "step": 11724 + }, + { + "epoch": 1.491540516473731, + "ewc_loss": 0.00783534161746502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.835341239115223e-05, + "grad_norm": 3.8787286281585693, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8742572069168091, + "num_tokens": 447394905.0, + "step": 11725 + }, + { + "epoch": 1.4916677267523215, + "ewc_loss": 0.00788073055446148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.880730845499784e-05, + "grad_norm": 3.868558645248413, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8873723745346069, + "num_tokens": 447435379.0, + "step": 11726 + }, + { + "epoch": 1.491794937030912, + "ewc_loss": 0.007857331074774265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.85733136581257e-05, + "grad_norm": 3.8618950843811035, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8671252727508545, + "num_tokens": 447473058.0, + "step": 11727 + }, + { + "epoch": 1.4919221473095026, + "ewc_loss": 0.00786215253174305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.862152415327728e-05, + "grad_norm": 3.880836248397827, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.884223997592926, + "num_tokens": 447510966.0, + "step": 11728 + }, + { + "epoch": 1.492049357588093, + "ewc_loss": 0.00785308051854372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.853080751374364e-05, + "grad_norm": 3.7756378650665283, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8780311346054077, + "num_tokens": 447554747.0, + "step": 11729 + }, + { + "epoch": 1.4921765678666836, + "ewc_loss": 0.007803722750395536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.803722837707028e-05, + "grad_norm": 3.888414144515991, + "learning_rate": 1e-06, + "loss": 0.3539, + 
"mean_token_accuracy": 0.8777521252632141, + "num_tokens": 447594426.0, + "step": 11730 + }, + { + "epoch": 1.4923037781452742, + "ewc_loss": 0.007894053123891354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.894053123891354e-05, + "grad_norm": 3.854997158050537, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8676166534423828, + "num_tokens": 447634786.0, + "step": 11731 + }, + { + "epoch": 1.4924309884238647, + "ewc_loss": 0.007835060358047485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.835060387151316e-05, + "grad_norm": 3.89788556098938, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8738987445831299, + "num_tokens": 447668682.0, + "step": 11732 + }, + { + "epoch": 1.4925581987024552, + "ewc_loss": 0.00786113552749157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.861135964049026e-05, + "grad_norm": 3.8731653690338135, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8690734505653381, + "num_tokens": 447705566.0, + "step": 11733 + }, + { + "epoch": 1.4926854089810457, + "ewc_loss": 0.007844028063118458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.844028004910797e-05, + "grad_norm": 3.8345279693603516, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8726186156272888, + "num_tokens": 447746385.0, + "step": 11734 + }, + { + "epoch": 1.4928126192596363, + "ewc_loss": 0.007807021960616112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.807021756889299e-05, + "grad_norm": 3.8039629459381104, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8797447085380554, + "num_tokens": 447790483.0, + "step": 11735 + }, + { + "epoch": 1.4929398295382268, + "ewc_loss": 0.007803113665431738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.80311384005472e-05, + "grad_norm": 3.8703341484069824, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8743208050727844, + "num_tokens": 447826872.0, + "step": 11736 + }, + { + "epoch": 
1.4930670398168173, + "ewc_loss": 0.00786322820931673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.863228529458866e-05, + "grad_norm": 3.90722918510437, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8785496354103088, + "num_tokens": 447864071.0, + "step": 11737 + }, + { + "epoch": 1.4931942500954076, + "ewc_loss": 0.007842781953513622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.842781633371487e-05, + "grad_norm": 3.88468599319458, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8587967157363892, + "num_tokens": 447903320.0, + "step": 11738 + }, + { + "epoch": 1.4933214603739982, + "ewc_loss": 0.007812884636223316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.812884723534808e-05, + "grad_norm": 3.8662476539611816, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8878806829452515, + "num_tokens": 447940253.0, + "step": 11739 + }, + { + "epoch": 1.4934486706525887, + "ewc_loss": 0.007800332270562649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.800332241458818e-05, + "grad_norm": 3.8546786308288574, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8807829022407532, + "num_tokens": 447978277.0, + "step": 11740 + }, + { + "epoch": 1.4935758809311792, + "ewc_loss": 0.007815954275429249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.815954450052232e-05, + "grad_norm": 3.9033846855163574, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8646055459976196, + "num_tokens": 448018798.0, + "step": 11741 + }, + { + "epoch": 1.4937030912097697, + "ewc_loss": 0.007819102145731449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.819102029316127e-05, + "grad_norm": 3.864790439605713, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8646035194396973, + "num_tokens": 448057469.0, + "step": 11742 + }, + { + "epoch": 1.4938303014883603, + "ewc_loss": 0.007784613408148289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.784613262629136e-05, + "grad_norm": 3.9362659454345703, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8633243441581726, + "num_tokens": 448089032.0, + "step": 11743 + }, + { + "epoch": 1.4939575117669508, + "ewc_loss": 0.007860510610044003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.860510959289968e-05, + "grad_norm": 3.9248013496398926, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8706415891647339, + "num_tokens": 448122438.0, + "step": 11744 + }, + { + "epoch": 1.4940847220455413, + "ewc_loss": 0.007826144807040691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.826145156286657e-05, + "grad_norm": 3.8278110027313232, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8740653991699219, + "num_tokens": 448166844.0, + "step": 11745 + }, + { + "epoch": 1.4942119323241319, + "ewc_loss": 0.007782172877341509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.782172906445339e-05, + "grad_norm": 3.9847445487976074, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8654549717903137, + "num_tokens": 448197766.0, + "step": 11746 + }, + { + "epoch": 1.4943391426027224, + "ewc_loss": 0.00791732408106327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.917324546724558e-05, + "grad_norm": 3.8580873012542725, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8621973991394043, + "num_tokens": 448240850.0, + "step": 11747 + }, + { + "epoch": 1.4944663528813127, + "ewc_loss": 0.007790297269821167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.790297240717337e-05, + "grad_norm": 3.890756368637085, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8835972547531128, + "num_tokens": 448274187.0, + "step": 11748 + }, + { + "epoch": 1.4945935631599032, + "ewc_loss": 0.007874656468629837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.874656148487702e-05, + "grad_norm": 3.888582468032837, + "learning_rate": 1e-06, + "loss": 0.3454, + 
"mean_token_accuracy": 0.8815128803253174, + "num_tokens": 448311069.0, + "step": 11749 + }, + { + "epoch": 1.4947207734384937, + "ewc_loss": 0.007870697416365147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.870697299949825e-05, + "grad_norm": 3.8246829509735107, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8748046159744263, + "num_tokens": 448351536.0, + "step": 11750 + }, + { + "epoch": 1.4948479837170843, + "ewc_loss": 0.007838434539735317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.838434976292774e-05, + "grad_norm": 3.92661190032959, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8711210489273071, + "num_tokens": 448385108.0, + "step": 11751 + }, + { + "epoch": 1.4949751939956748, + "ewc_loss": 0.007928202860057354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.928202830953524e-05, + "grad_norm": 3.854215145111084, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8860161304473877, + "num_tokens": 448423272.0, + "step": 11752 + }, + { + "epoch": 1.4951024042742653, + "ewc_loss": 0.007847229018807411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.847229426261038e-05, + "grad_norm": 3.8552567958831787, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8798396587371826, + "num_tokens": 448462904.0, + "step": 11753 + }, + { + "epoch": 1.4952296145528559, + "ewc_loss": 0.00788936484605074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.889364496804774e-05, + "grad_norm": 3.862684726715088, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8718903660774231, + "num_tokens": 448503257.0, + "step": 11754 + }, + { + "epoch": 1.4953568248314464, + "ewc_loss": 0.007872868329286575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.872868445701897e-05, + "grad_norm": 3.918841600418091, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8803424835205078, + "num_tokens": 448535299.0, + "step": 11755 + }, + { + "epoch": 
1.495484035110037, + "ewc_loss": 0.007900894619524479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90089470683597e-05, + "grad_norm": 3.886059522628784, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8808714747428894, + "num_tokens": 448568878.0, + "step": 11756 + }, + { + "epoch": 1.4956112453886274, + "ewc_loss": 0.007873493246734142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.873493450460956e-05, + "grad_norm": 3.8325397968292236, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8931893706321716, + "num_tokens": 448604959.0, + "step": 11757 + }, + { + "epoch": 1.495738455667218, + "ewc_loss": 0.007861352525651455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.861352787585929e-05, + "grad_norm": 3.905247449874878, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8874536156654358, + "num_tokens": 448638487.0, + "step": 11758 + }, + { + "epoch": 1.4958656659458085, + "ewc_loss": 0.007933784276247025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.933784218039364e-05, + "grad_norm": 3.8869245052337646, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.880315363407135, + "num_tokens": 448677212.0, + "step": 11759 + }, + { + "epoch": 1.495992876224399, + "ewc_loss": 0.007870319299399853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.870319677749649e-05, + "grad_norm": 3.850843906402588, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.869361162185669, + "num_tokens": 448719654.0, + "step": 11760 + }, + { + "epoch": 1.4961200865029896, + "ewc_loss": 0.00785338506102562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.853384886402637e-05, + "grad_norm": 3.8117339611053467, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8774060010910034, + "num_tokens": 448760783.0, + "step": 11761 + }, + { + "epoch": 1.49624729678158, + "ewc_loss": 0.007854935713112354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.85493612056598e-05, + "grad_norm": 3.895076274871826, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8645845651626587, + "num_tokens": 448802843.0, + "step": 11762 + }, + { + "epoch": 1.4963745070601704, + "ewc_loss": 0.007915647700428963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91564816609025e-05, + "grad_norm": 3.929868221282959, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8716333508491516, + "num_tokens": 448838430.0, + "step": 11763 + }, + { + "epoch": 1.496501717338761, + "ewc_loss": 0.0078785615041852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.878561154939234e-05, + "grad_norm": 3.836806535720825, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8655810952186584, + "num_tokens": 448880450.0, + "step": 11764 + }, + { + "epoch": 1.4966289276173514, + "ewc_loss": 0.007820758037269115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.820758037269115e-05, + "grad_norm": 3.854618549346924, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8777410984039307, + "num_tokens": 448919352.0, + "step": 11765 + }, + { + "epoch": 1.496756137895942, + "ewc_loss": 0.007870192639529705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8701923484914e-05, + "grad_norm": 3.842823028564453, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8520393371582031, + "num_tokens": 448965402.0, + "step": 11766 + }, + { + "epoch": 1.4968833481745325, + "ewc_loss": 0.007841813378036022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84181320341304e-05, + "grad_norm": 3.9094109535217285, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8862886428833008, + "num_tokens": 448996983.0, + "step": 11767 + }, + { + "epoch": 1.497010558453123, + "ewc_loss": 0.007880439050495625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.880439079599455e-05, + "grad_norm": 3.862386703491211, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 
0.8625615835189819, + "num_tokens": 449036555.0, + "step": 11768 + }, + { + "epoch": 1.4971377687317136, + "ewc_loss": 0.007831279188394547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.831279071979225e-05, + "grad_norm": 3.8800435066223145, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8863537907600403, + "num_tokens": 449070545.0, + "step": 11769 + }, + { + "epoch": 1.497264979010304, + "ewc_loss": 0.007856951095163822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.856950833229348e-05, + "grad_norm": 3.933680772781372, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8599103093147278, + "num_tokens": 449108859.0, + "step": 11770 + }, + { + "epoch": 1.4973921892888946, + "ewc_loss": 0.007878877222537994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.87887693149969e-05, + "grad_norm": 3.9145936965942383, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8716768026351929, + "num_tokens": 449145333.0, + "step": 11771 + }, + { + "epoch": 1.497519399567485, + "ewc_loss": 0.007848287001252174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.848286622902378e-05, + "grad_norm": 3.8822174072265625, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8698833584785461, + "num_tokens": 449181292.0, + "step": 11772 + }, + { + "epoch": 1.4976466098460754, + "ewc_loss": 0.007847756147384644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.847756205592304e-05, + "grad_norm": 3.8898966312408447, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8795964121818542, + "num_tokens": 449213940.0, + "step": 11773 + }, + { + "epoch": 1.497773820124666, + "ewc_loss": 0.007857495918869972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.85749580245465e-05, + "grad_norm": 3.8662586212158203, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8851679563522339, + "num_tokens": 449252487.0, + "step": 11774 + }, + { + "epoch": 1.4979010304032565, + 
"ewc_loss": 0.007847664877772331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.847664528526366e-05, + "grad_norm": 3.9133636951446533, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8695695400238037, + "num_tokens": 449287268.0, + "step": 11775 + }, + { + "epoch": 1.498028240681847, + "ewc_loss": 0.00788409635424614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.884096703492105e-05, + "grad_norm": 3.8410356044769287, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8725438117980957, + "num_tokens": 449326975.0, + "step": 11776 + }, + { + "epoch": 1.4981554509604376, + "ewc_loss": 0.007802082225680351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.80208210926503e-05, + "grad_norm": 3.872330904006958, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8740085363388062, + "num_tokens": 449366061.0, + "step": 11777 + }, + { + "epoch": 1.498282661239028, + "ewc_loss": 0.007865747436881065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.865747465984896e-05, + "grad_norm": 3.8832099437713623, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8829473257064819, + "num_tokens": 449401763.0, + "step": 11778 + }, + { + "epoch": 1.4984098715176186, + "ewc_loss": 0.007842243649065495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.842243940103799e-05, + "grad_norm": 3.873471975326538, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8925009965896606, + "num_tokens": 449436605.0, + "step": 11779 + }, + { + "epoch": 1.4985370817962091, + "ewc_loss": 0.007837769575417042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.837769953766838e-05, + "grad_norm": 3.8568873405456543, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8702855706214905, + "num_tokens": 449474523.0, + "step": 11780 + }, + { + "epoch": 1.4986642920747997, + "ewc_loss": 0.007844916544854641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.844916399335489e-05, + 
"grad_norm": 3.846628189086914, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8869181871414185, + "num_tokens": 449513301.0, + "step": 11781 + }, + { + "epoch": 1.4987915023533902, + "ewc_loss": 0.0078076887875795364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.807688962202519e-05, + "grad_norm": 3.9173805713653564, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8701895475387573, + "num_tokens": 449547826.0, + "step": 11782 + }, + { + "epoch": 1.4989187126319807, + "ewc_loss": 0.007867339067161083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.867338717915118e-05, + "grad_norm": 3.8950932025909424, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8750742673873901, + "num_tokens": 449584818.0, + "step": 11783 + }, + { + "epoch": 1.4990459229105713, + "ewc_loss": 0.007818486541509628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.818486483301967e-05, + "grad_norm": 3.8866381645202637, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8561999797821045, + "num_tokens": 449625368.0, + "step": 11784 + }, + { + "epoch": 1.4991731331891618, + "ewc_loss": 0.007830348797142506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.83034847700037e-05, + "grad_norm": 3.8598246574401855, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8634213805198669, + "num_tokens": 449666320.0, + "step": 11785 + }, + { + "epoch": 1.4993003434677523, + "ewc_loss": 0.007818472571671009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8184726589825e-05, + "grad_norm": 3.8458902835845947, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8661646842956543, + "num_tokens": 449705993.0, + "step": 11786 + }, + { + "epoch": 1.4994275537463426, + "ewc_loss": 0.00781929213553667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.819291931809857e-05, + "grad_norm": 3.9090468883514404, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 
0.8692926168441772, + "num_tokens": 449747088.0, + "step": 11787 + }, + { + "epoch": 1.4995547640249332, + "ewc_loss": 0.007845858111977577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.845857908250764e-05, + "grad_norm": 3.8326029777526855, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8770639896392822, + "num_tokens": 449789840.0, + "step": 11788 + }, + { + "epoch": 1.4996819743035237, + "ewc_loss": 0.0077851866371929646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.785186608089134e-05, + "grad_norm": 3.879054069519043, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8744491934776306, + "num_tokens": 449830599.0, + "step": 11789 + }, + { + "epoch": 1.4998091845821142, + "ewc_loss": 0.00781338568776846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.813386037014425e-05, + "grad_norm": 3.8943991661071777, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8786258697509766, + "num_tokens": 449864476.0, + "step": 11790 + }, + { + "epoch": 1.4999363948607047, + "ewc_loss": 0.007807671558111906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.807671499904245e-05, + "grad_norm": 3.901224374771118, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8586218357086182, + "num_tokens": 449903791.0, + "step": 11791 + }, + { + "epoch": 1.5000636051392953, + "ewc_loss": 0.007803007494658232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.803007611073554e-05, + "grad_norm": 3.835444927215576, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8751413822174072, + "num_tokens": 449945478.0, + "step": 11792 + }, + { + "epoch": 1.5001908154178858, + "ewc_loss": 0.007761992979794741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.761993038002402e-05, + "grad_norm": 3.8598201274871826, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8723843097686768, + "num_tokens": 449986175.0, + "step": 11793 + }, + { + "epoch": 1.5003180256964763, + 
"ewc_loss": 0.007792789954692125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.792789983795956e-05, + "grad_norm": 3.895939826965332, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8715965747833252, + "num_tokens": 450025537.0, + "step": 11794 + }, + { + "epoch": 1.5004452359750666, + "ewc_loss": 0.007804779335856438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.804779306752607e-05, + "grad_norm": 3.895634174346924, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8592169880867004, + "num_tokens": 450068269.0, + "step": 11795 + }, + { + "epoch": 1.5005724462536572, + "ewc_loss": 0.0077994996681809425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.799499871907756e-05, + "grad_norm": 3.8804097175598145, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.890129566192627, + "num_tokens": 450106234.0, + "step": 11796 + }, + { + "epoch": 1.5006996565322477, + "ewc_loss": 0.007794803008437157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.7948032412678e-05, + "grad_norm": 3.8783187866210938, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8846850991249084, + "num_tokens": 450141916.0, + "step": 11797 + }, + { + "epoch": 1.5008268668108382, + "ewc_loss": 0.007788605522364378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.788605580572039e-05, + "grad_norm": 3.917527437210083, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8602859973907471, + "num_tokens": 450175427.0, + "step": 11798 + }, + { + "epoch": 1.5009540770894287, + "ewc_loss": 0.00781368650496006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.81368653406389e-05, + "grad_norm": 3.938586473464966, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8759954571723938, + "num_tokens": 450211104.0, + "step": 11799 + }, + { + "epoch": 1.5010812873680193, + "ewc_loss": 0.0078057493083179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.805749191902578e-05, + "grad_norm": 
3.911618232727051, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.865797758102417, + "num_tokens": 450244844.0, + "step": 11800 + }, + { + "epoch": 1.5012084976466098, + "ewc_loss": 0.00780018325895071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.800183084327728e-05, + "grad_norm": 3.8582139015197754, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8741511106491089, + "num_tokens": 450286094.0, + "step": 11801 + }, + { + "epoch": 1.5013357079252003, + "ewc_loss": 0.007787901908159256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.787901995470747e-05, + "grad_norm": 3.877640724182129, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8666120767593384, + "num_tokens": 450324938.0, + "step": 11802 + }, + { + "epoch": 1.5014629182037909, + "ewc_loss": 0.007849764078855515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.849764369893819e-05, + "grad_norm": 3.8780593872070312, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8750096559524536, + "num_tokens": 450368572.0, + "step": 11803 + }, + { + "epoch": 1.5015901284823814, + "ewc_loss": 0.007821009494364262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.821009785402566e-05, + "grad_norm": 3.8982291221618652, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8545935153961182, + "num_tokens": 450410926.0, + "step": 11804 + }, + { + "epoch": 1.501717338760972, + "ewc_loss": 0.007852109149098396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.852108683437109e-05, + "grad_norm": 3.8745789527893066, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8780169486999512, + "num_tokens": 450450206.0, + "step": 11805 + }, + { + "epoch": 1.5018445490395624, + "ewc_loss": 0.007835755124688148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.83575524110347e-05, + "grad_norm": 3.929417133331299, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8555664420127869, + 
"num_tokens": 450488998.0, + "step": 11806 + }, + { + "epoch": 1.501971759318153, + "ewc_loss": 0.007879313081502914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.879313488956541e-05, + "grad_norm": 3.829317569732666, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8853976726531982, + "num_tokens": 450530476.0, + "step": 11807 + }, + { + "epoch": 1.5020989695967435, + "ewc_loss": 0.007795870769768953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.795870624249801e-05, + "grad_norm": 3.897139072418213, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8824246525764465, + "num_tokens": 450560655.0, + "step": 11808 + }, + { + "epoch": 1.502226179875334, + "ewc_loss": 0.007895986549556255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.895986345829442e-05, + "grad_norm": 3.846984624862671, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8769078254699707, + "num_tokens": 450601816.0, + "step": 11809 + }, + { + "epoch": 1.5023533901539246, + "ewc_loss": 0.007824522443115711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.824522617738694e-05, + "grad_norm": 3.9150822162628174, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8784601092338562, + "num_tokens": 450634182.0, + "step": 11810 + }, + { + "epoch": 1.502480600432515, + "ewc_loss": 0.007921069860458374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.921069482108578e-05, + "grad_norm": 3.8761045932769775, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8703894019126892, + "num_tokens": 450672705.0, + "step": 11811 + }, + { + "epoch": 1.5026078107111056, + "ewc_loss": 0.007869548164308071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.869548426242545e-05, + "grad_norm": 3.9346702098846436, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8612256050109863, + "num_tokens": 450705636.0, + "step": 11812 + }, + { + "epoch": 1.502735020989696, + "ewc_loss": 0.007918477058410645, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.918477058410645e-05, + "grad_norm": 4.0048441886901855, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8653966188430786, + "num_tokens": 450736165.0, + "step": 11813 + }, + { + "epoch": 1.5028622312682864, + "ewc_loss": 0.007951932959258556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.951932639116421e-05, + "grad_norm": 3.8882408142089844, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8752323985099792, + "num_tokens": 450776939.0, + "step": 11814 + }, + { + "epoch": 1.502989441546877, + "ewc_loss": 0.007861950434744358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.861950871301815e-05, + "grad_norm": 3.8809304237365723, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8800270557403564, + "num_tokens": 450811994.0, + "step": 11815 + }, + { + "epoch": 1.5031166518254675, + "ewc_loss": 0.007902611047029495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.902611105237156e-05, + "grad_norm": 3.888336658477783, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8746452331542969, + "num_tokens": 450853688.0, + "step": 11816 + }, + { + "epoch": 1.503243862104058, + "ewc_loss": 0.007910406216979027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.910405838629231e-05, + "grad_norm": 3.909316062927246, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8747257590293884, + "num_tokens": 450892100.0, + "step": 11817 + }, + { + "epoch": 1.5033710723826486, + "ewc_loss": 0.007900997996330261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.900998025434092e-05, + "grad_norm": 3.8341453075408936, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8906583786010742, + "num_tokens": 450936383.0, + "step": 11818 + }, + { + "epoch": 1.503498282661239, + "ewc_loss": 0.007835628464818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.835628639440984e-05, + "grad_norm": 3.9657740592956543, + 
"learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8829339146614075, + "num_tokens": 450965525.0, + "step": 11819 + }, + { + "epoch": 1.5036254929398294, + "ewc_loss": 0.007935049943625927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935050234664232e-05, + "grad_norm": 3.8575260639190674, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8633326888084412, + "num_tokens": 451005874.0, + "step": 11820 + }, + { + "epoch": 1.50375270321842, + "ewc_loss": 0.007813423871994019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.813423871994019e-05, + "grad_norm": 3.939293384552002, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8548758029937744, + "num_tokens": 451040668.0, + "step": 11821 + }, + { + "epoch": 1.5038799134970104, + "ewc_loss": 0.007935179397463799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935179019114003e-05, + "grad_norm": 3.8551547527313232, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8713942170143127, + "num_tokens": 451082747.0, + "step": 11822 + }, + { + "epoch": 1.504007123775601, + "ewc_loss": 0.007841022685170174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.841023034416139e-05, + "grad_norm": 3.930924654006958, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8813294768333435, + "num_tokens": 451120344.0, + "step": 11823 + }, + { + "epoch": 1.5041343340541915, + "ewc_loss": 0.007909238338470459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.909238775027916e-05, + "grad_norm": 3.8923447132110596, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8683599233627319, + "num_tokens": 451156564.0, + "step": 11824 + }, + { + "epoch": 1.504261544332782, + "ewc_loss": 0.007862715050578117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.862714846851304e-05, + "grad_norm": 3.8951382637023926, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.869786262512207, + "num_tokens": 451197252.0, + 
"step": 11825 + }, + { + "epoch": 1.5043887546113726, + "ewc_loss": 0.007871421985328197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.871421985328197e-05, + "grad_norm": 3.9149973392486572, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8630446195602417, + "num_tokens": 451236150.0, + "step": 11826 + }, + { + "epoch": 1.504515964889963, + "ewc_loss": 0.007887435145676136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.887434912845492e-05, + "grad_norm": 3.8483569622039795, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8714161515235901, + "num_tokens": 451275751.0, + "step": 11827 + }, + { + "epoch": 1.5046431751685536, + "ewc_loss": 0.007849178276956081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.849177927710116e-05, + "grad_norm": 3.839871406555176, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8950076103210449, + "num_tokens": 451316778.0, + "step": 11828 + }, + { + "epoch": 1.5047703854471441, + "ewc_loss": 0.00784650444984436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.846504740882665e-05, + "grad_norm": 3.8374345302581787, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8737729787826538, + "num_tokens": 451359921.0, + "step": 11829 + }, + { + "epoch": 1.5048975957257347, + "ewc_loss": 0.007855817675590515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.85581796662882e-05, + "grad_norm": 3.910492420196533, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8824155926704407, + "num_tokens": 451395616.0, + "step": 11830 + }, + { + "epoch": 1.5050248060043252, + "ewc_loss": 0.007887136191129684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.88713587098755e-05, + "grad_norm": 4.005446434020996, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8726494908332825, + "num_tokens": 451427766.0, + "step": 11831 + }, + { + "epoch": 1.5051520162829157, + "ewc_loss": 0.007919433526694775, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.919433119241148e-05, + "grad_norm": 3.914780378341675, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8840080499649048, + "num_tokens": 451463073.0, + "step": 11832 + }, + { + "epoch": 1.5052792265615063, + "ewc_loss": 0.007818098179996014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.81809794716537e-05, + "grad_norm": 3.8319051265716553, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8941226005554199, + "num_tokens": 451503039.0, + "step": 11833 + }, + { + "epoch": 1.5054064368400968, + "ewc_loss": 0.007821982726454735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.821982580935583e-05, + "grad_norm": 3.957369089126587, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8572011590003967, + "num_tokens": 451541599.0, + "step": 11834 + }, + { + "epoch": 1.5055336471186873, + "ewc_loss": 0.007918532937765121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.918533083284274e-05, + "grad_norm": 3.8735053539276123, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8608452081680298, + "num_tokens": 451579278.0, + "step": 11835 + }, + { + "epoch": 1.5056608573972778, + "ewc_loss": 0.007824333384633064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.824333442840725e-05, + "grad_norm": 3.8786911964416504, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8765114545822144, + "num_tokens": 451618591.0, + "step": 11836 + }, + { + "epoch": 1.5057880676758684, + "ewc_loss": 0.007859321311116219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.859321340220049e-05, + "grad_norm": 3.926487684249878, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8750155568122864, + "num_tokens": 451654453.0, + "step": 11837 + }, + { + "epoch": 1.5059152779544587, + "ewc_loss": 0.00788058340549469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.880583871155977e-05, + "grad_norm": 3.9473023414611816, + "learning_rate": 1e-06, + "loss": 
0.3784, + "mean_token_accuracy": 0.8731961846351624, + "num_tokens": 451688302.0, + "step": 11838 + }, + { + "epoch": 1.5060424882330492, + "ewc_loss": 0.007856572978198528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.856573211029172e-05, + "grad_norm": 3.823958158493042, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8799428939819336, + "num_tokens": 451727301.0, + "step": 11839 + }, + { + "epoch": 1.5061696985116397, + "ewc_loss": 0.007791550830006599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.79155088821426e-05, + "grad_norm": 3.865128755569458, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8597615957260132, + "num_tokens": 451770471.0, + "step": 11840 + }, + { + "epoch": 1.5062969087902303, + "ewc_loss": 0.007864521816372871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.864522194722667e-05, + "grad_norm": 3.8697822093963623, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8633397817611694, + "num_tokens": 451809413.0, + "step": 11841 + }, + { + "epoch": 1.5064241190688208, + "ewc_loss": 0.007846501655876637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.846501830499619e-05, + "grad_norm": 3.8491039276123047, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8726128339767456, + "num_tokens": 451850177.0, + "step": 11842 + }, + { + "epoch": 1.5065513293474113, + "ewc_loss": 0.007830994203686714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.83099458203651e-05, + "grad_norm": 3.897757053375244, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.862740159034729, + "num_tokens": 451887977.0, + "step": 11843 + }, + { + "epoch": 1.5066785396260016, + "ewc_loss": 0.007869311608374119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.869311230024323e-05, + "grad_norm": 3.8532774448394775, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8845849633216858, + "num_tokens": 451927403.0, + "step": 11844 + }, + { + 
"epoch": 1.5068057499045922, + "ewc_loss": 0.007819312624633312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.819313032086939e-05, + "grad_norm": 3.8592240810394287, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8742629289627075, + "num_tokens": 451967350.0, + "step": 11845 + }, + { + "epoch": 1.5069329601831827, + "ewc_loss": 0.007837626151740551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.837625889806077e-05, + "grad_norm": 3.859208583831787, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8747314214706421, + "num_tokens": 452008664.0, + "step": 11846 + }, + { + "epoch": 1.5070601704617732, + "ewc_loss": 0.007829512469470501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.829512469470501e-05, + "grad_norm": 3.851957321166992, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8810319900512695, + "num_tokens": 452047303.0, + "step": 11847 + }, + { + "epoch": 1.5071873807403637, + "ewc_loss": 0.007830447517335415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.830447430023924e-05, + "grad_norm": 3.90201473236084, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8759198188781738, + "num_tokens": 452087359.0, + "step": 11848 + }, + { + "epoch": 1.5073145910189543, + "ewc_loss": 0.007840774953365326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.840774924261495e-05, + "grad_norm": 3.8555543422698975, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8599954843521118, + "num_tokens": 452129426.0, + "step": 11849 + }, + { + "epoch": 1.5074418012975448, + "ewc_loss": 0.007796891964972019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.796892168698832e-05, + "grad_norm": 3.8718791007995605, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.878360390663147, + "num_tokens": 452169134.0, + "step": 11850 + }, + { + "epoch": 1.5075690115761353, + "ewc_loss": 0.007833624258637428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.833624113118276e-05, + "grad_norm": 3.8602099418640137, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8645579814910889, + "num_tokens": 452209962.0, + "step": 11851 + }, + { + "epoch": 1.5076962218547258, + "ewc_loss": 0.007811501622200012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.811501563992351e-05, + "grad_norm": 3.860285520553589, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8776718378067017, + "num_tokens": 452245978.0, + "step": 11852 + }, + { + "epoch": 1.5078234321333164, + "ewc_loss": 0.007820242084562778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.82024217187427e-05, + "grad_norm": 3.8901538848876953, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8840517997741699, + "num_tokens": 452281013.0, + "step": 11853 + }, + { + "epoch": 1.507950642411907, + "ewc_loss": 0.007833953015506268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.833952986402437e-05, + "grad_norm": 3.8629093170166016, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8891814947128296, + "num_tokens": 452317174.0, + "step": 11854 + }, + { + "epoch": 1.5080778526904974, + "ewc_loss": 0.007802288047969341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.802288018865511e-05, + "grad_norm": 3.797823905944824, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8704126477241516, + "num_tokens": 452362974.0, + "step": 11855 + }, + { + "epoch": 1.508205062969088, + "ewc_loss": 0.007770270109176636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.770270167384297e-05, + "grad_norm": 3.913609266281128, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8855942487716675, + "num_tokens": 452400701.0, + "step": 11856 + }, + { + "epoch": 1.5083322732476785, + "ewc_loss": 0.007862363941967487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8623641456943e-05, + "grad_norm": 3.8604021072387695, + "learning_rate": 1e-06, + "loss": 0.3414, + 
"mean_token_accuracy": 0.8832471370697021, + "num_tokens": 452438541.0, + "step": 11857 + }, + { + "epoch": 1.508459483526269, + "ewc_loss": 0.007783011998981237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.783011824358255e-05, + "grad_norm": 3.90486478805542, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8692363500595093, + "num_tokens": 452476918.0, + "step": 11858 + }, + { + "epoch": 1.5085866938048595, + "ewc_loss": 0.007824042811989784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.824042404536158e-05, + "grad_norm": 3.9090840816497803, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8788819313049316, + "num_tokens": 452509231.0, + "step": 11859 + }, + { + "epoch": 1.50871390408345, + "ewc_loss": 0.007805236149579287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.805236236890778e-05, + "grad_norm": 3.8787879943847656, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8689892292022705, + "num_tokens": 452547079.0, + "step": 11860 + }, + { + "epoch": 1.5088411143620406, + "ewc_loss": 0.007780023384839296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78002358856611e-05, + "grad_norm": 3.890678882598877, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8854172229766846, + "num_tokens": 452580215.0, + "step": 11861 + }, + { + "epoch": 1.508968324640631, + "ewc_loss": 0.007839731872081757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.839732279535383e-05, + "grad_norm": 3.9090163707733154, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8801334500312805, + "num_tokens": 452615778.0, + "step": 11862 + }, + { + "epoch": 1.5090955349192214, + "ewc_loss": 0.007838073186576366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.838073361199349e-05, + "grad_norm": 3.8988566398620605, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.877935528755188, + "num_tokens": 452652184.0, + "step": 11863 + }, + { + "epoch": 
1.509222745197812, + "ewc_loss": 0.00782910455018282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.829104288248345e-05, + "grad_norm": 3.8791115283966064, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8608824014663696, + "num_tokens": 452691345.0, + "step": 11864 + }, + { + "epoch": 1.5093499554764025, + "ewc_loss": 0.007828439585864544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.828439265722409e-05, + "grad_norm": 3.8645095825195312, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8751605749130249, + "num_tokens": 452729982.0, + "step": 11865 + }, + { + "epoch": 1.509477165754993, + "ewc_loss": 0.00782154593616724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.821546023478732e-05, + "grad_norm": 3.850658655166626, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8735387325286865, + "num_tokens": 452771272.0, + "step": 11866 + }, + { + "epoch": 1.5096043760335836, + "ewc_loss": 0.007824577391147614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8245771874208e-05, + "grad_norm": 3.857701063156128, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8705741167068481, + "num_tokens": 452812231.0, + "step": 11867 + }, + { + "epoch": 1.509731586312174, + "ewc_loss": 0.007832986302673817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.832986011635512e-05, + "grad_norm": 3.856532573699951, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8626980185508728, + "num_tokens": 452858406.0, + "step": 11868 + }, + { + "epoch": 1.5098587965907644, + "ewc_loss": 0.007831493392586708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.831493712728843e-05, + "grad_norm": 3.8659324645996094, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8645845651626587, + "num_tokens": 452898793.0, + "step": 11869 + }, + { + "epoch": 1.509986006869355, + "ewc_loss": 0.007844329811632633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.844329957151785e-05, + "grad_norm": 3.8791003227233887, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8795419931411743, + "num_tokens": 452937929.0, + "step": 11870 + }, + { + "epoch": 1.5101132171479454, + "ewc_loss": 0.007838981226086617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.838981400709599e-05, + "grad_norm": 3.931622266769409, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8757033944129944, + "num_tokens": 452975436.0, + "step": 11871 + }, + { + "epoch": 1.510240427426536, + "ewc_loss": 0.007872034795582294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.872034620959312e-05, + "grad_norm": 3.8791515827178955, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8742126226425171, + "num_tokens": 453016334.0, + "step": 11872 + }, + { + "epoch": 1.5103676377051265, + "ewc_loss": 0.007805640809237957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.805640780134127e-05, + "grad_norm": 3.832364320755005, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8780574202537537, + "num_tokens": 453053630.0, + "step": 11873 + }, + { + "epoch": 1.510494847983717, + "ewc_loss": 0.007798998616635799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.798998558428138e-05, + "grad_norm": 3.937464952468872, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8561121821403503, + "num_tokens": 453092341.0, + "step": 11874 + }, + { + "epoch": 1.5106220582623076, + "ewc_loss": 0.007868539541959763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868539250921458e-05, + "grad_norm": 3.9274613857269287, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8830258846282959, + "num_tokens": 453126859.0, + "step": 11875 + }, + { + "epoch": 1.510749268540898, + "ewc_loss": 0.007815993390977383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.815993012627587e-05, + "grad_norm": 3.8652713298797607, + "learning_rate": 1e-06, + "loss": 0.3813, + 
"mean_token_accuracy": 0.8723446130752563, + "num_tokens": 453167327.0, + "step": 11876 + }, + { + "epoch": 1.5108764788194886, + "ewc_loss": 0.0077919079922139645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.791908137733117e-05, + "grad_norm": 3.8690273761749268, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8804467916488647, + "num_tokens": 453205901.0, + "step": 11877 + }, + { + "epoch": 1.5110036890980791, + "ewc_loss": 0.00782284140586853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.822841871529818e-05, + "grad_norm": 3.87520170211792, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8734885454177856, + "num_tokens": 453241856.0, + "step": 11878 + }, + { + "epoch": 1.5111308993766697, + "ewc_loss": 0.00780965993180871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.809660019120201e-05, + "grad_norm": 3.873199939727783, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.882291853427887, + "num_tokens": 453282528.0, + "step": 11879 + }, + { + "epoch": 1.5112581096552602, + "ewc_loss": 0.00781643483787775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.816434663254768e-05, + "grad_norm": 3.951920986175537, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8645922541618347, + "num_tokens": 453318762.0, + "step": 11880 + }, + { + "epoch": 1.5113853199338507, + "ewc_loss": 0.007860674522817135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.860674668336287e-05, + "grad_norm": 3.9346234798431396, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8892302513122559, + "num_tokens": 453352482.0, + "step": 11881 + }, + { + "epoch": 1.5115125302124413, + "ewc_loss": 0.007822046987712383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.822047336958349e-05, + "grad_norm": 3.8707568645477295, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8727267980575562, + "num_tokens": 453393692.0, + "step": 11882 + }, + { + "epoch": 
1.5116397404910318, + "ewc_loss": 0.007805539760738611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.80553964432329e-05, + "grad_norm": 3.872189998626709, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.878270149230957, + "num_tokens": 453429217.0, + "step": 11883 + }, + { + "epoch": 1.5117669507696223, + "ewc_loss": 0.007838534191250801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.838533929316327e-05, + "grad_norm": 3.8676915168762207, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8606405258178711, + "num_tokens": 453468190.0, + "step": 11884 + }, + { + "epoch": 1.5118941610482128, + "ewc_loss": 0.0078087905421853065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.808790542185307e-05, + "grad_norm": 3.8506646156311035, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8670507669448853, + "num_tokens": 453509893.0, + "step": 11885 + }, + { + "epoch": 1.5120213713268034, + "ewc_loss": 0.00783122330904007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.831223047105595e-05, + "grad_norm": 3.9386403560638428, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8786739110946655, + "num_tokens": 453546972.0, + "step": 11886 + }, + { + "epoch": 1.5121485816053937, + "ewc_loss": 0.00788107980042696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.881080091465265e-05, + "grad_norm": 3.929912567138672, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8857913017272949, + "num_tokens": 453578480.0, + "step": 11887 + }, + { + "epoch": 1.5122757918839842, + "ewc_loss": 0.007846943102777004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8469434811268e-05, + "grad_norm": 3.87199330329895, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8733096122741699, + "num_tokens": 453623248.0, + "step": 11888 + }, + { + "epoch": 1.5124030021625747, + "ewc_loss": 0.00783169362694025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.831693801563233e-05, + "grad_norm": 3.8320913314819336, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8923231959342957, + "num_tokens": 453660088.0, + "step": 11889 + }, + { + "epoch": 1.5125302124411653, + "ewc_loss": 0.007818659767508507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.818659651093185e-05, + "grad_norm": 3.8877055644989014, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8756985068321228, + "num_tokens": 453698931.0, + "step": 11890 + }, + { + "epoch": 1.5126574227197558, + "ewc_loss": 0.007867089472711086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.867089152568951e-05, + "grad_norm": 3.907247543334961, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8780421018600464, + "num_tokens": 453734220.0, + "step": 11891 + }, + { + "epoch": 1.5127846329983463, + "ewc_loss": 0.007859393954277039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.859394099796191e-05, + "grad_norm": 3.865847110748291, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8749592304229736, + "num_tokens": 453774056.0, + "step": 11892 + }, + { + "epoch": 1.5129118432769366, + "ewc_loss": 0.007839120924472809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.839121099095792e-05, + "grad_norm": 3.9253621101379395, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8808038234710693, + "num_tokens": 453806616.0, + "step": 11893 + }, + { + "epoch": 1.5130390535555271, + "ewc_loss": 0.007902437821030617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.902437937445939e-05, + "grad_norm": 3.8967275619506836, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8839303255081177, + "num_tokens": 453843838.0, + "step": 11894 + }, + { + "epoch": 1.5131662638341177, + "ewc_loss": 0.007862839847803116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.862839993322268e-05, + "grad_norm": 3.935086965560913, + "learning_rate": 1e-06, + "loss": 0.3442, + 
"mean_token_accuracy": 0.8824450373649597, + "num_tokens": 453876888.0, + "step": 11895 + }, + { + "epoch": 1.5132934741127082, + "ewc_loss": 0.007908506318926811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.908506086096168e-05, + "grad_norm": 3.8460500240325928, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8911254405975342, + "num_tokens": 453915334.0, + "step": 11896 + }, + { + "epoch": 1.5134206843912987, + "ewc_loss": 0.00782319437712431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.823194755474105e-05, + "grad_norm": 3.8726449012756348, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8838030099868774, + "num_tokens": 453950590.0, + "step": 11897 + }, + { + "epoch": 1.5135478946698893, + "ewc_loss": 0.007877813652157784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.877813186496496e-05, + "grad_norm": 3.8470957279205322, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.880398690700531, + "num_tokens": 453992590.0, + "step": 11898 + }, + { + "epoch": 1.5136751049484798, + "ewc_loss": 0.00784571934491396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.845719665056095e-05, + "grad_norm": 3.8180222511291504, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8796167969703674, + "num_tokens": 454038619.0, + "step": 11899 + }, + { + "epoch": 1.5138023152270703, + "ewc_loss": 0.007822047919034958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.82204806455411e-05, + "grad_norm": 3.8770041465759277, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8670550584793091, + "num_tokens": 454081564.0, + "step": 11900 + }, + { + "epoch": 1.5139295255056608, + "ewc_loss": 0.007852962240576744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.852962153265253e-05, + "grad_norm": 3.8926665782928467, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8624222278594971, + "num_tokens": 454125581.0, + "step": 11901 + }, + { + "epoch": 
1.5140567357842514, + "ewc_loss": 0.007838663645088673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.83866344136186e-05, + "grad_norm": 3.8832528591156006, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8831427097320557, + "num_tokens": 454163961.0, + "step": 11902 + }, + { + "epoch": 1.514183946062842, + "ewc_loss": 0.0078115989454090595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.811599061824381e-05, + "grad_norm": 3.9074556827545166, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8687092661857605, + "num_tokens": 454200410.0, + "step": 11903 + }, + { + "epoch": 1.5143111563414324, + "ewc_loss": 0.007832605391740799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.83260547905229e-05, + "grad_norm": 3.9193954467773438, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8623156547546387, + "num_tokens": 454237143.0, + "step": 11904 + }, + { + "epoch": 1.514438366620023, + "ewc_loss": 0.007836822420358658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.836822624085471e-05, + "grad_norm": 3.924652099609375, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8833622932434082, + "num_tokens": 454274022.0, + "step": 11905 + }, + { + "epoch": 1.5145655768986135, + "ewc_loss": 0.007828759029507637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.828758680261672e-05, + "grad_norm": 3.910879373550415, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8703585863113403, + "num_tokens": 454307955.0, + "step": 11906 + }, + { + "epoch": 1.514692787177204, + "ewc_loss": 0.007829777896404266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.829778041923419e-05, + "grad_norm": 3.8890933990478516, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8819457292556763, + "num_tokens": 454345449.0, + "step": 11907 + }, + { + "epoch": 1.5148199974557945, + "ewc_loss": 0.007828346453607082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.828346133464947e-05, + "grad_norm": 3.8973042964935303, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8649089336395264, + "num_tokens": 454383096.0, + "step": 11908 + }, + { + "epoch": 1.514947207734385, + "ewc_loss": 0.007840977981686592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.840977923478931e-05, + "grad_norm": 3.9075188636779785, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8646949529647827, + "num_tokens": 454420142.0, + "step": 11909 + }, + { + "epoch": 1.5150744180129756, + "ewc_loss": 0.007848037406802177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.848037785151973e-05, + "grad_norm": 3.9062628746032715, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8731680512428284, + "num_tokens": 454454991.0, + "step": 11910 + }, + { + "epoch": 1.515201628291566, + "ewc_loss": 0.007841721177101135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.841720798751339e-05, + "grad_norm": 3.8947572708129883, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8726211786270142, + "num_tokens": 454490161.0, + "step": 11911 + }, + { + "epoch": 1.5153288385701564, + "ewc_loss": 0.007873435504734516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.873435242800042e-05, + "grad_norm": 3.8615610599517822, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8659142255783081, + "num_tokens": 454534767.0, + "step": 11912 + }, + { + "epoch": 1.515456048848747, + "ewc_loss": 0.007854608818888664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.854608702473342e-05, + "grad_norm": 3.8682494163513184, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.88422691822052, + "num_tokens": 454573246.0, + "step": 11913 + }, + { + "epoch": 1.5155832591273375, + "ewc_loss": 0.007880016230046749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.880016346462071e-05, + "grad_norm": 3.9258761405944824, + "learning_rate": 1e-06, + "loss": 0.3206, + 
"mean_token_accuracy": 0.8879472017288208, + "num_tokens": 454607626.0, + "step": 11914 + }, + { + "epoch": 1.515710469405928, + "ewc_loss": 0.007896681316196918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.896681199781597e-05, + "grad_norm": 3.938382387161255, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8680951595306396, + "num_tokens": 454639840.0, + "step": 11915 + }, + { + "epoch": 1.5158376796845185, + "ewc_loss": 0.00790682528167963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.906825339887291e-05, + "grad_norm": 3.889582395553589, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8653532266616821, + "num_tokens": 454681445.0, + "step": 11916 + }, + { + "epoch": 1.515964889963109, + "ewc_loss": 0.007872967049479485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.87296739872545e-05, + "grad_norm": 3.86103892326355, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8670603632926941, + "num_tokens": 454726191.0, + "step": 11917 + }, + { + "epoch": 1.5160921002416994, + "ewc_loss": 0.00784921646118164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.849216490285471e-05, + "grad_norm": 3.8558030128479004, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8811967372894287, + "num_tokens": 454765112.0, + "step": 11918 + }, + { + "epoch": 1.51621931052029, + "ewc_loss": 0.007878079079091549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.878079486545175e-05, + "grad_norm": 3.890627861022949, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8731366395950317, + "num_tokens": 454805254.0, + "step": 11919 + }, + { + "epoch": 1.5163465207988804, + "ewc_loss": 0.007882758975028992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.882758654886857e-05, + "grad_norm": 3.919403076171875, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8671011924743652, + "num_tokens": 454841344.0, + "step": 11920 + }, + { + "epoch": 
1.516473731077471, + "ewc_loss": 0.0078700827434659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.870082481531426e-05, + "grad_norm": 3.864727258682251, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8883974552154541, + "num_tokens": 454879986.0, + "step": 11921 + }, + { + "epoch": 1.5166009413560615, + "ewc_loss": 0.007851213216781616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.851213013054803e-05, + "grad_norm": 3.942253589630127, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8662649989128113, + "num_tokens": 454917731.0, + "step": 11922 + }, + { + "epoch": 1.516728151634652, + "ewc_loss": 0.007904245518147945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.904245285317302e-05, + "grad_norm": 3.895782947540283, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8800234794616699, + "num_tokens": 454952837.0, + "step": 11923 + }, + { + "epoch": 1.5168553619132426, + "ewc_loss": 0.007842040620744228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.842040213290602e-05, + "grad_norm": 3.872600793838501, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8784953355789185, + "num_tokens": 454992751.0, + "step": 11924 + }, + { + "epoch": 1.516982572191833, + "ewc_loss": 0.007856728509068489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.856728188926354e-05, + "grad_norm": 3.9108896255493164, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8735734820365906, + "num_tokens": 455029829.0, + "step": 11925 + }, + { + "epoch": 1.5171097824704236, + "ewc_loss": 0.007869037799537182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86903765401803e-05, + "grad_norm": 3.8499319553375244, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8746950626373291, + "num_tokens": 455072167.0, + "step": 11926 + }, + { + "epoch": 1.5172369927490141, + "ewc_loss": 0.007815017364919186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.815017306711525e-05, + "grad_norm": 3.9102766513824463, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.882510781288147, + "num_tokens": 455108433.0, + "step": 11927 + }, + { + "epoch": 1.5173642030276047, + "ewc_loss": 0.00787748396396637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.877483585616574e-05, + "grad_norm": 3.862213611602783, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8776775598526001, + "num_tokens": 455148943.0, + "step": 11928 + }, + { + "epoch": 1.5174914133061952, + "ewc_loss": 0.007829258218407631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.829258538549766e-05, + "grad_norm": 3.950887441635132, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8713873624801636, + "num_tokens": 455185099.0, + "step": 11929 + }, + { + "epoch": 1.5176186235847857, + "ewc_loss": 0.007903458550572395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.903458754299209e-05, + "grad_norm": 3.831312656402588, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8884309530258179, + "num_tokens": 455225322.0, + "step": 11930 + }, + { + "epoch": 1.5177458338633762, + "ewc_loss": 0.007810910698026419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.81091075623408e-05, + "grad_norm": 3.900306224822998, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8737347722053528, + "num_tokens": 455262830.0, + "step": 11931 + }, + { + "epoch": 1.5178730441419668, + "ewc_loss": 0.00788953248411417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.889532571425661e-05, + "grad_norm": 3.8644320964813232, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8802757859230042, + "num_tokens": 455300825.0, + "step": 11932 + }, + { + "epoch": 1.5180002544205573, + "ewc_loss": 0.007832612842321396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.832612755009905e-05, + "grad_norm": 3.9053144454956055, + "learning_rate": 1e-06, + "loss": 0.3669, + 
"mean_token_accuracy": 0.8733828663825989, + "num_tokens": 455336209.0, + "step": 11933 + }, + { + "epoch": 1.5181274646991478, + "ewc_loss": 0.007868131622672081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868131797295064e-05, + "grad_norm": 3.867337942123413, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8744308352470398, + "num_tokens": 455374909.0, + "step": 11934 + }, + { + "epoch": 1.5182546749777384, + "ewc_loss": 0.007823288440704346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.823288615327328e-05, + "grad_norm": 3.846613645553589, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8872892260551453, + "num_tokens": 455417179.0, + "step": 11935 + }, + { + "epoch": 1.5183818852563287, + "ewc_loss": 0.007835025899112225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.835025462554768e-05, + "grad_norm": 3.909111738204956, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8626059889793396, + "num_tokens": 455457923.0, + "step": 11936 + }, + { + "epoch": 1.5185090955349192, + "ewc_loss": 0.007874407805502415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.874408038333058e-05, + "grad_norm": 3.923593521118164, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.866385817527771, + "num_tokens": 455492932.0, + "step": 11937 + }, + { + "epoch": 1.5186363058135097, + "ewc_loss": 0.00784437544643879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.844375795684755e-05, + "grad_norm": 3.8657338619232178, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8656370639801025, + "num_tokens": 455540392.0, + "step": 11938 + }, + { + "epoch": 1.5187635160921003, + "ewc_loss": 0.007814556360244751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.814556010998785e-05, + "grad_norm": 3.969252824783325, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8705945014953613, + "num_tokens": 455574461.0, + "step": 11939 + }, + { + "epoch": 
1.5188907263706908, + "ewc_loss": 0.007884400896728039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.884400838520378e-05, + "grad_norm": 3.9293925762176514, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8696557879447937, + "num_tokens": 455608315.0, + "step": 11940 + }, + { + "epoch": 1.5190179366492813, + "ewc_loss": 0.007823179475963116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.823179475963116e-05, + "grad_norm": 3.800685167312622, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8785738945007324, + "num_tokens": 455654927.0, + "step": 11941 + }, + { + "epoch": 1.5191451469278716, + "ewc_loss": 0.007781977765262127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.781977910781279e-05, + "grad_norm": 3.8771183490753174, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8621325492858887, + "num_tokens": 455695745.0, + "step": 11942 + }, + { + "epoch": 1.5192723572064621, + "ewc_loss": 0.007875574752688408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.875574374338612e-05, + "grad_norm": 3.8598268032073975, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8756169676780701, + "num_tokens": 455734652.0, + "step": 11943 + }, + { + "epoch": 1.5193995674850527, + "ewc_loss": 0.007819019258022308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.819019083399326e-05, + "grad_norm": 3.866183042526245, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8521062731742859, + "num_tokens": 455776063.0, + "step": 11944 + }, + { + "epoch": 1.5195267777636432, + "ewc_loss": 0.007828488945960999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.828488742234185e-05, + "grad_norm": 3.8399245738983154, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8686636090278625, + "num_tokens": 455820595.0, + "step": 11945 + }, + { + "epoch": 1.5196539880422337, + "ewc_loss": 0.007817232981324196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.817232835805044e-05, + "grad_norm": 3.881077527999878, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8794524073600769, + "num_tokens": 455859553.0, + "step": 11946 + }, + { + "epoch": 1.5197811983208243, + "ewc_loss": 0.00783123541623354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.83123541623354e-05, + "grad_norm": 3.9161601066589355, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8735522627830505, + "num_tokens": 455892676.0, + "step": 11947 + }, + { + "epoch": 1.5199084085994148, + "ewc_loss": 0.007847392000257969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.847391680115834e-05, + "grad_norm": 3.8656179904937744, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8680025935173035, + "num_tokens": 455932106.0, + "step": 11948 + }, + { + "epoch": 1.5200356188780053, + "ewc_loss": 0.007802691776305437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.802691834513098e-05, + "grad_norm": 3.8427894115448, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8790384531021118, + "num_tokens": 455971555.0, + "step": 11949 + }, + { + "epoch": 1.5201628291565958, + "ewc_loss": 0.007811214309185743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.811214163666591e-05, + "grad_norm": 3.8707334995269775, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8691871166229248, + "num_tokens": 456014177.0, + "step": 11950 + }, + { + "epoch": 1.5202900394351864, + "ewc_loss": 0.007823031395673752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.823031774023548e-05, + "grad_norm": 3.936885356903076, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8810796141624451, + "num_tokens": 456048229.0, + "step": 11951 + }, + { + "epoch": 1.520417249713777, + "ewc_loss": 0.007848981767892838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.848981476854533e-05, + "grad_norm": 3.8452188968658447, + "learning_rate": 1e-06, + "loss": 0.3076, + 
"mean_token_accuracy": 0.8932210206985474, + "num_tokens": 456085815.0, + "step": 11952 + }, + { + "epoch": 1.5205444599923674, + "ewc_loss": 0.0077867028303444386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78670291765593e-05, + "grad_norm": 3.866701364517212, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8788701891899109, + "num_tokens": 456128971.0, + "step": 11953 + }, + { + "epoch": 1.520671670270958, + "ewc_loss": 0.007834304124116898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.834304415155202e-05, + "grad_norm": 3.9796040058135986, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.857006311416626, + "num_tokens": 456159705.0, + "step": 11954 + }, + { + "epoch": 1.5207988805495485, + "ewc_loss": 0.007875080220401287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.875080336816609e-05, + "grad_norm": 3.908015012741089, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.864681601524353, + "num_tokens": 456198251.0, + "step": 11955 + }, + { + "epoch": 1.520926090828139, + "ewc_loss": 0.0077889752574265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78897519921884e-05, + "grad_norm": 3.88435697555542, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8669686317443848, + "num_tokens": 456237612.0, + "step": 11956 + }, + { + "epoch": 1.5210533011067295, + "ewc_loss": 0.007825183682143688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825184002285823e-05, + "grad_norm": 3.8899028301239014, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8738645315170288, + "num_tokens": 456273665.0, + "step": 11957 + }, + { + "epoch": 1.52118051138532, + "ewc_loss": 0.007830841466784477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.830841059330851e-05, + "grad_norm": 3.931407928466797, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8775059580802917, + "num_tokens": 456305467.0, + "step": 11958 + }, + { + "epoch": 
1.5213077216639106, + "ewc_loss": 0.007869584485888481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.869584806030616e-05, + "grad_norm": 3.884030342102051, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.881344199180603, + "num_tokens": 456342951.0, + "step": 11959 + }, + { + "epoch": 1.521434931942501, + "ewc_loss": 0.007832866162061691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.832865958334878e-05, + "grad_norm": 3.9051320552825928, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8745158910751343, + "num_tokens": 456375846.0, + "step": 11960 + }, + { + "epoch": 1.5215621422210914, + "ewc_loss": 0.007862589322030544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.862588972784579e-05, + "grad_norm": 3.9251272678375244, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8839430809020996, + "num_tokens": 456407993.0, + "step": 11961 + }, + { + "epoch": 1.521689352499682, + "ewc_loss": 0.007876815274357796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.876815652707592e-05, + "grad_norm": 3.9264001846313477, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8558601140975952, + "num_tokens": 456444278.0, + "step": 11962 + }, + { + "epoch": 1.5218165627782725, + "ewc_loss": 0.007878907024860382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.878906762925908e-05, + "grad_norm": 3.8871073722839355, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8829785585403442, + "num_tokens": 456483950.0, + "step": 11963 + }, + { + "epoch": 1.521943773056863, + "ewc_loss": 0.007861890830099583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.861890480853617e-05, + "grad_norm": 3.881751537322998, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8759864568710327, + "num_tokens": 456527438.0, + "step": 11964 + }, + { + "epoch": 1.5220709833354535, + "ewc_loss": 0.007867000997066498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.86700111348182e-05, + "grad_norm": 3.9241368770599365, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8843705654144287, + "num_tokens": 456560572.0, + "step": 11965 + }, + { + "epoch": 1.5221981936140438, + "ewc_loss": 0.00790535006672144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.905350503278896e-05, + "grad_norm": 3.8280558586120605, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8661589622497559, + "num_tokens": 456604781.0, + "step": 11966 + }, + { + "epoch": 1.5223254038926344, + "ewc_loss": 0.007811187766492367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.81118797021918e-05, + "grad_norm": 3.858447313308716, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8885703682899475, + "num_tokens": 456639415.0, + "step": 11967 + }, + { + "epoch": 1.522452614171225, + "ewc_loss": 0.007867910899221897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.867911335779354e-05, + "grad_norm": 3.907343864440918, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8860325813293457, + "num_tokens": 456671183.0, + "step": 11968 + }, + { + "epoch": 1.5225798244498154, + "ewc_loss": 0.007899717427790165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.899717456893995e-05, + "grad_norm": 3.8714585304260254, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8647385835647583, + "num_tokens": 456712995.0, + "step": 11969 + }, + { + "epoch": 1.522707034728406, + "ewc_loss": 0.00785931944847107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.859319885028526e-05, + "grad_norm": 3.8917245864868164, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8707165718078613, + "num_tokens": 456750587.0, + "step": 11970 + }, + { + "epoch": 1.5228342450069965, + "ewc_loss": 0.007876741699874401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.876741437939927e-05, + "grad_norm": 3.8985891342163086, + "learning_rate": 1e-06, + "loss": 0.3425, + 
"mean_token_accuracy": 0.8808210492134094, + "num_tokens": 456787503.0, + "step": 11971 + }, + { + "epoch": 1.522961455285587, + "ewc_loss": 0.007869579829275608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.869579712860286e-05, + "grad_norm": 3.857924699783325, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8527552485466003, + "num_tokens": 456831645.0, + "step": 11972 + }, + { + "epoch": 1.5230886655641775, + "ewc_loss": 0.007837223820388317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.837224256945774e-05, + "grad_norm": 3.87733793258667, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8803805112838745, + "num_tokens": 456869143.0, + "step": 11973 + }, + { + "epoch": 1.523215875842768, + "ewc_loss": 0.007864026352763176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.864026702009141e-05, + "grad_norm": 3.8685214519500732, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.873162567615509, + "num_tokens": 456906465.0, + "step": 11974 + }, + { + "epoch": 1.5233430861213586, + "ewc_loss": 0.007853229530155659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.853229180909693e-05, + "grad_norm": 3.9119248390197754, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8738451600074768, + "num_tokens": 456942048.0, + "step": 11975 + }, + { + "epoch": 1.5234702963999491, + "ewc_loss": 0.007892432622611523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.892432768130675e-05, + "grad_norm": 3.8791418075561523, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8919904232025146, + "num_tokens": 456975710.0, + "step": 11976 + }, + { + "epoch": 1.5235975066785397, + "ewc_loss": 0.007841598242521286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.841598562663421e-05, + "grad_norm": 3.8377089500427246, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.865257978439331, + "num_tokens": 457022295.0, + "step": 11977 + }, + { + "epoch": 
1.5237247169571302, + "ewc_loss": 0.00785265676677227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.852656563045457e-05, + "grad_norm": 3.8892784118652344, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8682990074157715, + "num_tokens": 457060433.0, + "step": 11978 + }, + { + "epoch": 1.5238519272357207, + "ewc_loss": 0.007887733168900013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.887733227107674e-05, + "grad_norm": 3.883096218109131, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8796584606170654, + "num_tokens": 457101760.0, + "step": 11979 + }, + { + "epoch": 1.5239791375143112, + "ewc_loss": 0.007848805747926235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84880539868027e-05, + "grad_norm": 3.8666417598724365, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8892529606819153, + "num_tokens": 457142676.0, + "step": 11980 + }, + { + "epoch": 1.5241063477929018, + "ewc_loss": 0.007851236499845982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.851236296119168e-05, + "grad_norm": 3.893536329269409, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8811609745025635, + "num_tokens": 457179507.0, + "step": 11981 + }, + { + "epoch": 1.5242335580714923, + "ewc_loss": 0.007856226526200771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.856226875446737e-05, + "grad_norm": 3.8552896976470947, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.874927818775177, + "num_tokens": 457221563.0, + "step": 11982 + }, + { + "epoch": 1.5243607683500828, + "ewc_loss": 0.007825767621397972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825767534086481e-05, + "grad_norm": 3.9340226650238037, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8724386096000671, + "num_tokens": 457258634.0, + "step": 11983 + }, + { + "epoch": 1.5244879786286734, + "ewc_loss": 0.007887869141995907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.887869287515059e-05, + "grad_norm": 3.874967575073242, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.888653039932251, + "num_tokens": 457293097.0, + "step": 11984 + }, + { + "epoch": 1.5246151889072637, + "ewc_loss": 0.007822549901902676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.822550105629489e-05, + "grad_norm": 3.848102331161499, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8895481824874878, + "num_tokens": 457333525.0, + "step": 11985 + }, + { + "epoch": 1.5247423991858542, + "ewc_loss": 0.007829231210052967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.829230889910832e-05, + "grad_norm": 3.814955234527588, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.860542893409729, + "num_tokens": 457379772.0, + "step": 11986 + }, + { + "epoch": 1.5248696094644447, + "ewc_loss": 0.007816494442522526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.816494326107204e-05, + "grad_norm": 3.8602869510650635, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.880381166934967, + "num_tokens": 457415852.0, + "step": 11987 + }, + { + "epoch": 1.5249968197430352, + "ewc_loss": 0.00785007793456316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.85007796366699e-05, + "grad_norm": 3.8880839347839355, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8793259263038635, + "num_tokens": 457449771.0, + "step": 11988 + }, + { + "epoch": 1.5251240300216258, + "ewc_loss": 0.00784303992986679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.843039929866791e-05, + "grad_norm": 3.896080493927002, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8710699677467346, + "num_tokens": 457489601.0, + "step": 11989 + }, + { + "epoch": 1.5252512403002163, + "ewc_loss": 0.007839806377887726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.839806494303048e-05, + "grad_norm": 3.8309121131896973, + "learning_rate": 1e-06, + "loss": 0.3215, + 
"mean_token_accuracy": 0.8885498046875, + "num_tokens": 457530516.0, + "step": 11990 + }, + { + "epoch": 1.5253784505788066, + "ewc_loss": 0.00781582947820425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.815829303581268e-05, + "grad_norm": 3.896080732345581, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8718951940536499, + "num_tokens": 457568299.0, + "step": 11991 + }, + { + "epoch": 1.5255056608573971, + "ewc_loss": 0.007873672060668468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.873672439018264e-05, + "grad_norm": 3.9804766178131104, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8744325637817383, + "num_tokens": 457600476.0, + "step": 11992 + }, + { + "epoch": 1.5256328711359877, + "ewc_loss": 0.007887754589319229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.887755054980516e-05, + "grad_norm": 3.923239231109619, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8956789970397949, + "num_tokens": 457632024.0, + "step": 11993 + }, + { + "epoch": 1.5257600814145782, + "ewc_loss": 0.007823786698281765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8237870184239e-05, + "grad_norm": 3.8579049110412598, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8812234401702881, + "num_tokens": 457670805.0, + "step": 11994 + }, + { + "epoch": 1.5258872916931687, + "ewc_loss": 0.007835746742784977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.835746509954333e-05, + "grad_norm": 3.9026546478271484, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.875408947467804, + "num_tokens": 457702468.0, + "step": 11995 + }, + { + "epoch": 1.5260145019717593, + "ewc_loss": 0.007876357063651085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.876357267377898e-05, + "grad_norm": 3.882369041442871, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8829548358917236, + "num_tokens": 457740443.0, + "step": 11996 + }, + { + "epoch": 
1.5261417122503498, + "ewc_loss": 0.007855381816625595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.855381409171969e-05, + "grad_norm": 3.886197805404663, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8771663904190063, + "num_tokens": 457775833.0, + "step": 11997 + }, + { + "epoch": 1.5262689225289403, + "ewc_loss": 0.007864933460950851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.864933286327869e-05, + "grad_norm": 3.8646657466888428, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8808165192604065, + "num_tokens": 457815850.0, + "step": 11998 + }, + { + "epoch": 1.5263961328075308, + "ewc_loss": 0.007862184196710587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86218442954123e-05, + "grad_norm": 3.9023358821868896, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.882757306098938, + "num_tokens": 457854946.0, + "step": 11999 + }, + { + "epoch": 1.5265233430861214, + "ewc_loss": 0.00787291582673788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.87291573942639e-05, + "grad_norm": 3.875648260116577, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8805474042892456, + "num_tokens": 457890775.0, + "step": 12000 + }, + { + "epoch": 1.526650553364712, + "ewc_loss": 0.007864431478083134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86443124525249e-05, + "grad_norm": 3.881197214126587, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8810029029846191, + "num_tokens": 457928143.0, + "step": 12001 + }, + { + "epoch": 1.5267777636433024, + "ewc_loss": 0.00787082128226757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.870820991229266e-05, + "grad_norm": 3.920111894607544, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8799980878829956, + "num_tokens": 457965999.0, + "step": 12002 + }, + { + "epoch": 1.526904973921893, + "ewc_loss": 0.007888474501669407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.888474647188559e-05, + "grad_norm": 3.8948051929473877, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8743030428886414, + "num_tokens": 458006638.0, + "step": 12003 + }, + { + "epoch": 1.5270321842004835, + "ewc_loss": 0.007856360636651516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.856360753066838e-05, + "grad_norm": 3.8523218631744385, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8751155734062195, + "num_tokens": 458047522.0, + "step": 12004 + }, + { + "epoch": 1.527159394479074, + "ewc_loss": 0.007823992520570755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.823992928024381e-05, + "grad_norm": 3.836063861846924, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8749321103096008, + "num_tokens": 458091219.0, + "step": 12005 + }, + { + "epoch": 1.5272866047576645, + "ewc_loss": 0.007829216308891773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.829216337995604e-05, + "grad_norm": 3.9130311012268066, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8653439283370972, + "num_tokens": 458129208.0, + "step": 12006 + }, + { + "epoch": 1.527413815036255, + "ewc_loss": 0.007869956083595753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86995587986894e-05, + "grad_norm": 3.9290480613708496, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8708027601242065, + "num_tokens": 458163632.0, + "step": 12007 + }, + { + "epoch": 1.5275410253148456, + "ewc_loss": 0.007835732772946358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.835732685634866e-05, + "grad_norm": 3.87636661529541, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8693498373031616, + "num_tokens": 458202377.0, + "step": 12008 + }, + { + "epoch": 1.527668235593436, + "ewc_loss": 0.007824781350791454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.824781641829759e-05, + "grad_norm": 3.913207769393921, + "learning_rate": 1e-06, + "loss": 0.3579, + 
"mean_token_accuracy": 0.8765889406204224, + "num_tokens": 458239901.0, + "step": 12009 + }, + { + "epoch": 1.5277954458720264, + "ewc_loss": 0.007868526503443718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868526154197752e-05, + "grad_norm": 3.896723747253418, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8707247972488403, + "num_tokens": 458280291.0, + "step": 12010 + }, + { + "epoch": 1.527922656150617, + "ewc_loss": 0.007826688699424267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.826688670320436e-05, + "grad_norm": 3.8762693405151367, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8818407654762268, + "num_tokens": 458316931.0, + "step": 12011 + }, + { + "epoch": 1.5280498664292075, + "ewc_loss": 0.007835661992430687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.835662108846009e-05, + "grad_norm": 3.8595733642578125, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8685574531555176, + "num_tokens": 458356477.0, + "step": 12012 + }, + { + "epoch": 1.528177076707798, + "ewc_loss": 0.007832691073417664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.832690607756376e-05, + "grad_norm": 3.934765100479126, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8612782955169678, + "num_tokens": 458401340.0, + "step": 12013 + }, + { + "epoch": 1.5283042869863885, + "ewc_loss": 0.00788026861846447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.880268822191283e-05, + "grad_norm": 3.926846981048584, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8818909525871277, + "num_tokens": 458436190.0, + "step": 12014 + }, + { + "epoch": 1.5284314972649788, + "ewc_loss": 0.007840781472623348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.840781472623348e-05, + "grad_norm": 3.944620370864868, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8760595321655273, + "num_tokens": 458465513.0, + "step": 12015 + }, + { + "epoch": 
1.5285587075435694, + "ewc_loss": 0.0078574875369668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.857487798901275e-05, + "grad_norm": 3.878453493118286, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8727890849113464, + "num_tokens": 458505465.0, + "step": 12016 + }, + { + "epoch": 1.52868591782216, + "ewc_loss": 0.007817558012902737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.817558071110398e-05, + "grad_norm": 3.8649985790252686, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8737236857414246, + "num_tokens": 458545087.0, + "step": 12017 + }, + { + "epoch": 1.5288131281007504, + "ewc_loss": 0.007827025838196278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.827026274753734e-05, + "grad_norm": 3.9707283973693848, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.882468044757843, + "num_tokens": 458583274.0, + "step": 12018 + }, + { + "epoch": 1.528940338379341, + "ewc_loss": 0.00790531374514103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.905313395895064e-05, + "grad_norm": 3.862651824951172, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8689671158790588, + "num_tokens": 458626026.0, + "step": 12019 + }, + { + "epoch": 1.5290675486579315, + "ewc_loss": 0.007815779186785221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.81577909947373e-05, + "grad_norm": 3.8169589042663574, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8856209516525269, + "num_tokens": 458668774.0, + "step": 12020 + }, + { + "epoch": 1.529194758936522, + "ewc_loss": 0.007841828279197216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.841828482924029e-05, + "grad_norm": 3.9385600090026855, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8789937496185303, + "num_tokens": 458702256.0, + "step": 12021 + }, + { + "epoch": 1.5293219692151125, + "ewc_loss": 0.007901503704488277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.901503704488277e-05, + "grad_norm": 3.916579484939575, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8487938046455383, + "num_tokens": 458741429.0, + "step": 12022 + }, + { + "epoch": 1.529449179493703, + "ewc_loss": 0.007839987985789776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.839987665647641e-05, + "grad_norm": 3.8861987590789795, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8789719939231873, + "num_tokens": 458775608.0, + "step": 12023 + }, + { + "epoch": 1.5295763897722936, + "ewc_loss": 0.007840962149202824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84096191637218e-05, + "grad_norm": 3.8749494552612305, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8792393207550049, + "num_tokens": 458811451.0, + "step": 12024 + }, + { + "epoch": 1.5297036000508841, + "ewc_loss": 0.00784970074892044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.849700341466814e-05, + "grad_norm": 3.853644371032715, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8783106207847595, + "num_tokens": 458851761.0, + "step": 12025 + }, + { + "epoch": 1.5298308103294747, + "ewc_loss": 0.007833064533770084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.833064591977745e-05, + "grad_norm": 3.8813130855560303, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8691169619560242, + "num_tokens": 458888948.0, + "step": 12026 + }, + { + "epoch": 1.5299580206080652, + "ewc_loss": 0.007875938899815083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.875938899815083e-05, + "grad_norm": 3.9314775466918945, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8737439513206482, + "num_tokens": 458922466.0, + "step": 12027 + }, + { + "epoch": 1.5300852308866557, + "ewc_loss": 0.00788844283670187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.888442632975057e-05, + "grad_norm": 3.916388750076294, + "learning_rate": 1e-06, + "loss": 0.3693, + 
"mean_token_accuracy": 0.8705686926841736, + "num_tokens": 458956947.0, + "step": 12028 + }, + { + "epoch": 1.5302124411652462, + "ewc_loss": 0.007859509438276291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.859509059926495e-05, + "grad_norm": 3.844904899597168, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8927285671234131, + "num_tokens": 458993634.0, + "step": 12029 + }, + { + "epoch": 1.5303396514438368, + "ewc_loss": 0.00784311629831791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84311632742174e-05, + "grad_norm": 3.833265542984009, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8841207027435303, + "num_tokens": 459034314.0, + "step": 12030 + }, + { + "epoch": 1.5304668617224273, + "ewc_loss": 0.007863002829253674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.863002974772826e-05, + "grad_norm": 3.8738222122192383, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8804313540458679, + "num_tokens": 459072117.0, + "step": 12031 + }, + { + "epoch": 1.5305940720010178, + "ewc_loss": 0.007882040925323963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.882040517870337e-05, + "grad_norm": 3.8749477863311768, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8693045973777771, + "num_tokens": 459112382.0, + "step": 12032 + }, + { + "epoch": 1.5307212822796084, + "ewc_loss": 0.00787254050374031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.872540300013497e-05, + "grad_norm": 3.8567912578582764, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8869562149047852, + "num_tokens": 459148973.0, + "step": 12033 + }, + { + "epoch": 1.5308484925581987, + "ewc_loss": 0.007862722501158714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.862722122808918e-05, + "grad_norm": 3.949068546295166, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8734158277511597, + "num_tokens": 459183242.0, + "step": 12034 + }, + { + "epoch": 
1.5309757028367892, + "ewc_loss": 0.007927177473902702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.927177648525685e-05, + "grad_norm": 3.883112907409668, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8914971947669983, + "num_tokens": 459219606.0, + "step": 12035 + }, + { + "epoch": 1.5311029131153797, + "ewc_loss": 0.007848060689866543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.848061068216339e-05, + "grad_norm": 3.8851318359375, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8920320272445679, + "num_tokens": 459253763.0, + "step": 12036 + }, + { + "epoch": 1.5312301233939702, + "ewc_loss": 0.00788334384560585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.883343641879037e-05, + "grad_norm": 3.8724091053009033, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8786081075668335, + "num_tokens": 459291187.0, + "step": 12037 + }, + { + "epoch": 1.5313573336725608, + "ewc_loss": 0.007872108370065689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.872108108131215e-05, + "grad_norm": 3.8992860317230225, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8747870922088623, + "num_tokens": 459327423.0, + "step": 12038 + }, + { + "epoch": 1.5314845439511513, + "ewc_loss": 0.007887502200901508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.887502579251304e-05, + "grad_norm": 3.882993698120117, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8727413415908813, + "num_tokens": 459366207.0, + "step": 12039 + }, + { + "epoch": 1.5316117542297416, + "ewc_loss": 0.007869052700698376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86905293352902e-05, + "grad_norm": 3.8768415451049805, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8678128719329834, + "num_tokens": 459408338.0, + "step": 12040 + }, + { + "epoch": 1.5317389645083321, + "ewc_loss": 0.00786697119474411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.866971282055601e-05, + "grad_norm": 3.8986318111419678, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8856912851333618, + "num_tokens": 459443420.0, + "step": 12041 + }, + { + "epoch": 1.5318661747869227, + "ewc_loss": 0.007882917299866676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.882917270762846e-05, + "grad_norm": 3.9795355796813965, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8609803915023804, + "num_tokens": 459475835.0, + "step": 12042 + }, + { + "epoch": 1.5319933850655132, + "ewc_loss": 0.0079287588596344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.928758714115247e-05, + "grad_norm": 3.8722472190856934, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8751794099807739, + "num_tokens": 459511993.0, + "step": 12043 + }, + { + "epoch": 1.5321205953441037, + "ewc_loss": 0.007862985134124756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.862985512474552e-05, + "grad_norm": 3.885157585144043, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8840548396110535, + "num_tokens": 459546221.0, + "step": 12044 + }, + { + "epoch": 1.5322478056226942, + "ewc_loss": 0.007891354151070118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.891354471212253e-05, + "grad_norm": 3.820934534072876, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8794182538986206, + "num_tokens": 459587271.0, + "step": 12045 + }, + { + "epoch": 1.5323750159012848, + "ewc_loss": 0.007844907231628895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.844907668186352e-05, + "grad_norm": 3.9041202068328857, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8639166951179504, + "num_tokens": 459623388.0, + "step": 12046 + }, + { + "epoch": 1.5325022261798753, + "ewc_loss": 0.007926960475742817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.926960097393021e-05, + "grad_norm": 3.826261043548584, + "learning_rate": 1e-06, + "loss": 0.3642, + 
"mean_token_accuracy": 0.8755375146865845, + "num_tokens": 459666440.0, + "step": 12047 + }, + { + "epoch": 1.5326294364584658, + "ewc_loss": 0.007855219766497612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.855219882912934e-05, + "grad_norm": 3.8878304958343506, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8635356426239014, + "num_tokens": 459708373.0, + "step": 12048 + }, + { + "epoch": 1.5327566467370564, + "ewc_loss": 0.007911411114037037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911411375971511e-05, + "grad_norm": 3.858867883682251, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.886921226978302, + "num_tokens": 459746707.0, + "step": 12049 + }, + { + "epoch": 1.532883857015647, + "ewc_loss": 0.00787143874913454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.87143872003071e-05, + "grad_norm": 3.8370859622955322, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8818603754043579, + "num_tokens": 459792493.0, + "step": 12050 + }, + { + "epoch": 1.5330110672942374, + "ewc_loss": 0.007873442955315113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.873442518757656e-05, + "grad_norm": 3.8877310752868652, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8835042119026184, + "num_tokens": 459828897.0, + "step": 12051 + }, + { + "epoch": 1.533138277572828, + "ewc_loss": 0.007872493006289005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.872493006289005e-05, + "grad_norm": 3.901918411254883, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8674875497817993, + "num_tokens": 459866274.0, + "step": 12052 + }, + { + "epoch": 1.5332654878514185, + "ewc_loss": 0.007860204204916954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86020391387865e-05, + "grad_norm": 3.8613882064819336, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8902267813682556, + "num_tokens": 459902442.0, + "step": 12053 + }, + { + "epoch": 
1.533392698130009, + "ewc_loss": 0.007824990898370743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.824991189409047e-05, + "grad_norm": 3.855626106262207, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8824374675750732, + "num_tokens": 459943331.0, + "step": 12054 + }, + { + "epoch": 1.5335199084085995, + "ewc_loss": 0.007837101817131042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.837102020857856e-05, + "grad_norm": 3.935487985610962, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8806015253067017, + "num_tokens": 459975349.0, + "step": 12055 + }, + { + "epoch": 1.53364711868719, + "ewc_loss": 0.007891851477324963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.891851419117302e-05, + "grad_norm": 3.9020988941192627, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8893091082572937, + "num_tokens": 460008295.0, + "step": 12056 + }, + { + "epoch": 1.5337743289657806, + "ewc_loss": 0.007837844081223011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.837844168534502e-05, + "grad_norm": 3.907202959060669, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8799619078636169, + "num_tokens": 460045892.0, + "step": 12057 + }, + { + "epoch": 1.533901539244371, + "ewc_loss": 0.007837307639420033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.837307930458337e-05, + "grad_norm": 3.8607187271118164, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.877021074295044, + "num_tokens": 460084255.0, + "step": 12058 + }, + { + "epoch": 1.5340287495229614, + "ewc_loss": 0.007831262424588203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.831262337276712e-05, + "grad_norm": 3.9067723751068115, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8621944189071655, + "num_tokens": 460123706.0, + "step": 12059 + }, + { + "epoch": 1.534155959801552, + "ewc_loss": 0.00785878673195839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.858786557335407e-05, + "grad_norm": 3.8934311866760254, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.862046480178833, + "num_tokens": 460162351.0, + "step": 12060 + }, + { + "epoch": 1.5342831700801425, + "ewc_loss": 0.007830643095076084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.830643153283745e-05, + "grad_norm": 3.9151642322540283, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8663156628608704, + "num_tokens": 460200135.0, + "step": 12061 + }, + { + "epoch": 1.534410380358733, + "ewc_loss": 0.007860473357141018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.860473124310374e-05, + "grad_norm": 3.8374056816101074, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.870468020439148, + "num_tokens": 460243585.0, + "step": 12062 + }, + { + "epoch": 1.5345375906373235, + "ewc_loss": 0.007805849425494671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.805849600117654e-05, + "grad_norm": 3.9215171337127686, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8703528642654419, + "num_tokens": 460283695.0, + "step": 12063 + }, + { + "epoch": 1.5346648009159138, + "ewc_loss": 0.007885941304266453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8859411587473e-05, + "grad_norm": 3.9071431159973145, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8835724592208862, + "num_tokens": 460317241.0, + "step": 12064 + }, + { + "epoch": 1.5347920111945044, + "ewc_loss": 0.007847029715776443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.847030065022409e-05, + "grad_norm": 3.9061119556427, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.876844048500061, + "num_tokens": 460351900.0, + "step": 12065 + }, + { + "epoch": 1.534919221473095, + "ewc_loss": 0.007847226224839687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84722578828223e-05, + "grad_norm": 3.8529694080352783, + "learning_rate": 1e-06, + "loss": 0.3571, + 
"mean_token_accuracy": 0.8779610395431519, + "num_tokens": 460391672.0, + "step": 12066 + }, + { + "epoch": 1.5350464317516854, + "ewc_loss": 0.007827335968613625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.827336230548099e-05, + "grad_norm": 3.878072500228882, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8688197135925293, + "num_tokens": 460434120.0, + "step": 12067 + }, + { + "epoch": 1.535173642030276, + "ewc_loss": 0.00784609466791153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.846094376873225e-05, + "grad_norm": 3.9021105766296387, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8770161271095276, + "num_tokens": 460474214.0, + "step": 12068 + }, + { + "epoch": 1.5353008523088665, + "ewc_loss": 0.007856144569814205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.856144657125697e-05, + "grad_norm": 3.9497265815734863, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.881129264831543, + "num_tokens": 460507549.0, + "step": 12069 + }, + { + "epoch": 1.535428062587457, + "ewc_loss": 0.007872235961258411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.872236164985225e-05, + "grad_norm": 3.903947353363037, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8688710927963257, + "num_tokens": 460546391.0, + "step": 12070 + }, + { + "epoch": 1.5355552728660475, + "ewc_loss": 0.007839079946279526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.839079626137391e-05, + "grad_norm": 3.923825263977051, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8656847476959229, + "num_tokens": 460585232.0, + "step": 12071 + }, + { + "epoch": 1.535682483144638, + "ewc_loss": 0.00785725750029087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.857257151044905e-05, + "grad_norm": 3.905609369277954, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8823744654655457, + "num_tokens": 460624398.0, + "step": 12072 + }, + { + "epoch": 
1.5358096934232286, + "ewc_loss": 0.00785117782652378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.851177360862494e-05, + "grad_norm": 3.8767828941345215, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8700467348098755, + "num_tokens": 460667193.0, + "step": 12073 + }, + { + "epoch": 1.5359369037018191, + "ewc_loss": 0.007829897105693817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.829897367628291e-05, + "grad_norm": 3.878927707672119, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8533756732940674, + "num_tokens": 460710614.0, + "step": 12074 + }, + { + "epoch": 1.5360641139804097, + "ewc_loss": 0.007856444455683231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.856444426579401e-05, + "grad_norm": 3.8846137523651123, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8719478845596313, + "num_tokens": 460751079.0, + "step": 12075 + }, + { + "epoch": 1.5361913242590002, + "ewc_loss": 0.007856826297938824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.856826414354146e-05, + "grad_norm": 3.864258050918579, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8741961717605591, + "num_tokens": 460786768.0, + "step": 12076 + }, + { + "epoch": 1.5363185345375907, + "ewc_loss": 0.00785010401159525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.850104157114401e-05, + "grad_norm": 3.905168294906616, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.86643385887146, + "num_tokens": 460824757.0, + "step": 12077 + }, + { + "epoch": 1.5364457448161812, + "ewc_loss": 0.007883939892053604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.883939542807639e-05, + "grad_norm": 3.921499490737915, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8690352439880371, + "num_tokens": 460865024.0, + "step": 12078 + }, + { + "epoch": 1.5365729550947718, + "ewc_loss": 0.007867463864386082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.867463864386082e-05, + "grad_norm": 3.935717821121216, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8667623400688171, + "num_tokens": 460900895.0, + "step": 12079 + }, + { + "epoch": 1.5367001653733623, + "ewc_loss": 0.007884291931986809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.884291699156165e-05, + "grad_norm": 3.865309000015259, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8807005882263184, + "num_tokens": 460936876.0, + "step": 12080 + }, + { + "epoch": 1.5368273756519528, + "ewc_loss": 0.007836520671844482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.836520671844482e-05, + "grad_norm": 3.91959547996521, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8612666130065918, + "num_tokens": 460975297.0, + "step": 12081 + }, + { + "epoch": 1.5369545859305433, + "ewc_loss": 0.007880029268562794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.880029443185776e-05, + "grad_norm": 3.937612771987915, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8648066520690918, + "num_tokens": 461008603.0, + "step": 12082 + }, + { + "epoch": 1.5370817962091337, + "ewc_loss": 0.007889277301728725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.889277185313404e-05, + "grad_norm": 3.908478021621704, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8670610189437866, + "num_tokens": 461047770.0, + "step": 12083 + }, + { + "epoch": 1.5372090064877242, + "ewc_loss": 0.007861620746552944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.861620542826131e-05, + "grad_norm": 3.873166084289551, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8639809489250183, + "num_tokens": 461088554.0, + "step": 12084 + }, + { + "epoch": 1.5373362167663147, + "ewc_loss": 0.007862651720643044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.862651546020061e-05, + "grad_norm": 3.912921667098999, + "learning_rate": 1e-06, + "loss": 0.4257, + 
"mean_token_accuracy": 0.8543249368667603, + "num_tokens": 461126070.0, + "step": 12085 + }, + { + "epoch": 1.5374634270449052, + "ewc_loss": 0.007915099151432514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.915098831290379e-05, + "grad_norm": 3.890071153640747, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8726897835731506, + "num_tokens": 461166394.0, + "step": 12086 + }, + { + "epoch": 1.5375906373234958, + "ewc_loss": 0.00786855723708868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868557440815493e-05, + "grad_norm": 3.837214946746826, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8673403263092041, + "num_tokens": 461208263.0, + "step": 12087 + }, + { + "epoch": 1.5377178476020863, + "ewc_loss": 0.007865918800234795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.865918450988829e-05, + "grad_norm": 3.8716320991516113, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8856043219566345, + "num_tokens": 461245054.0, + "step": 12088 + }, + { + "epoch": 1.5378450578806766, + "ewc_loss": 0.007893792353570461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.893792644608766e-05, + "grad_norm": 3.9279959201812744, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8650669455528259, + "num_tokens": 461281584.0, + "step": 12089 + }, + { + "epoch": 1.5379722681592671, + "ewc_loss": 0.00791722722351551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.917227048892528e-05, + "grad_norm": 3.8984503746032715, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8838666677474976, + "num_tokens": 461322247.0, + "step": 12090 + }, + { + "epoch": 1.5380994784378577, + "ewc_loss": 0.007871422916650772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.871422712923959e-05, + "grad_norm": 3.9251041412353516, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8643319606781006, + "num_tokens": 461358371.0, + "step": 12091 + }, + { + "epoch": 
1.5382266887164482, + "ewc_loss": 0.00789814442396164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.898144394857809e-05, + "grad_norm": 3.873213529586792, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8796791434288025, + "num_tokens": 461394666.0, + "step": 12092 + }, + { + "epoch": 1.5383538989950387, + "ewc_loss": 0.007867017760872841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.867017848184332e-05, + "grad_norm": 3.877711772918701, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8862677812576294, + "num_tokens": 461430875.0, + "step": 12093 + }, + { + "epoch": 1.5384811092736292, + "ewc_loss": 0.007904624566435814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.904624362709001e-05, + "grad_norm": 3.8972463607788086, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8709765076637268, + "num_tokens": 461467412.0, + "step": 12094 + }, + { + "epoch": 1.5386083195522198, + "ewc_loss": 0.007895109243690968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.895108865341172e-05, + "grad_norm": 3.9134654998779297, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8556991815567017, + "num_tokens": 461501970.0, + "step": 12095 + }, + { + "epoch": 1.5387355298308103, + "ewc_loss": 0.007904788479208946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.904788799351081e-05, + "grad_norm": 3.9076969623565674, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.862409234046936, + "num_tokens": 461542735.0, + "step": 12096 + }, + { + "epoch": 1.5388627401094008, + "ewc_loss": 0.007896332070231438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.896331953816116e-05, + "grad_norm": 3.8345229625701904, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8773006796836853, + "num_tokens": 461583883.0, + "step": 12097 + }, + { + "epoch": 1.5389899503879914, + "ewc_loss": 0.007863057777285576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.863057544454932e-05, + "grad_norm": 3.88606333732605, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.86837238073349, + "num_tokens": 461625124.0, + "step": 12098 + }, + { + "epoch": 1.5391171606665819, + "ewc_loss": 0.00792775023728609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.927750266389921e-05, + "grad_norm": 3.9258060455322266, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8653931617736816, + "num_tokens": 461660273.0, + "step": 12099 + }, + { + "epoch": 1.5392443709451724, + "ewc_loss": 0.007897837087512016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89783734944649e-05, + "grad_norm": 3.861088275909424, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8696445822715759, + "num_tokens": 461698890.0, + "step": 12100 + }, + { + "epoch": 1.539371581223763, + "ewc_loss": 0.007864839397370815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.864839426474646e-05, + "grad_norm": 3.8851318359375, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8736506104469299, + "num_tokens": 461736927.0, + "step": 12101 + }, + { + "epoch": 1.5394987915023535, + "ewc_loss": 0.00788747426122427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.887474203016609e-05, + "grad_norm": 3.844064474105835, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8731324672698975, + "num_tokens": 461777796.0, + "step": 12102 + }, + { + "epoch": 1.539626001780944, + "ewc_loss": 0.007868529297411442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868529064580798e-05, + "grad_norm": 3.867630958557129, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8846953511238098, + "num_tokens": 461814656.0, + "step": 12103 + }, + { + "epoch": 1.5397532120595345, + "ewc_loss": 0.007888856343925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.888856634963304e-05, + "grad_norm": 3.9111168384552, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 
0.8572187423706055, + "num_tokens": 461852186.0, + "step": 12104 + }, + { + "epoch": 1.539880422338125, + "ewc_loss": 0.00791865959763527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91865968494676e-05, + "grad_norm": 3.8570897579193115, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8875131011009216, + "num_tokens": 461893724.0, + "step": 12105 + }, + { + "epoch": 1.5400076326167156, + "ewc_loss": 0.007874335162341595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.874335278756917e-05, + "grad_norm": 3.9056496620178223, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8614230155944824, + "num_tokens": 461935380.0, + "step": 12106 + }, + { + "epoch": 1.5401348428953059, + "ewc_loss": 0.007910831831395626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91083148214966e-05, + "grad_norm": 3.906578779220581, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8802590370178223, + "num_tokens": 461971463.0, + "step": 12107 + }, + { + "epoch": 1.5402620531738964, + "ewc_loss": 0.007883980870246887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.88398101576604e-05, + "grad_norm": 3.8770413398742676, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8817669153213501, + "num_tokens": 462007514.0, + "step": 12108 + }, + { + "epoch": 1.540389263452487, + "ewc_loss": 0.007865076884627342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86507735028863e-05, + "grad_norm": 3.8661043643951416, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8875026702880859, + "num_tokens": 462046022.0, + "step": 12109 + }, + { + "epoch": 1.5405164737310775, + "ewc_loss": 0.007866894826292992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.866894884500653e-05, + "grad_norm": 3.836038827896118, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8791109919548035, + "num_tokens": 462089367.0, + "step": 12110 + }, + { + "epoch": 1.540643684009668, + "ewc_loss": 
0.007863174192607403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.863173959776759e-05, + "grad_norm": 3.8709521293640137, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8814511299133301, + "num_tokens": 462128558.0, + "step": 12111 + }, + { + "epoch": 1.5407708942882585, + "ewc_loss": 0.007877154275774956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.877154712332413e-05, + "grad_norm": 3.8920323848724365, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.882269024848938, + "num_tokens": 462162639.0, + "step": 12112 + }, + { + "epoch": 1.5408981045668488, + "ewc_loss": 0.007867449894547462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.867450040066615e-05, + "grad_norm": 3.8909974098205566, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8749370574951172, + "num_tokens": 462199303.0, + "step": 12113 + }, + { + "epoch": 1.5410253148454394, + "ewc_loss": 0.007861247286200523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.861247286200523e-05, + "grad_norm": 3.8212602138519287, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8727720975875854, + "num_tokens": 462246095.0, + "step": 12114 + }, + { + "epoch": 1.54115252512403, + "ewc_loss": 0.007819977588951588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.819977327017114e-05, + "grad_norm": 3.8759565353393555, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8713229894638062, + "num_tokens": 462286653.0, + "step": 12115 + }, + { + "epoch": 1.5412797354026204, + "ewc_loss": 0.007881462574005127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.88146280683577e-05, + "grad_norm": 3.8719077110290527, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.870665431022644, + "num_tokens": 462328295.0, + "step": 12116 + }, + { + "epoch": 1.541406945681211, + "ewc_loss": 0.00785510241985321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.855102739995345e-05, + "grad_norm": 
3.987170696258545, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.87711101770401, + "num_tokens": 462365175.0, + "step": 12117 + }, + { + "epoch": 1.5415341559598015, + "ewc_loss": 0.007913599722087383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.913599984021857e-05, + "grad_norm": 3.8960940837860107, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.867047905921936, + "num_tokens": 462407044.0, + "step": 12118 + }, + { + "epoch": 1.541661366238392, + "ewc_loss": 0.007803217973560095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.803217886248603e-05, + "grad_norm": 3.8931705951690674, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8695720434188843, + "num_tokens": 462443819.0, + "step": 12119 + }, + { + "epoch": 1.5417885765169825, + "ewc_loss": 0.007872897200286388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.872897549532354e-05, + "grad_norm": 3.8844971656799316, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8848442435264587, + "num_tokens": 462487319.0, + "step": 12120 + }, + { + "epoch": 1.541915786795573, + "ewc_loss": 0.007824278436601162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.824278873158619e-05, + "grad_norm": 3.8871731758117676, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8756226897239685, + "num_tokens": 462527026.0, + "step": 12121 + }, + { + "epoch": 1.5420429970741636, + "ewc_loss": 0.007824894040822983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.824894419172779e-05, + "grad_norm": 4.071070194244385, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8648035526275635, + "num_tokens": 462557552.0, + "step": 12122 + }, + { + "epoch": 1.5421702073527541, + "ewc_loss": 0.007926867343485355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.926867692731321e-05, + "grad_norm": 3.85666823387146, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.875153124332428, + 
"num_tokens": 462595243.0, + "step": 12123 + }, + { + "epoch": 1.5422974176313446, + "ewc_loss": 0.0077414182014763355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.741418085061014e-05, + "grad_norm": 3.8646440505981445, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8765609264373779, + "num_tokens": 462636077.0, + "step": 12124 + }, + { + "epoch": 1.5424246279099352, + "ewc_loss": 0.007839720696210861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8397206380032e-05, + "grad_norm": 3.8892714977264404, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8871837854385376, + "num_tokens": 462668407.0, + "step": 12125 + }, + { + "epoch": 1.5425518381885257, + "ewc_loss": 0.007836326025426388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.836325676180422e-05, + "grad_norm": 3.866269111633301, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.87442946434021, + "num_tokens": 462707678.0, + "step": 12126 + }, + { + "epoch": 1.5426790484671162, + "ewc_loss": 0.007813689298927784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.813689444446936e-05, + "grad_norm": 3.946500778198242, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8675912022590637, + "num_tokens": 462739722.0, + "step": 12127 + }, + { + "epoch": 1.5428062587457068, + "ewc_loss": 0.00787921342998743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.879213808337227e-05, + "grad_norm": 3.9629862308502197, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8824485540390015, + "num_tokens": 462770369.0, + "step": 12128 + }, + { + "epoch": 1.5429334690242973, + "ewc_loss": 0.007884247228503227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.884247315814719e-05, + "grad_norm": 3.8634421825408936, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8801989555358887, + "num_tokens": 462806126.0, + "step": 12129 + }, + { + "epoch": 1.5430606793028878, + "ewc_loss": 
0.007830016314983368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.830015965737402e-05, + "grad_norm": 3.882180690765381, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.875996470451355, + "num_tokens": 462846585.0, + "step": 12130 + }, + { + "epoch": 1.5431878895814783, + "ewc_loss": 0.007892865687608719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.892865687608719e-05, + "grad_norm": 3.907572031021118, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8714429140090942, + "num_tokens": 462889892.0, + "step": 12131 + }, + { + "epoch": 1.5433150998600687, + "ewc_loss": 0.007880493998527527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.880494376877323e-05, + "grad_norm": 3.894766330718994, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8740885853767395, + "num_tokens": 462927612.0, + "step": 12132 + }, + { + "epoch": 1.5434423101386592, + "ewc_loss": 0.007872873917222023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.872873538872227e-05, + "grad_norm": 3.9014546871185303, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8770390748977661, + "num_tokens": 462971661.0, + "step": 12133 + }, + { + "epoch": 1.5435695204172497, + "ewc_loss": 0.007874535396695137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.874535367591307e-05, + "grad_norm": 3.8540477752685547, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8796219825744629, + "num_tokens": 463011260.0, + "step": 12134 + }, + { + "epoch": 1.5436967306958402, + "ewc_loss": 0.007844804786145687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84480434958823e-05, + "grad_norm": 3.8945016860961914, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8829829692840576, + "num_tokens": 463045395.0, + "step": 12135 + }, + { + "epoch": 1.5438239409744308, + "ewc_loss": 0.00786980427801609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.869804539950565e-05, + "grad_norm": 
3.9524917602539062, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8659929037094116, + "num_tokens": 463081015.0, + "step": 12136 + }, + { + "epoch": 1.5439511512530213, + "ewc_loss": 0.007896781899034977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.896782335592434e-05, + "grad_norm": 3.9291908740997314, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8746691942214966, + "num_tokens": 463115658.0, + "step": 12137 + }, + { + "epoch": 1.5440783615316116, + "ewc_loss": 0.007843363098800182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84336298238486e-05, + "grad_norm": 3.8399274349212646, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8882086277008057, + "num_tokens": 463152084.0, + "step": 12138 + }, + { + "epoch": 1.5442055718102021, + "ewc_loss": 0.007820413447916508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.820413156878203e-05, + "grad_norm": 3.8886189460754395, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8637080192565918, + "num_tokens": 463190137.0, + "step": 12139 + }, + { + "epoch": 1.5443327820887927, + "ewc_loss": 0.007890814915299416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89081459515728e-05, + "grad_norm": 3.9162161350250244, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8740429878234863, + "num_tokens": 463227626.0, + "step": 12140 + }, + { + "epoch": 1.5444599923673832, + "ewc_loss": 0.007871183566749096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.871183333918452e-05, + "grad_norm": 3.8808109760284424, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8749629259109497, + "num_tokens": 463266451.0, + "step": 12141 + }, + { + "epoch": 1.5445872026459737, + "ewc_loss": 0.00783675629645586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.836756412871182e-05, + "grad_norm": 3.8999953269958496, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.864298939704895, + 
"num_tokens": 463305327.0, + "step": 12142 + }, + { + "epoch": 1.5447144129245642, + "ewc_loss": 0.007891129702329636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.891129644121975e-05, + "grad_norm": 3.9114041328430176, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8715503215789795, + "num_tokens": 463345890.0, + "step": 12143 + }, + { + "epoch": 1.5448416232031548, + "ewc_loss": 0.00787829328328371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.878293399699032e-05, + "grad_norm": 3.917119026184082, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8787909150123596, + "num_tokens": 463381583.0, + "step": 12144 + }, + { + "epoch": 1.5449688334817453, + "ewc_loss": 0.007880407385528088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.880407065385953e-05, + "grad_norm": 3.9179301261901855, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8839539885520935, + "num_tokens": 463414442.0, + "step": 12145 + }, + { + "epoch": 1.5450960437603358, + "ewc_loss": 0.007878893055021763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.878892938606441e-05, + "grad_norm": 3.8949599266052246, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8818054795265198, + "num_tokens": 463449517.0, + "step": 12146 + }, + { + "epoch": 1.5452232540389264, + "ewc_loss": 0.007861987687647343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.861987251089886e-05, + "grad_norm": 3.8658816814422607, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8819026350975037, + "num_tokens": 463487952.0, + "step": 12147 + }, + { + "epoch": 1.5453504643175169, + "ewc_loss": 0.00785762257874012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.857622404117137e-05, + "grad_norm": 3.959205389022827, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.874670147895813, + "num_tokens": 463520724.0, + "step": 12148 + }, + { + "epoch": 1.5454776745961074, + "ewc_loss": 
0.007932927459478378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932927837828174e-05, + "grad_norm": 3.850895643234253, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8717209696769714, + "num_tokens": 463564691.0, + "step": 12149 + }, + { + "epoch": 1.545604884874698, + "ewc_loss": 0.007838712073862553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.838712190277874e-05, + "grad_norm": 3.94516921043396, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8724799156188965, + "num_tokens": 463599037.0, + "step": 12150 + }, + { + "epoch": 1.5457320951532885, + "ewc_loss": 0.007936556823551655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.936557085486129e-05, + "grad_norm": 3.9811909198760986, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8636496663093567, + "num_tokens": 463630744.0, + "step": 12151 + }, + { + "epoch": 1.545859305431879, + "ewc_loss": 0.007916666567325592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.916666800156236e-05, + "grad_norm": 3.940584421157837, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.871930718421936, + "num_tokens": 463663551.0, + "step": 12152 + }, + { + "epoch": 1.5459865157104695, + "ewc_loss": 0.00789849553257227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.898495096014813e-05, + "grad_norm": 3.882985830307007, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8862793445587158, + "num_tokens": 463702446.0, + "step": 12153 + }, + { + "epoch": 1.54611372598906, + "ewc_loss": 0.007887767627835274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.88776742410846e-05, + "grad_norm": 3.935044527053833, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8664929270744324, + "num_tokens": 463739324.0, + "step": 12154 + }, + { + "epoch": 1.5462409362676506, + "ewc_loss": 0.007937437854707241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.937438203953207e-05, + "grad_norm": 
3.8998122215270996, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8566551208496094, + "num_tokens": 463780765.0, + "step": 12155 + }, + { + "epoch": 1.5463681465462409, + "ewc_loss": 0.007915869355201721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.915869355201721e-05, + "grad_norm": 3.9192757606506348, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.852555513381958, + "num_tokens": 463817727.0, + "step": 12156 + }, + { + "epoch": 1.5464953568248314, + "ewc_loss": 0.007933911867439747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.933912274893373e-05, + "grad_norm": 3.797334909439087, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8900319337844849, + "num_tokens": 463865793.0, + "step": 12157 + }, + { + "epoch": 1.546622567103422, + "ewc_loss": 0.007846247404813766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.846247171983123e-05, + "grad_norm": 3.8667044639587402, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8752840757369995, + "num_tokens": 463908299.0, + "step": 12158 + }, + { + "epoch": 1.5467497773820125, + "ewc_loss": 0.007953985594213009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953985186759382e-05, + "grad_norm": 3.9176552295684814, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8726370334625244, + "num_tokens": 463944513.0, + "step": 12159 + }, + { + "epoch": 1.546876987660603, + "ewc_loss": 0.007940075360238552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.940075010992587e-05, + "grad_norm": 3.938502550125122, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8677993416786194, + "num_tokens": 463982862.0, + "step": 12160 + }, + { + "epoch": 1.5470041979391935, + "ewc_loss": 0.007917696610093117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.917696348158643e-05, + "grad_norm": 3.8497965335845947, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8789405822753906, + 
"num_tokens": 464023357.0, + "step": 12161 + }, + { + "epoch": 1.5471314082177838, + "ewc_loss": 0.007873043417930603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.873043796280399e-05, + "grad_norm": 3.848191022872925, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8746199011802673, + "num_tokens": 464071889.0, + "step": 12162 + }, + { + "epoch": 1.5472586184963744, + "ewc_loss": 0.007879662327468395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.87966200732626e-05, + "grad_norm": 3.9206392765045166, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8625795841217041, + "num_tokens": 464112334.0, + "step": 12163 + }, + { + "epoch": 1.5473858287749649, + "ewc_loss": 0.007906448096036911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.906447717687115e-05, + "grad_norm": 3.8929688930511475, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8697112798690796, + "num_tokens": 464156833.0, + "step": 12164 + }, + { + "epoch": 1.5475130390535554, + "ewc_loss": 0.007848200388252735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84820003900677e-05, + "grad_norm": 3.8830642700195312, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8792291879653931, + "num_tokens": 464192989.0, + "step": 12165 + }, + { + "epoch": 1.547640249332146, + "ewc_loss": 0.00785069540143013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.850695692468435e-05, + "grad_norm": 3.8580124378204346, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8799624443054199, + "num_tokens": 464232855.0, + "step": 12166 + }, + { + "epoch": 1.5477674596107365, + "ewc_loss": 0.007817370817065239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.817371078999713e-05, + "grad_norm": 3.909482955932617, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8592404723167419, + "num_tokens": 464271860.0, + "step": 12167 + }, + { + "epoch": 1.547894669889327, + "ewc_loss": 
0.007853345014154911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.853344868635759e-05, + "grad_norm": 3.855673313140869, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8880988359451294, + "num_tokens": 464312925.0, + "step": 12168 + }, + { + "epoch": 1.5480218801679175, + "ewc_loss": 0.007813842035830021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.813842239556834e-05, + "grad_norm": 3.9155144691467285, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8705859184265137, + "num_tokens": 464349190.0, + "step": 12169 + }, + { + "epoch": 1.548149090446508, + "ewc_loss": 0.007844514213502407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.844514038879424e-05, + "grad_norm": 3.832958936691284, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8706116676330566, + "num_tokens": 464392031.0, + "step": 12170 + }, + { + "epoch": 1.5482763007250986, + "ewc_loss": 0.0077772908844053745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.777290738886222e-05, + "grad_norm": 3.919229507446289, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8723016381263733, + "num_tokens": 464424884.0, + "step": 12171 + }, + { + "epoch": 1.5484035110036891, + "ewc_loss": 0.007853284478187561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.853284478187561e-05, + "grad_norm": 3.9056341648101807, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8685912489891052, + "num_tokens": 464466047.0, + "step": 12172 + }, + { + "epoch": 1.5485307212822796, + "ewc_loss": 0.007807190530002117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.807190559105948e-05, + "grad_norm": 3.9381234645843506, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8741182088851929, + "num_tokens": 464497293.0, + "step": 12173 + }, + { + "epoch": 1.5486579315608702, + "ewc_loss": 0.007853413932025433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.853413990233094e-05, + "grad_norm": 
3.8812615871429443, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8627339005470276, + "num_tokens": 464541873.0, + "step": 12174 + }, + { + "epoch": 1.5487851418394607, + "ewc_loss": 0.00781008368358016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.810083479853347e-05, + "grad_norm": 3.8879754543304443, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8773159384727478, + "num_tokens": 464579712.0, + "step": 12175 + }, + { + "epoch": 1.5489123521180512, + "ewc_loss": 0.007852436974644661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.852436829125509e-05, + "grad_norm": 3.9067044258117676, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8725141286849976, + "num_tokens": 464612277.0, + "step": 12176 + }, + { + "epoch": 1.5490395623966418, + "ewc_loss": 0.007847258821129799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.847259257687256e-05, + "grad_norm": 3.918813467025757, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8729082942008972, + "num_tokens": 464647062.0, + "step": 12177 + }, + { + "epoch": 1.5491667726752323, + "ewc_loss": 0.007866287603974342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.866287342039868e-05, + "grad_norm": 3.903871536254883, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.877402663230896, + "num_tokens": 464677998.0, + "step": 12178 + }, + { + "epoch": 1.5492939829538228, + "ewc_loss": 0.007862625643610954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86262535257265e-05, + "grad_norm": 3.848567485809326, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8686944246292114, + "num_tokens": 464719205.0, + "step": 12179 + }, + { + "epoch": 1.5494211932324133, + "ewc_loss": 0.007836067117750645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.836067379685119e-05, + "grad_norm": 3.8572652339935303, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.879647970199585, + 
"num_tokens": 464760758.0, + "step": 12180 + }, + { + "epoch": 1.5495484035110036, + "ewc_loss": 0.007869100198149681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.869100227253512e-05, + "grad_norm": 3.910750150680542, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8785160779953003, + "num_tokens": 464797422.0, + "step": 12181 + }, + { + "epoch": 1.5496756137895942, + "ewc_loss": 0.007886976934969425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.88697725511156e-05, + "grad_norm": 3.8854403495788574, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8756414651870728, + "num_tokens": 464832889.0, + "step": 12182 + }, + { + "epoch": 1.5498028240681847, + "ewc_loss": 0.00785843189805746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.858432218199596e-05, + "grad_norm": 3.925349235534668, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8671920895576477, + "num_tokens": 464867743.0, + "step": 12183 + }, + { + "epoch": 1.5499300343467752, + "ewc_loss": 0.007911368273198605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911368447821587e-05, + "grad_norm": 3.911294460296631, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8625779151916504, + "num_tokens": 464904666.0, + "step": 12184 + }, + { + "epoch": 1.5500572446253658, + "ewc_loss": 0.00788953434675932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.889534026617184e-05, + "grad_norm": 3.908437490463257, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8782029747962952, + "num_tokens": 464942886.0, + "step": 12185 + }, + { + "epoch": 1.5501844549039563, + "ewc_loss": 0.007887366227805614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.887366518843919e-05, + "grad_norm": 3.9357197284698486, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8734394907951355, + "num_tokens": 464977463.0, + "step": 12186 + }, + { + "epoch": 1.5503116651825466, + "ewc_loss": 
0.007906333543360233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.906333485152572e-05, + "grad_norm": 3.883178472518921, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8852492570877075, + "num_tokens": 465015732.0, + "step": 12187 + }, + { + "epoch": 1.5504388754611371, + "ewc_loss": 0.007875978015363216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.875978189986199e-05, + "grad_norm": 3.818467140197754, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8821201324462891, + "num_tokens": 465060789.0, + "step": 12188 + }, + { + "epoch": 1.5505660857397277, + "ewc_loss": 0.00785751361399889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.857513264752924e-05, + "grad_norm": 3.89909291267395, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8713533282279968, + "num_tokens": 465097907.0, + "step": 12189 + }, + { + "epoch": 1.5506932960183182, + "ewc_loss": 0.007917015813291073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.917016046121716e-05, + "grad_norm": 3.8550567626953125, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8826386332511902, + "num_tokens": 465139401.0, + "step": 12190 + }, + { + "epoch": 1.5508205062969087, + "ewc_loss": 0.007854167371988297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.854167051846161e-05, + "grad_norm": 3.9555552005767822, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8542402982711792, + "num_tokens": 465176531.0, + "step": 12191 + }, + { + "epoch": 1.5509477165754992, + "ewc_loss": 0.007932509295642376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932509470265359e-05, + "grad_norm": 3.858663558959961, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8822281360626221, + "num_tokens": 465216074.0, + "step": 12192 + }, + { + "epoch": 1.5510749268540898, + "ewc_loss": 0.007849972695112228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.849972462281585e-05, + "grad_norm": 
3.871110200881958, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8863856792449951, + "num_tokens": 465257986.0, + "step": 12193 + }, + { + "epoch": 1.5512021371326803, + "ewc_loss": 0.007888761349022388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.888761319918558e-05, + "grad_norm": 3.89250111579895, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8766036033630371, + "num_tokens": 465296038.0, + "step": 12194 + }, + { + "epoch": 1.5513293474112708, + "ewc_loss": 0.007894031703472137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.894031296018511e-05, + "grad_norm": 3.9467475414276123, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8529833555221558, + "num_tokens": 465334215.0, + "step": 12195 + }, + { + "epoch": 1.5514565576898613, + "ewc_loss": 0.007907809689640999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90780977695249e-05, + "grad_norm": 3.872757911682129, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8739023208618164, + "num_tokens": 465373362.0, + "step": 12196 + }, + { + "epoch": 1.5515837679684519, + "ewc_loss": 0.007844168692827225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.844169158488512e-05, + "grad_norm": 3.923623561859131, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8702384233474731, + "num_tokens": 465410181.0, + "step": 12197 + }, + { + "epoch": 1.5517109782470424, + "ewc_loss": 0.007895857095718384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89585683378391e-05, + "grad_norm": 3.945108652114868, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8834906816482544, + "num_tokens": 465443056.0, + "step": 12198 + }, + { + "epoch": 1.551838188525633, + "ewc_loss": 0.007908454164862633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.908454426797107e-05, + "grad_norm": 3.9014475345611572, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8775847554206848, + 
"num_tokens": 465484509.0, + "step": 12199 + }, + { + "epoch": 1.5519653988042235, + "ewc_loss": 0.007865842431783676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86584205343388e-05, + "grad_norm": 3.9179577827453613, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.872939944267273, + "num_tokens": 465520017.0, + "step": 12200 + }, + { + "epoch": 1.552092609082814, + "ewc_loss": 0.007899286225438118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.899285992607474e-05, + "grad_norm": 3.8656959533691406, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8907172679901123, + "num_tokens": 465557221.0, + "step": 12201 + }, + { + "epoch": 1.5522198193614045, + "ewc_loss": 0.007853090763092041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.853090937715024e-05, + "grad_norm": 3.847139596939087, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8843574523925781, + "num_tokens": 465595455.0, + "step": 12202 + }, + { + "epoch": 1.552347029639995, + "ewc_loss": 0.00785880908370018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.85880911280401e-05, + "grad_norm": 3.881840705871582, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8759177923202515, + "num_tokens": 465636436.0, + "step": 12203 + }, + { + "epoch": 1.5524742399185856, + "ewc_loss": 0.00788082554936409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.880825432948768e-05, + "grad_norm": 3.9307315349578857, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8723421096801758, + "num_tokens": 465668956.0, + "step": 12204 + }, + { + "epoch": 1.5526014501971759, + "ewc_loss": 0.007896100170910358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.896100578363985e-05, + "grad_norm": 3.938122510910034, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8551477789878845, + "num_tokens": 465708812.0, + "step": 12205 + }, + { + "epoch": 1.5527286604757664, + "ewc_loss": 0.007882323116064072, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.882322825025767e-05, + "grad_norm": 3.9232265949249268, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8608642220497131, + "num_tokens": 465742610.0, + "step": 12206 + }, + { + "epoch": 1.552855870754357, + "ewc_loss": 0.007880298420786858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.880298653617501e-05, + "grad_norm": 3.9214625358581543, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8732977509498596, + "num_tokens": 465776468.0, + "step": 12207 + }, + { + "epoch": 1.5529830810329475, + "ewc_loss": 0.007884452119469643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.884452497819439e-05, + "grad_norm": 3.9714362621307373, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8912166357040405, + "num_tokens": 465809300.0, + "step": 12208 + }, + { + "epoch": 1.553110291311538, + "ewc_loss": 0.007922835648059845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.92283535702154e-05, + "grad_norm": 3.8467535972595215, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8766687512397766, + "num_tokens": 465849847.0, + "step": 12209 + }, + { + "epoch": 1.5532375015901285, + "ewc_loss": 0.00783636886626482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.836368604330346e-05, + "grad_norm": 3.8821680545806885, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8859449625015259, + "num_tokens": 465884986.0, + "step": 12210 + }, + { + "epoch": 1.5533647118687188, + "ewc_loss": 0.00788965541869402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.889655535109341e-05, + "grad_norm": 3.929377794265747, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.881805419921875, + "num_tokens": 465919710.0, + "step": 12211 + }, + { + "epoch": 1.5534919221473094, + "ewc_loss": 0.007912898436188698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91289858170785e-05, + "grad_norm": 3.9495208263397217, + 
"learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.867391049861908, + "num_tokens": 465956041.0, + "step": 12212 + }, + { + "epoch": 1.5536191324258999, + "ewc_loss": 0.007896186783909798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.896186434663832e-05, + "grad_norm": 3.865814685821533, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.886346161365509, + "num_tokens": 465994352.0, + "step": 12213 + }, + { + "epoch": 1.5537463427044904, + "ewc_loss": 0.007841906510293484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.841906335670501e-05, + "grad_norm": 3.879227638244629, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8750581741333008, + "num_tokens": 466032493.0, + "step": 12214 + }, + { + "epoch": 1.553873552983081, + "ewc_loss": 0.007897162809967995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.897162868175656e-05, + "grad_norm": 3.9107401371002197, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8782804012298584, + "num_tokens": 466067048.0, + "step": 12215 + }, + { + "epoch": 1.5540007632616715, + "ewc_loss": 0.00791449099779129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.914491288829595e-05, + "grad_norm": 3.9363815784454346, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8541604280471802, + "num_tokens": 466106895.0, + "step": 12216 + }, + { + "epoch": 1.554127973540262, + "ewc_loss": 0.007901282049715519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.901281787781045e-05, + "grad_norm": 3.889923334121704, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8592912554740906, + "num_tokens": 466146740.0, + "step": 12217 + }, + { + "epoch": 1.5542551838188525, + "ewc_loss": 0.007883221842348576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.883221405791119e-05, + "grad_norm": 3.850379467010498, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8808949589729309, + "num_tokens": 466187097.0, + 
"step": 12218 + }, + { + "epoch": 1.554382394097443, + "ewc_loss": 0.007863820530474186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86382079240866e-05, + "grad_norm": 3.9152750968933105, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8659567832946777, + "num_tokens": 466223969.0, + "step": 12219 + }, + { + "epoch": 1.5545096043760336, + "ewc_loss": 0.00791267491877079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.912675209809095e-05, + "grad_norm": 3.8955633640289307, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8861892223358154, + "num_tokens": 466260127.0, + "step": 12220 + }, + { + "epoch": 1.554636814654624, + "ewc_loss": 0.007879158481955528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.879158511059359e-05, + "grad_norm": 3.880039691925049, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8812325596809387, + "num_tokens": 466297219.0, + "step": 12221 + }, + { + "epoch": 1.5547640249332146, + "ewc_loss": 0.007877164520323277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.877164898673072e-05, + "grad_norm": 3.9663541316986084, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8567565083503723, + "num_tokens": 466329474.0, + "step": 12222 + }, + { + "epoch": 1.5548912352118052, + "ewc_loss": 0.007939117029309273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.939116767374799e-05, + "grad_norm": 3.8630099296569824, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8729366064071655, + "num_tokens": 466372801.0, + "step": 12223 + }, + { + "epoch": 1.5550184454903957, + "ewc_loss": 0.007850823923945427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.850823749322444e-05, + "grad_norm": 3.8737692832946777, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8803189992904663, + "num_tokens": 466412950.0, + "step": 12224 + }, + { + "epoch": 1.5551456557689862, + "ewc_loss": 0.007876228541135788, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.876228482928127e-05, + "grad_norm": 3.9293549060821533, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8813731670379639, + "num_tokens": 466448830.0, + "step": 12225 + }, + { + "epoch": 1.5552728660475768, + "ewc_loss": 0.007910645566880703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.910645945230499e-05, + "grad_norm": 3.8872580528259277, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8649739027023315, + "num_tokens": 466489749.0, + "step": 12226 + }, + { + "epoch": 1.5554000763261673, + "ewc_loss": 0.007853548973798752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.853548595448956e-05, + "grad_norm": 3.900285005569458, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8576490879058838, + "num_tokens": 466531320.0, + "step": 12227 + }, + { + "epoch": 1.5555272866047578, + "ewc_loss": 0.007884319871664047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.88432007539086e-05, + "grad_norm": 3.9549624919891357, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.86617112159729, + "num_tokens": 466570001.0, + "step": 12228 + }, + { + "epoch": 1.5556544968833483, + "ewc_loss": 0.007916932925581932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.916933100204915e-05, + "grad_norm": 3.894266366958618, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.875379204750061, + "num_tokens": 466607186.0, + "step": 12229 + }, + { + "epoch": 1.5557817071619386, + "ewc_loss": 0.00786327663809061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.863276550779119e-05, + "grad_norm": 3.842164993286133, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8660436868667603, + "num_tokens": 466649480.0, + "step": 12230 + }, + { + "epoch": 1.5559089174405292, + "ewc_loss": 0.007849454879760742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.849454414099455e-05, + "grad_norm": 3.886812686920166, + "learning_rate": 1e-06, + "loss": 
0.3554, + "mean_token_accuracy": 0.8812371492385864, + "num_tokens": 466688861.0, + "step": 12231 + }, + { + "epoch": 1.5560361277191197, + "ewc_loss": 0.007898367941379547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.898367766756564e-05, + "grad_norm": 4.022851467132568, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8565393090248108, + "num_tokens": 466723200.0, + "step": 12232 + }, + { + "epoch": 1.5561633379977102, + "ewc_loss": 0.007955537177622318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.955537148518488e-05, + "grad_norm": 3.8880295753479004, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8623563051223755, + "num_tokens": 466759639.0, + "step": 12233 + }, + { + "epoch": 1.5562905482763008, + "ewc_loss": 0.00784054584801197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.840545731596649e-05, + "grad_norm": 3.9025352001190186, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8687073588371277, + "num_tokens": 466795748.0, + "step": 12234 + }, + { + "epoch": 1.5564177585548913, + "ewc_loss": 0.007909133099019527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90913327364251e-05, + "grad_norm": 3.85194993019104, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.884283721446991, + "num_tokens": 466832938.0, + "step": 12235 + }, + { + "epoch": 1.5565449688334816, + "ewc_loss": 0.007884941063821316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.88494071457535e-05, + "grad_norm": 3.93935227394104, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8682433366775513, + "num_tokens": 466868710.0, + "step": 12236 + }, + { + "epoch": 1.5566721791120721, + "ewc_loss": 0.007940971292555332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.940971408970654e-05, + "grad_norm": 4.006112098693848, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8722885847091675, + "num_tokens": 466903558.0, + "step": 12237 + }, + { + "epoch": 
1.5567993893906626, + "ewc_loss": 0.007964059710502625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.964059477671981e-05, + "grad_norm": 3.8891122341156006, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8662048578262329, + "num_tokens": 466946785.0, + "step": 12238 + }, + { + "epoch": 1.5569265996692532, + "ewc_loss": 0.007873388938605785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.873388676671311e-05, + "grad_norm": 3.871532678604126, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8659591674804688, + "num_tokens": 466988563.0, + "step": 12239 + }, + { + "epoch": 1.5570538099478437, + "ewc_loss": 0.007929213345050812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.929213461466134e-05, + "grad_norm": 3.8953816890716553, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8678423166275024, + "num_tokens": 467025447.0, + "step": 12240 + }, + { + "epoch": 1.5571810202264342, + "ewc_loss": 0.007936517708003521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.936517795315012e-05, + "grad_norm": 3.91302490234375, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8805892467498779, + "num_tokens": 467060751.0, + "step": 12241 + }, + { + "epoch": 1.5573082305050248, + "ewc_loss": 0.007935701869428158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935702160466462e-05, + "grad_norm": 3.878089666366577, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.873041033744812, + "num_tokens": 467101889.0, + "step": 12242 + }, + { + "epoch": 1.5574354407836153, + "ewc_loss": 0.007919908501207829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.919908239273354e-05, + "grad_norm": 3.91652512550354, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8755890727043152, + "num_tokens": 467136782.0, + "step": 12243 + }, + { + "epoch": 1.5575626510622058, + "ewc_loss": 0.007942975498735905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.9429752076976e-05, + "grad_norm": 3.8715059757232666, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8826766610145569, + "num_tokens": 467174137.0, + "step": 12244 + }, + { + "epoch": 1.5576898613407963, + "ewc_loss": 0.007913908921182156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.913908484624699e-05, + "grad_norm": 3.8757991790771484, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8705053925514221, + "num_tokens": 467213919.0, + "step": 12245 + }, + { + "epoch": 1.5578170716193869, + "ewc_loss": 0.007940276525914669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9402765550185e-05, + "grad_norm": 3.861328363418579, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8822662830352783, + "num_tokens": 467254860.0, + "step": 12246 + }, + { + "epoch": 1.5579442818979774, + "ewc_loss": 0.007922750897705555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.922750955913216e-05, + "grad_norm": 3.83789324760437, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8843488693237305, + "num_tokens": 467296806.0, + "step": 12247 + }, + { + "epoch": 1.558071492176568, + "ewc_loss": 0.007919852621853352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.919852941995487e-05, + "grad_norm": 3.9059813022613525, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8803613781929016, + "num_tokens": 467332882.0, + "step": 12248 + }, + { + "epoch": 1.5581987024551585, + "ewc_loss": 0.007952060550451279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952060695970431e-05, + "grad_norm": 3.842714786529541, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8896262645721436, + "num_tokens": 467375088.0, + "step": 12249 + }, + { + "epoch": 1.558325912733749, + "ewc_loss": 0.007892096415162086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8920966188889e-05, + "grad_norm": 3.9225776195526123, + "learning_rate": 1e-06, + "loss": 0.3841, + 
"mean_token_accuracy": 0.8746000528335571, + "num_tokens": 467413000.0, + "step": 12250 + }, + { + "epoch": 1.5584531230123395, + "ewc_loss": 0.007933239452540874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.933239248814061e-05, + "grad_norm": 3.9110372066497803, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8884375095367432, + "num_tokens": 467450947.0, + "step": 12251 + }, + { + "epoch": 1.55858033329093, + "ewc_loss": 0.007899254560470581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.899254705989733e-05, + "grad_norm": 3.9075982570648193, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8639106154441833, + "num_tokens": 467493770.0, + "step": 12252 + }, + { + "epoch": 1.5587075435695206, + "ewc_loss": 0.00788207072764635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.882070349296555e-05, + "grad_norm": 3.935335636138916, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8704282641410828, + "num_tokens": 467531882.0, + "step": 12253 + }, + { + "epoch": 1.5588347538481109, + "ewc_loss": 0.007902451790869236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.902451761765406e-05, + "grad_norm": 3.877502918243408, + "learning_rate": 1e-06, + "loss": 0.2773, + "mean_token_accuracy": 0.901176393032074, + "num_tokens": 467566059.0, + "step": 12254 + }, + { + "epoch": 1.5589619641267014, + "ewc_loss": 0.007850628346204758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.850628753658384e-05, + "grad_norm": 3.877134323120117, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8888680934906006, + "num_tokens": 467603734.0, + "step": 12255 + }, + { + "epoch": 1.559089174405292, + "ewc_loss": 0.007873463444411755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.873463619034737e-05, + "grad_norm": 3.9412710666656494, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8827120065689087, + "num_tokens": 467641224.0, + "step": 12256 + }, + { + "epoch": 
1.5592163846838825, + "ewc_loss": 0.00789754930883646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.897549221524969e-05, + "grad_norm": 3.914348602294922, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8733615279197693, + "num_tokens": 467677576.0, + "step": 12257 + }, + { + "epoch": 1.559343594962473, + "ewc_loss": 0.007842183113098145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.842183549655601e-05, + "grad_norm": 3.8752427101135254, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.881196916103363, + "num_tokens": 467717526.0, + "step": 12258 + }, + { + "epoch": 1.5594708052410635, + "ewc_loss": 0.007840973325073719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.840973557904363e-05, + "grad_norm": 3.8819937705993652, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8708890676498413, + "num_tokens": 467757591.0, + "step": 12259 + }, + { + "epoch": 1.5595980155196538, + "ewc_loss": 0.007841655053198338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.841655315132812e-05, + "grad_norm": 3.9650521278381348, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8604862689971924, + "num_tokens": 467792794.0, + "step": 12260 + }, + { + "epoch": 1.5597252257982444, + "ewc_loss": 0.007876659743487835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.876659947214648e-05, + "grad_norm": 3.889206647872925, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8684793710708618, + "num_tokens": 467829554.0, + "step": 12261 + }, + { + "epoch": 1.5598524360768349, + "ewc_loss": 0.007814730517566204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.814730633981526e-05, + "grad_norm": 3.937373161315918, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8582535982131958, + "num_tokens": 467871403.0, + "step": 12262 + }, + { + "epoch": 1.5599796463554254, + "ewc_loss": 0.00786806084215641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.868060492910445e-05, + "grad_norm": 3.912031888961792, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8561638593673706, + "num_tokens": 467906498.0, + "step": 12263 + }, + { + "epoch": 1.560106856634016, + "ewc_loss": 0.007848835550248623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.848835957702249e-05, + "grad_norm": 3.88236927986145, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8729071617126465, + "num_tokens": 467946426.0, + "step": 12264 + }, + { + "epoch": 1.5602340669126065, + "ewc_loss": 0.0078398697078228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.839869795134291e-05, + "grad_norm": 3.93239688873291, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8769553899765015, + "num_tokens": 467979930.0, + "step": 12265 + }, + { + "epoch": 1.560361277191197, + "ewc_loss": 0.007892371155321598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.892370922490954e-05, + "grad_norm": 3.9093596935272217, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.868499755859375, + "num_tokens": 468016347.0, + "step": 12266 + }, + { + "epoch": 1.5604884874697875, + "ewc_loss": 0.007874829694628716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.87482931627892e-05, + "grad_norm": 3.891907215118408, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.881646454334259, + "num_tokens": 468051446.0, + "step": 12267 + }, + { + "epoch": 1.560615697748378, + "ewc_loss": 0.007880156859755516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.880156772444025e-05, + "grad_norm": 3.9131245613098145, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8901869654655457, + "num_tokens": 468085713.0, + "step": 12268 + }, + { + "epoch": 1.5607429080269686, + "ewc_loss": 0.007919001393020153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.919001654954627e-05, + "grad_norm": 3.9239344596862793, + "learning_rate": 1e-06, + "loss": 0.3351, + 
"mean_token_accuracy": 0.886847972869873, + "num_tokens": 468117863.0, + "step": 12269 + }, + { + "epoch": 1.560870118305559, + "ewc_loss": 0.00790427252650261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.904272933956236e-05, + "grad_norm": 3.896967649459839, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8718091249465942, + "num_tokens": 468153041.0, + "step": 12270 + }, + { + "epoch": 1.5609973285841496, + "ewc_loss": 0.007917930372059345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91793063399382e-05, + "grad_norm": 3.8977603912353516, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8731586933135986, + "num_tokens": 468193624.0, + "step": 12271 + }, + { + "epoch": 1.5611245388627402, + "ewc_loss": 0.00791978370398283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.919783820398152e-05, + "grad_norm": 3.857431650161743, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8892911672592163, + "num_tokens": 468227091.0, + "step": 12272 + }, + { + "epoch": 1.5612517491413307, + "ewc_loss": 0.007900248281657696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.900248601799831e-05, + "grad_norm": 3.905961751937866, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8708765506744385, + "num_tokens": 468265079.0, + "step": 12273 + }, + { + "epoch": 1.5613789594199212, + "ewc_loss": 0.007967481389641762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.967481360537931e-05, + "grad_norm": 3.8660449981689453, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8722466826438904, + "num_tokens": 468307055.0, + "step": 12274 + }, + { + "epoch": 1.5615061696985117, + "ewc_loss": 0.00791008211672306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9100820585154e-05, + "grad_norm": 3.900153398513794, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8696234822273254, + "num_tokens": 468344429.0, + "step": 12275 + }, + { + "epoch": 
1.5616333799771023, + "ewc_loss": 0.007964320480823517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96432068455033e-05, + "grad_norm": 3.9014995098114014, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8744339942932129, + "num_tokens": 468379448.0, + "step": 12276 + }, + { + "epoch": 1.5617605902556928, + "ewc_loss": 0.007938084192574024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.938084308989346e-05, + "grad_norm": 3.846834659576416, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8763591647148132, + "num_tokens": 468423344.0, + "step": 12277 + }, + { + "epoch": 1.5618878005342833, + "ewc_loss": 0.00790855847299099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90855847299099e-05, + "grad_norm": 3.8710241317749023, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8804728388786316, + "num_tokens": 468461657.0, + "step": 12278 + }, + { + "epoch": 1.5620150108128736, + "ewc_loss": 0.007943008095026016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.943007949506864e-05, + "grad_norm": 3.880221366882324, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8893836140632629, + "num_tokens": 468500428.0, + "step": 12279 + }, + { + "epoch": 1.5621422210914642, + "ewc_loss": 0.00792511273175478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.92511273175478e-05, + "grad_norm": 3.918365240097046, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8637328147888184, + "num_tokens": 468536310.0, + "step": 12280 + }, + { + "epoch": 1.5622694313700547, + "ewc_loss": 0.007941070944070816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.941071089589968e-05, + "grad_norm": 3.8779873847961426, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8721362352371216, + "num_tokens": 468577954.0, + "step": 12281 + }, + { + "epoch": 1.5623966416486452, + "ewc_loss": 0.00790176447480917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.901764183770865e-05, + "grad_norm": 3.9431562423706055, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8652854561805725, + "num_tokens": 468612131.0, + "step": 12282 + }, + { + "epoch": 1.5625238519272358, + "ewc_loss": 0.007939347065985203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.939347415231168e-05, + "grad_norm": 3.8424696922302246, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8709124326705933, + "num_tokens": 468654022.0, + "step": 12283 + }, + { + "epoch": 1.5626510622058263, + "ewc_loss": 0.007848522625863552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.848522363929078e-05, + "grad_norm": 3.9017059803009033, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8659143447875977, + "num_tokens": 468693120.0, + "step": 12284 + }, + { + "epoch": 1.5627782724844166, + "ewc_loss": 0.007915244437754154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.915244350442663e-05, + "grad_norm": 3.9250447750091553, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8738937377929688, + "num_tokens": 468727695.0, + "step": 12285 + }, + { + "epoch": 1.5629054827630071, + "ewc_loss": 0.007902602665126324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90260310168378e-05, + "grad_norm": 3.873809337615967, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8709871172904968, + "num_tokens": 468766705.0, + "step": 12286 + }, + { + "epoch": 1.5630326930415976, + "ewc_loss": 0.007874197326600552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.874197035562247e-05, + "grad_norm": 3.941276788711548, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8673640489578247, + "num_tokens": 468804555.0, + "step": 12287 + }, + { + "epoch": 1.5631599033201882, + "ewc_loss": 0.007915378548204899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.915378228062764e-05, + "grad_norm": 3.8878836631774902, + "learning_rate": 1e-06, + "loss": 0.3821, + 
"mean_token_accuracy": 0.8713733553886414, + "num_tokens": 468842495.0, + "step": 12288 + }, + { + "epoch": 1.5632871135987787, + "ewc_loss": 0.007869122549891472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.869122782722116e-05, + "grad_norm": 3.902031183242798, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8733055591583252, + "num_tokens": 468877574.0, + "step": 12289 + }, + { + "epoch": 1.5634143238773692, + "ewc_loss": 0.007896882481873035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89688274380751e-05, + "grad_norm": 3.8842215538024902, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8843203783035278, + "num_tokens": 468914573.0, + "step": 12290 + }, + { + "epoch": 1.5635415341559598, + "ewc_loss": 0.007894735783338547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.894735608715564e-05, + "grad_norm": 3.891866445541382, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.885010838508606, + "num_tokens": 468950943.0, + "step": 12291 + }, + { + "epoch": 1.5636687444345503, + "ewc_loss": 0.007909455336630344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.909455598564819e-05, + "grad_norm": 3.8656978607177734, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8893468976020813, + "num_tokens": 468990007.0, + "step": 12292 + }, + { + "epoch": 1.5637959547131408, + "ewc_loss": 0.007892654277384281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.892654684837908e-05, + "grad_norm": 3.916003704071045, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8782073855400085, + "num_tokens": 469022461.0, + "step": 12293 + }, + { + "epoch": 1.5639231649917313, + "ewc_loss": 0.007916049100458622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.916049071354792e-05, + "grad_norm": 3.9111249446868896, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8789803981781006, + "num_tokens": 469056396.0, + "step": 12294 + }, + { + "epoch": 
1.5640503752703219, + "ewc_loss": 0.007910199463367462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.910199201432988e-05, + "grad_norm": 3.855799436569214, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8828165531158447, + "num_tokens": 469099464.0, + "step": 12295 + }, + { + "epoch": 1.5641775855489124, + "ewc_loss": 0.007882745005190372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.88274483056739e-05, + "grad_norm": 3.852203607559204, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.883073091506958, + "num_tokens": 469139830.0, + "step": 12296 + }, + { + "epoch": 1.564304795827503, + "ewc_loss": 0.007884153164923191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.884152728365734e-05, + "grad_norm": 3.9021315574645996, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8745108246803284, + "num_tokens": 469177838.0, + "step": 12297 + }, + { + "epoch": 1.5644320061060935, + "ewc_loss": 0.007890905253589153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.890904817031696e-05, + "grad_norm": 3.8367550373077393, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.8980804085731506, + "num_tokens": 469218727.0, + "step": 12298 + }, + { + "epoch": 1.564559216384684, + "ewc_loss": 0.007842318154871464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.842318154871464e-05, + "grad_norm": 3.902841806411743, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8525956869125366, + "num_tokens": 469261186.0, + "step": 12299 + }, + { + "epoch": 1.5646864266632745, + "ewc_loss": 0.007909241132438183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9092409578152e-05, + "grad_norm": 3.8826372623443604, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8827338218688965, + "num_tokens": 469301191.0, + "step": 12300 + }, + { + "epoch": 1.564813636941865, + "ewc_loss": 0.007839851081371307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.839850877644494e-05, + "grad_norm": 3.862366199493408, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.879570722579956, + "num_tokens": 469340132.0, + "step": 12301 + }, + { + "epoch": 1.5649408472204556, + "ewc_loss": 0.007844790816307068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.844791252864525e-05, + "grad_norm": 3.88177490234375, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8920333385467529, + "num_tokens": 469376211.0, + "step": 12302 + }, + { + "epoch": 1.5650680574990459, + "ewc_loss": 0.007862349040806293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.862348866183311e-05, + "grad_norm": 3.9341704845428467, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8648431301116943, + "num_tokens": 469413961.0, + "step": 12303 + }, + { + "epoch": 1.5651952677776364, + "ewc_loss": 0.00786135345697403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86135351518169e-05, + "grad_norm": 3.9032161235809326, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8746524453163147, + "num_tokens": 469449507.0, + "step": 12304 + }, + { + "epoch": 1.565322478056227, + "ewc_loss": 0.007837995886802673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.837996236048639e-05, + "grad_norm": 3.9239964485168457, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8628208637237549, + "num_tokens": 469486324.0, + "step": 12305 + }, + { + "epoch": 1.5654496883348175, + "ewc_loss": 0.007877877913415432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.877877942519262e-05, + "grad_norm": 3.918865203857422, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8689910769462585, + "num_tokens": 469523799.0, + "step": 12306 + }, + { + "epoch": 1.565576898613408, + "ewc_loss": 0.007845220156013966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.845219806768e-05, + "grad_norm": 3.93689227104187, + "learning_rate": 1e-06, + "loss": 0.4058, + 
"mean_token_accuracy": 0.8613885641098022, + "num_tokens": 469558661.0, + "step": 12307 + }, + { + "epoch": 1.5657041088919985, + "ewc_loss": 0.007888631895184517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.888631807873026e-05, + "grad_norm": 3.8568031787872314, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8732011318206787, + "num_tokens": 469603087.0, + "step": 12308 + }, + { + "epoch": 1.5658313191705888, + "ewc_loss": 0.007857054471969604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.857054879423231e-05, + "grad_norm": 3.93845272064209, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8751227259635925, + "num_tokens": 469637344.0, + "step": 12309 + }, + { + "epoch": 1.5659585294491793, + "ewc_loss": 0.00792449526488781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924495730549097e-05, + "grad_norm": 3.839282274246216, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8828961253166199, + "num_tokens": 469681127.0, + "step": 12310 + }, + { + "epoch": 1.5660857397277699, + "ewc_loss": 0.007843510247766972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.843510684324428e-05, + "grad_norm": 3.8822946548461914, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8725076913833618, + "num_tokens": 469724238.0, + "step": 12311 + }, + { + "epoch": 1.5662129500063604, + "ewc_loss": 0.00790190976113081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.901909702923149e-05, + "grad_norm": 3.8886985778808594, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8683988451957703, + "num_tokens": 469765967.0, + "step": 12312 + }, + { + "epoch": 1.566340160284951, + "ewc_loss": 0.007886658422648907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.886658568168059e-05, + "grad_norm": 3.9121921062469482, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8735479712486267, + "num_tokens": 469801387.0, + "step": 12313 + }, + { + "epoch": 
1.5664673705635415, + "ewc_loss": 0.007885000668466091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.885001105023548e-05, + "grad_norm": 3.9345932006835938, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8770909309387207, + "num_tokens": 469835448.0, + "step": 12314 + }, + { + "epoch": 1.566594580842132, + "ewc_loss": 0.007893318310379982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.893318252172321e-05, + "grad_norm": 3.8713629245758057, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.866510272026062, + "num_tokens": 469876647.0, + "step": 12315 + }, + { + "epoch": 1.5667217911207225, + "ewc_loss": 0.007849827408790588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.849826943129301e-05, + "grad_norm": 3.834786891937256, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8884444236755371, + "num_tokens": 469918158.0, + "step": 12316 + }, + { + "epoch": 1.566849001399313, + "ewc_loss": 0.00785250123590231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.852501585148275e-05, + "grad_norm": 3.937405586242676, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8615313768386841, + "num_tokens": 469953691.0, + "step": 12317 + }, + { + "epoch": 1.5669762116779036, + "ewc_loss": 0.007934502325952053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.934502355055884e-05, + "grad_norm": 3.861785650253296, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8771004676818848, + "num_tokens": 469996174.0, + "step": 12318 + }, + { + "epoch": 1.567103421956494, + "ewc_loss": 0.007851190865039825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.851190457586199e-05, + "grad_norm": 3.9461686611175537, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8631887435913086, + "num_tokens": 470031371.0, + "step": 12319 + }, + { + "epoch": 1.5672306322350846, + "ewc_loss": 0.007936526089906693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.936525798868388e-05, + "grad_norm": 3.9054319858551025, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8795474767684937, + "num_tokens": 470068783.0, + "step": 12320 + }, + { + "epoch": 1.5673578425136752, + "ewc_loss": 0.00788052100688219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.880520570324734e-05, + "grad_norm": 3.9361650943756104, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8583924770355225, + "num_tokens": 470103490.0, + "step": 12321 + }, + { + "epoch": 1.5674850527922657, + "ewc_loss": 0.00790668185800314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.906682003522292e-05, + "grad_norm": 3.935373067855835, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8707009553909302, + "num_tokens": 470137213.0, + "step": 12322 + }, + { + "epoch": 1.5676122630708562, + "ewc_loss": 0.007913060486316681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.913060107966885e-05, + "grad_norm": 3.912212610244751, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8775490522384644, + "num_tokens": 470171816.0, + "step": 12323 + }, + { + "epoch": 1.5677394733494467, + "ewc_loss": 0.00790948886424303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.909489067969844e-05, + "grad_norm": 3.912087917327881, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8538241386413574, + "num_tokens": 470210517.0, + "step": 12324 + }, + { + "epoch": 1.5678666836280373, + "ewc_loss": 0.007919231429696083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.919231575215235e-05, + "grad_norm": 3.914630651473999, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.862433910369873, + "num_tokens": 470247085.0, + "step": 12325 + }, + { + "epoch": 1.5679938939066278, + "ewc_loss": 0.007935544475913048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935544272186235e-05, + "grad_norm": 3.868602752685547, + "learning_rate": 1e-06, + "loss": 0.365, + 
"mean_token_accuracy": 0.8733505010604858, + "num_tokens": 470288483.0, + "step": 12326 + }, + { + "epoch": 1.5681211041852183, + "ewc_loss": 0.007918411865830421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.918411574792117e-05, + "grad_norm": 3.99407958984375, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8739557862281799, + "num_tokens": 470322173.0, + "step": 12327 + }, + { + "epoch": 1.5682483144638086, + "ewc_loss": 0.008009922690689564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.009922748897225e-05, + "grad_norm": 3.9105401039123535, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8742509484291077, + "num_tokens": 470356617.0, + "step": 12328 + }, + { + "epoch": 1.5683755247423992, + "ewc_loss": 0.007908771745860577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.908771658549085e-05, + "grad_norm": 3.87976336479187, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8860409259796143, + "num_tokens": 470393095.0, + "step": 12329 + }, + { + "epoch": 1.5685027350209897, + "ewc_loss": 0.007932533510029316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932533480925485e-05, + "grad_norm": 3.894575595855713, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8856284618377686, + "num_tokens": 470427785.0, + "step": 12330 + }, + { + "epoch": 1.5686299452995802, + "ewc_loss": 0.007940612733364105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.940612704260275e-05, + "grad_norm": 3.8486244678497314, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8703486919403076, + "num_tokens": 470469971.0, + "step": 12331 + }, + { + "epoch": 1.5687571555781707, + "ewc_loss": 0.00791947077959776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.919470954220742e-05, + "grad_norm": 3.9608981609344482, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8661932945251465, + "num_tokens": 470503935.0, + "step": 12332 + }, + { + "epoch": 
1.5688843658567613, + "ewc_loss": 0.008002743124961853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00274356151931e-05, + "grad_norm": 3.9595890045166016, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8775984048843384, + "num_tokens": 470536014.0, + "step": 12333 + }, + { + "epoch": 1.5690115761353516, + "ewc_loss": 0.007956470362842083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.956470653880388e-05, + "grad_norm": 3.8750948905944824, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8862591981887817, + "num_tokens": 470575632.0, + "step": 12334 + }, + { + "epoch": 1.569138786413942, + "ewc_loss": 0.00794205255806446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.942052616272122e-05, + "grad_norm": 3.9392218589782715, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8846058249473572, + "num_tokens": 470611295.0, + "step": 12335 + }, + { + "epoch": 1.5692659966925326, + "ewc_loss": 0.007989548146724701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.989547884790227e-05, + "grad_norm": 3.8855578899383545, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8723149299621582, + "num_tokens": 470646993.0, + "step": 12336 + }, + { + "epoch": 1.5693932069711232, + "ewc_loss": 0.007935648784041405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935648318380117e-05, + "grad_norm": 3.9703779220581055, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8772193193435669, + "num_tokens": 470680253.0, + "step": 12337 + }, + { + "epoch": 1.5695204172497137, + "ewc_loss": 0.008012066595256329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.012066246010363e-05, + "grad_norm": 3.8946523666381836, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8705935478210449, + "num_tokens": 470716169.0, + "step": 12338 + }, + { + "epoch": 1.5696476275283042, + "ewc_loss": 0.007937210611999035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.937210466479883e-05, + "grad_norm": 3.9172797203063965, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8881134986877441, + "num_tokens": 470754208.0, + "step": 12339 + }, + { + "epoch": 1.5697748378068948, + "ewc_loss": 0.007969588972628117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96958920545876e-05, + "grad_norm": 3.8115878105163574, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8768373727798462, + "num_tokens": 470801329.0, + "step": 12340 + }, + { + "epoch": 1.5699020480854853, + "ewc_loss": 0.007885606028139591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.885605737101287e-05, + "grad_norm": 3.9482414722442627, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8696733117103577, + "num_tokens": 470838408.0, + "step": 12341 + }, + { + "epoch": 1.5700292583640758, + "ewc_loss": 0.008004037663340569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.004037954378873e-05, + "grad_norm": 3.9005675315856934, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8756082653999329, + "num_tokens": 470874001.0, + "step": 12342 + }, + { + "epoch": 1.5701564686426663, + "ewc_loss": 0.007918821647763252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.918821211205795e-05, + "grad_norm": 3.859416961669922, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8702517747879028, + "num_tokens": 470916870.0, + "step": 12343 + }, + { + "epoch": 1.5702836789212569, + "ewc_loss": 0.00792501401156187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.925013778731227e-05, + "grad_norm": 3.9062793254852295, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8821659684181213, + "num_tokens": 470953862.0, + "step": 12344 + }, + { + "epoch": 1.5704108891998474, + "ewc_loss": 0.007952543906867504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952543819556013e-05, + "grad_norm": 3.9273080825805664, + "learning_rate": 1e-06, + "loss": 0.3362, + 
"mean_token_accuracy": 0.8844528198242188, + "num_tokens": 470991507.0, + "step": 12345 + }, + { + "epoch": 1.570538099478438, + "ewc_loss": 0.007941066287457943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9410667240154e-05, + "grad_norm": 3.920603036880493, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8776998519897461, + "num_tokens": 471030109.0, + "step": 12346 + }, + { + "epoch": 1.5706653097570284, + "ewc_loss": 0.007902994751930237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.902994548203424e-05, + "grad_norm": 3.90632700920105, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8881494402885437, + "num_tokens": 471063829.0, + "step": 12347 + }, + { + "epoch": 1.570792520035619, + "ewc_loss": 0.007911769673228264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91177008068189e-05, + "grad_norm": 3.904059410095215, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8656231164932251, + "num_tokens": 471107259.0, + "step": 12348 + }, + { + "epoch": 1.5709197303142095, + "ewc_loss": 0.007887180894613266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.887180981924757e-05, + "grad_norm": 3.9384262561798096, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8778805732727051, + "num_tokens": 471141516.0, + "step": 12349 + }, + { + "epoch": 1.5710469405928, + "ewc_loss": 0.007913426496088505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.913426088634878e-05, + "grad_norm": 3.936249017715454, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8762195110321045, + "num_tokens": 471178655.0, + "step": 12350 + }, + { + "epoch": 1.5711741508713906, + "ewc_loss": 0.007901275530457497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.901275239419192e-05, + "grad_norm": 3.955655336380005, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8858556151390076, + "num_tokens": 471209437.0, + "step": 12351 + }, + { + "epoch": 
1.5713013611499809, + "ewc_loss": 0.007898910902440548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.898911280790344e-05, + "grad_norm": 3.8266725540161133, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8893508911132812, + "num_tokens": 471249559.0, + "step": 12352 + }, + { + "epoch": 1.5714285714285714, + "ewc_loss": 0.007830667309463024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.830667163943872e-05, + "grad_norm": 3.9232444763183594, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8794877529144287, + "num_tokens": 471287179.0, + "step": 12353 + }, + { + "epoch": 1.571555781707162, + "ewc_loss": 0.007927348837256432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.927348633529618e-05, + "grad_norm": 3.870363235473633, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8790749311447144, + "num_tokens": 471328052.0, + "step": 12354 + }, + { + "epoch": 1.5716829919857525, + "ewc_loss": 0.007866254076361656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.866253872634843e-05, + "grad_norm": 3.936438798904419, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8727407455444336, + "num_tokens": 471362414.0, + "step": 12355 + }, + { + "epoch": 1.571810202264343, + "ewc_loss": 0.00793434027582407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.934340101201087e-05, + "grad_norm": 3.9435794353485107, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8727926015853882, + "num_tokens": 471397910.0, + "step": 12356 + }, + { + "epoch": 1.5719374125429335, + "ewc_loss": 0.007918472401797771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.918472692836076e-05, + "grad_norm": 3.8551647663116455, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8897415995597839, + "num_tokens": 471436951.0, + "step": 12357 + }, + { + "epoch": 1.5720646228215238, + "ewc_loss": 0.007861380465328693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.861380436224863e-05, + "grad_norm": 3.9467921257019043, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8785207271575928, + "num_tokens": 471470139.0, + "step": 12358 + }, + { + "epoch": 1.5721918331001143, + "ewc_loss": 0.00795041024684906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.950409781187773e-05, + "grad_norm": 3.8515543937683105, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8830759525299072, + "num_tokens": 471508591.0, + "step": 12359 + }, + { + "epoch": 1.5723190433787049, + "ewc_loss": 0.007871684618294239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.87168464739807e-05, + "grad_norm": 3.892946720123291, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8747158050537109, + "num_tokens": 471549165.0, + "step": 12360 + }, + { + "epoch": 1.5724462536572954, + "ewc_loss": 0.007927939295768738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.92793944128789e-05, + "grad_norm": 3.8666462898254395, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8709468841552734, + "num_tokens": 471589021.0, + "step": 12361 + }, + { + "epoch": 1.572573463935886, + "ewc_loss": 0.007893307134509087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8933073382359e-05, + "grad_norm": 3.901703119277954, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8701167106628418, + "num_tokens": 471624263.0, + "step": 12362 + }, + { + "epoch": 1.5727006742144765, + "ewc_loss": 0.007942765951156616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.942765660118312e-05, + "grad_norm": 3.878413200378418, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8701874613761902, + "num_tokens": 471663678.0, + "step": 12363 + }, + { + "epoch": 1.572827884493067, + "ewc_loss": 0.007906829006969929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.906828977866098e-05, + "grad_norm": 3.872833013534546, + "learning_rate": 1e-06, + "loss": 0.3788, + 
"mean_token_accuracy": 0.8744109869003296, + "num_tokens": 471705887.0, + "step": 12364 + }, + { + "epoch": 1.5729550947716575, + "ewc_loss": 0.00791612733155489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.916127651697025e-05, + "grad_norm": 3.8998711109161377, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8826643228530884, + "num_tokens": 471739796.0, + "step": 12365 + }, + { + "epoch": 1.573082305050248, + "ewc_loss": 0.007929415442049503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.929415733087808e-05, + "grad_norm": 4.004571914672852, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8715830445289612, + "num_tokens": 471774723.0, + "step": 12366 + }, + { + "epoch": 1.5732095153288386, + "ewc_loss": 0.007987997494637966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.987997378222644e-05, + "grad_norm": 3.919213056564331, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8688063621520996, + "num_tokens": 471814169.0, + "step": 12367 + }, + { + "epoch": 1.573336725607429, + "ewc_loss": 0.007889287546277046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.889287371654063e-05, + "grad_norm": 3.880875825881958, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8756262063980103, + "num_tokens": 471852367.0, + "step": 12368 + }, + { + "epoch": 1.5734639358860196, + "ewc_loss": 0.007917257957160473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.917257607914507e-05, + "grad_norm": 3.9449760913848877, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.881071925163269, + "num_tokens": 471883438.0, + "step": 12369 + }, + { + "epoch": 1.5735911461646102, + "ewc_loss": 0.007961690425872803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.961690425872803e-05, + "grad_norm": 3.8964059352874756, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8712566494941711, + "num_tokens": 471921051.0, + "step": 12370 + }, + { + "epoch": 
1.5737183564432007, + "ewc_loss": 0.00788160040974617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.881600322434679e-05, + "grad_norm": 3.8829398155212402, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8794969320297241, + "num_tokens": 471959685.0, + "step": 12371 + }, + { + "epoch": 1.5738455667217912, + "ewc_loss": 0.007923036813735962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.923036901047453e-05, + "grad_norm": 3.911893367767334, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8752503395080566, + "num_tokens": 471997345.0, + "step": 12372 + }, + { + "epoch": 1.5739727770003817, + "ewc_loss": 0.007925604470074177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.925604586489499e-05, + "grad_norm": 3.9325520992279053, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8809444904327393, + "num_tokens": 472034455.0, + "step": 12373 + }, + { + "epoch": 1.5740999872789723, + "ewc_loss": 0.007914389483630657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.914389425422996e-05, + "grad_norm": 3.9104092121124268, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8755083680152893, + "num_tokens": 472073630.0, + "step": 12374 + }, + { + "epoch": 1.5742271975575628, + "ewc_loss": 0.00789574347436428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.895743328845128e-05, + "grad_norm": 3.878272533416748, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8729709386825562, + "num_tokens": 472111556.0, + "step": 12375 + }, + { + "epoch": 1.5743544078361533, + "ewc_loss": 0.007866681553423405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.866681698942557e-05, + "grad_norm": 3.8949408531188965, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8805943727493286, + "num_tokens": 472148276.0, + "step": 12376 + }, + { + "epoch": 1.5744816181147436, + "ewc_loss": 0.007893329486250877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.893329166108742e-05, + "grad_norm": 3.953359603881836, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8750528693199158, + "num_tokens": 472181665.0, + "step": 12377 + }, + { + "epoch": 1.5746088283933342, + "ewc_loss": 0.007903074845671654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90307458373718e-05, + "grad_norm": 3.904712438583374, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8697341680526733, + "num_tokens": 472222759.0, + "step": 12378 + }, + { + "epoch": 1.5747360386719247, + "ewc_loss": 0.007858297787606716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.858297612983733e-05, + "grad_norm": 3.8838422298431396, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8703898191452026, + "num_tokens": 472260867.0, + "step": 12379 + }, + { + "epoch": 1.5748632489505152, + "ewc_loss": 0.007864831946790218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.864832150517032e-05, + "grad_norm": 3.9081387519836426, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8649865388870239, + "num_tokens": 472300068.0, + "step": 12380 + }, + { + "epoch": 1.5749904592291057, + "ewc_loss": 0.007899964228272438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.899964111857116e-05, + "grad_norm": 3.9081461429595947, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8868621587753296, + "num_tokens": 472336459.0, + "step": 12381 + }, + { + "epoch": 1.5751176695076963, + "ewc_loss": 0.007875015959143639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.875016308389604e-05, + "grad_norm": 3.912302255630493, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8799238801002502, + "num_tokens": 472376696.0, + "step": 12382 + }, + { + "epoch": 1.5752448797862866, + "ewc_loss": 0.007887265644967556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.887265383033082e-05, + "grad_norm": 3.889223098754883, + "learning_rate": 1e-06, + "loss": 0.3175, + 
"mean_token_accuracy": 0.8889480829238892, + "num_tokens": 472414491.0, + "step": 12383 + }, + { + "epoch": 1.575372090064877, + "ewc_loss": 0.00786693673580885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.866937085054815e-05, + "grad_norm": 3.9339375495910645, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8740375638008118, + "num_tokens": 472452188.0, + "step": 12384 + }, + { + "epoch": 1.5754993003434676, + "ewc_loss": 0.007896779105067253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.896779425209388e-05, + "grad_norm": 3.9361329078674316, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8726968765258789, + "num_tokens": 472489690.0, + "step": 12385 + }, + { + "epoch": 1.5756265106220582, + "ewc_loss": 0.007866580039262772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.866579835535958e-05, + "grad_norm": 3.942235231399536, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8731271028518677, + "num_tokens": 472529912.0, + "step": 12386 + }, + { + "epoch": 1.5757537209006487, + "ewc_loss": 0.007860077545046806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.860077312216163e-05, + "grad_norm": 3.957383155822754, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8735530376434326, + "num_tokens": 472562534.0, + "step": 12387 + }, + { + "epoch": 1.5758809311792392, + "ewc_loss": 0.007879725657403469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.879725308157504e-05, + "grad_norm": 3.9387264251708984, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8741147518157959, + "num_tokens": 472598955.0, + "step": 12388 + }, + { + "epoch": 1.5760081414578297, + "ewc_loss": 0.007861700840294361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.861700578359887e-05, + "grad_norm": 3.877833366394043, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8793357610702515, + "num_tokens": 472638064.0, + "step": 12389 + }, + { + "epoch": 
1.5761353517364203, + "ewc_loss": 0.007839247584342957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.839247700758278e-05, + "grad_norm": 3.921135663986206, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8781695365905762, + "num_tokens": 472676823.0, + "step": 12390 + }, + { + "epoch": 1.5762625620150108, + "ewc_loss": 0.007892835885286331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.892835856182501e-05, + "grad_norm": 3.8686907291412354, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8758502006530762, + "num_tokens": 472718528.0, + "step": 12391 + }, + { + "epoch": 1.5763897722936013, + "ewc_loss": 0.007853837683796883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.853837450966239e-05, + "grad_norm": 3.9299027919769287, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8797186017036438, + "num_tokens": 472755634.0, + "step": 12392 + }, + { + "epoch": 1.5765169825721919, + "ewc_loss": 0.007902857847511768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.902857760200277e-05, + "grad_norm": 3.8379063606262207, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.887190580368042, + "num_tokens": 472798060.0, + "step": 12393 + }, + { + "epoch": 1.5766441928507824, + "ewc_loss": 0.007814016193151474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.814016134943813e-05, + "grad_norm": 3.8851449489593506, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8790327906608582, + "num_tokens": 472837628.0, + "step": 12394 + }, + { + "epoch": 1.576771403129373, + "ewc_loss": 0.007888703607022762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.888703839853406e-05, + "grad_norm": 3.9631576538085938, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8767104148864746, + "num_tokens": 472877446.0, + "step": 12395 + }, + { + "epoch": 1.5768986134079634, + "ewc_loss": 0.007886135019361973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.886134699219838e-05, + "grad_norm": 3.9148995876312256, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8707600831985474, + "num_tokens": 472918437.0, + "step": 12396 + }, + { + "epoch": 1.577025823686554, + "ewc_loss": 0.007841247133910656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.841247133910656e-05, + "grad_norm": 3.895350217819214, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.883633017539978, + "num_tokens": 472956242.0, + "step": 12397 + }, + { + "epoch": 1.5771530339651445, + "ewc_loss": 0.007837088778614998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.83708892413415e-05, + "grad_norm": 3.9712657928466797, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8862511515617371, + "num_tokens": 472985620.0, + "step": 12398 + }, + { + "epoch": 1.577280244243735, + "ewc_loss": 0.007888572290539742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.888572145020589e-05, + "grad_norm": 3.934115409851074, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8816676139831543, + "num_tokens": 473019517.0, + "step": 12399 + }, + { + "epoch": 1.5774074545223256, + "ewc_loss": 0.007847873494029045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.847873348509893e-05, + "grad_norm": 3.882096290588379, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8928828239440918, + "num_tokens": 473054624.0, + "step": 12400 + }, + { + "epoch": 1.5775346648009159, + "ewc_loss": 0.007814832963049412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.814833224983886e-05, + "grad_norm": 3.867152690887451, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8665791749954224, + "num_tokens": 473094140.0, + "step": 12401 + }, + { + "epoch": 1.5776618750795064, + "ewc_loss": 0.007839039899408817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.839040335966274e-05, + "grad_norm": 3.9340298175811768, + "learning_rate": 1e-06, + "loss": 0.4156, + 
"mean_token_accuracy": 0.8578547239303589, + "num_tokens": 473136886.0, + "step": 12402 + }, + { + "epoch": 1.577789085358097, + "ewc_loss": 0.007866617292165756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.866617670515552e-05, + "grad_norm": 3.944309949874878, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8645768165588379, + "num_tokens": 473173267.0, + "step": 12403 + }, + { + "epoch": 1.5779162956366874, + "ewc_loss": 0.007859052158892155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.859052129788324e-05, + "grad_norm": 3.900994300842285, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8563398718833923, + "num_tokens": 473216477.0, + "step": 12404 + }, + { + "epoch": 1.578043505915278, + "ewc_loss": 0.007822812534868717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.822812767699361e-05, + "grad_norm": 3.9422988891601562, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8674393892288208, + "num_tokens": 473256986.0, + "step": 12405 + }, + { + "epoch": 1.5781707161938685, + "ewc_loss": 0.007859938777983189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.859938341425732e-05, + "grad_norm": 3.877056837081909, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8832796812057495, + "num_tokens": 473297967.0, + "step": 12406 + }, + { + "epoch": 1.5782979264724588, + "ewc_loss": 0.007807739078998566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.807739166310057e-05, + "grad_norm": 3.917628049850464, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8682218194007874, + "num_tokens": 473333580.0, + "step": 12407 + }, + { + "epoch": 1.5784251367510493, + "ewc_loss": 0.007879287004470825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.87928729550913e-05, + "grad_norm": 3.8902883529663086, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8806474208831787, + "num_tokens": 473370553.0, + "step": 12408 + }, + { + "epoch": 
1.5785523470296399, + "ewc_loss": 0.00784360058605671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.843600178603083e-05, + "grad_norm": 3.904029369354248, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8606493473052979, + "num_tokens": 473410705.0, + "step": 12409 + }, + { + "epoch": 1.5786795573082304, + "ewc_loss": 0.007881826721131802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.88182660471648e-05, + "grad_norm": 4.001284122467041, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8792563676834106, + "num_tokens": 473443315.0, + "step": 12410 + }, + { + "epoch": 1.578806767586821, + "ewc_loss": 0.007929396815598011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.929396815598011e-05, + "grad_norm": 3.850961446762085, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8682051301002502, + "num_tokens": 473488767.0, + "step": 12411 + }, + { + "epoch": 1.5789339778654115, + "ewc_loss": 0.00782536156475544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825361535651609e-05, + "grad_norm": 3.919370651245117, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.89134681224823, + "num_tokens": 473525360.0, + "step": 12412 + }, + { + "epoch": 1.579061188144002, + "ewc_loss": 0.007920688949525356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.920688949525356e-05, + "grad_norm": 3.946908712387085, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8693372011184692, + "num_tokens": 473560625.0, + "step": 12413 + }, + { + "epoch": 1.5791883984225925, + "ewc_loss": 0.007908628322184086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.908628322184086e-05, + "grad_norm": 3.887801170349121, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8933718800544739, + "num_tokens": 473595484.0, + "step": 12414 + }, + { + "epoch": 1.579315608701183, + "ewc_loss": 0.007867539301514626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.867539534345269e-05, + "grad_norm": 3.9442665576934814, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8732813000679016, + "num_tokens": 473627269.0, + "step": 12415 + }, + { + "epoch": 1.5794428189797736, + "ewc_loss": 0.007937086746096611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.937086775200441e-05, + "grad_norm": 3.8740828037261963, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8615027070045471, + "num_tokens": 473669159.0, + "step": 12416 + }, + { + "epoch": 1.579570029258364, + "ewc_loss": 0.007875603623688221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.875603478169069e-05, + "grad_norm": 3.9149250984191895, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8849056959152222, + "num_tokens": 473707739.0, + "step": 12417 + }, + { + "epoch": 1.5796972395369546, + "ewc_loss": 0.00793746393173933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.937463669804856e-05, + "grad_norm": 3.8699848651885986, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8763161301612854, + "num_tokens": 473746908.0, + "step": 12418 + }, + { + "epoch": 1.5798244498155452, + "ewc_loss": 0.00790221057832241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.902210927568376e-05, + "grad_norm": 3.932526111602783, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8752546310424805, + "num_tokens": 473781480.0, + "step": 12419 + }, + { + "epoch": 1.5799516600941357, + "ewc_loss": 0.007941857911646366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.941858348203823e-05, + "grad_norm": 3.9545273780822754, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8769122958183289, + "num_tokens": 473814676.0, + "step": 12420 + }, + { + "epoch": 1.5800788703727262, + "ewc_loss": 0.007938699796795845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.938699855003506e-05, + "grad_norm": 3.9593236446380615, + "learning_rate": 1e-06, + "loss": 0.3786, + 
"mean_token_accuracy": 0.8716295957565308, + "num_tokens": 473855487.0, + "step": 12421 + }, + { + "epoch": 1.5802060806513167, + "ewc_loss": 0.007910347543656826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.910347630968317e-05, + "grad_norm": 4.001553058624268, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8876939415931702, + "num_tokens": 473885648.0, + "step": 12422 + }, + { + "epoch": 1.5803332909299073, + "ewc_loss": 0.007946440950036049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.946440746309236e-05, + "grad_norm": 3.9497485160827637, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8710230588912964, + "num_tokens": 473923862.0, + "step": 12423 + }, + { + "epoch": 1.5804605012084978, + "ewc_loss": 0.00790631677955389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90631675045006e-05, + "grad_norm": 3.9180233478546143, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.880564272403717, + "num_tokens": 473957725.0, + "step": 12424 + }, + { + "epoch": 1.5805877114870883, + "ewc_loss": 0.007911786437034607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911786815384403e-05, + "grad_norm": 3.8949227333068848, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8767157793045044, + "num_tokens": 473994939.0, + "step": 12425 + }, + { + "epoch": 1.5807149217656786, + "ewc_loss": 0.007910581305623055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.910581189207733e-05, + "grad_norm": 3.8768463134765625, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8712367415428162, + "num_tokens": 474035996.0, + "step": 12426 + }, + { + "epoch": 1.5808421320442692, + "ewc_loss": 0.007912303321063519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91230340837501e-05, + "grad_norm": 3.891144275665283, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8840236663818359, + "num_tokens": 474071693.0, + "step": 12427 + }, + { + "epoch": 
1.5809693423228597, + "ewc_loss": 0.00793460477143526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.934604946058244e-05, + "grad_norm": 3.8698670864105225, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8804057836532593, + "num_tokens": 474114415.0, + "step": 12428 + }, + { + "epoch": 1.5810965526014502, + "ewc_loss": 0.00789199024438858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.891990389907733e-05, + "grad_norm": 3.9268391132354736, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8706202507019043, + "num_tokens": 474152060.0, + "step": 12429 + }, + { + "epoch": 1.5812237628800407, + "ewc_loss": 0.00794057734310627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.940577779663727e-05, + "grad_norm": 3.880037546157837, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8798757791519165, + "num_tokens": 474190605.0, + "step": 12430 + }, + { + "epoch": 1.5813509731586313, + "ewc_loss": 0.007889894768595695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.889894914114848e-05, + "grad_norm": 3.912000894546509, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8695669770240784, + "num_tokens": 474229447.0, + "step": 12431 + }, + { + "epoch": 1.5814781834372216, + "ewc_loss": 0.00791285652667284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.912856381153688e-05, + "grad_norm": 3.8676934242248535, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8832592964172363, + "num_tokens": 474268088.0, + "step": 12432 + }, + { + "epoch": 1.581605393715812, + "ewc_loss": 0.007887103594839573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.887103856774047e-05, + "grad_norm": 3.9263460636138916, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8730403184890747, + "num_tokens": 474304856.0, + "step": 12433 + }, + { + "epoch": 1.5817326039944026, + "ewc_loss": 0.007928658276796341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.928658305900171e-05, + "grad_norm": 3.8620967864990234, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8784759640693665, + "num_tokens": 474349248.0, + "step": 12434 + }, + { + "epoch": 1.5818598142729932, + "ewc_loss": 0.007856047712266445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.856047886889428e-05, + "grad_norm": 3.927751064300537, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8862208127975464, + "num_tokens": 474384432.0, + "step": 12435 + }, + { + "epoch": 1.5819870245515837, + "ewc_loss": 0.007917379960417747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.917379844002426e-05, + "grad_norm": 3.9782872200012207, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8736717700958252, + "num_tokens": 474415881.0, + "step": 12436 + }, + { + "epoch": 1.5821142348301742, + "ewc_loss": 0.00792820192873478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.928202103357762e-05, + "grad_norm": 3.9694392681121826, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.872637927532196, + "num_tokens": 474452755.0, + "step": 12437 + }, + { + "epoch": 1.5822414451087647, + "ewc_loss": 0.007884172722697258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.884173101047054e-05, + "grad_norm": 3.8968214988708496, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8723674416542053, + "num_tokens": 474493937.0, + "step": 12438 + }, + { + "epoch": 1.5823686553873553, + "ewc_loss": 0.007874744944274426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.874744915170595e-05, + "grad_norm": 3.8975436687469482, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8817188739776611, + "num_tokens": 474534072.0, + "step": 12439 + }, + { + "epoch": 1.5824958656659458, + "ewc_loss": 0.007895405404269695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.895404996816069e-05, + "grad_norm": 3.9694533348083496, + "learning_rate": 1e-06, + "loss": 0.4004, + 
"mean_token_accuracy": 0.8684253692626953, + "num_tokens": 474569343.0, + "step": 12440 + }, + { + "epoch": 1.5826230759445363, + "ewc_loss": 0.007933001033961773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.933000597404316e-05, + "grad_norm": 3.914015293121338, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8648629784584045, + "num_tokens": 474606610.0, + "step": 12441 + }, + { + "epoch": 1.5827502862231269, + "ewc_loss": 0.007873748429119587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.873748108977452e-05, + "grad_norm": 3.922034978866577, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8664870262145996, + "num_tokens": 474645071.0, + "step": 12442 + }, + { + "epoch": 1.5828774965017174, + "ewc_loss": 0.007890034466981888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89003461250104e-05, + "grad_norm": 3.9232468605041504, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8621273636817932, + "num_tokens": 474683985.0, + "step": 12443 + }, + { + "epoch": 1.583004706780308, + "ewc_loss": 0.007903188467025757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.903188088675961e-05, + "grad_norm": 3.863508701324463, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8799892067909241, + "num_tokens": 474726102.0, + "step": 12444 + }, + { + "epoch": 1.5831319170588984, + "ewc_loss": 0.007861627265810966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.861627091187984e-05, + "grad_norm": 3.8934526443481445, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8769838809967041, + "num_tokens": 474766430.0, + "step": 12445 + }, + { + "epoch": 1.583259127337489, + "ewc_loss": 0.007909165695309639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.909165287856013e-05, + "grad_norm": 3.9323058128356934, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8746037483215332, + "num_tokens": 474800648.0, + "step": 12446 + }, + { + "epoch": 
1.5833863376160795, + "ewc_loss": 0.007911454886198044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911455031717196e-05, + "grad_norm": 3.8418128490448, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8784543871879578, + "num_tokens": 474845540.0, + "step": 12447 + }, + { + "epoch": 1.58351354789467, + "ewc_loss": 0.007852092385292053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.852092676330358e-05, + "grad_norm": 3.9406704902648926, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8736009001731873, + "num_tokens": 474884296.0, + "step": 12448 + }, + { + "epoch": 1.5836407581732606, + "ewc_loss": 0.00794697180390358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.946971891215071e-05, + "grad_norm": 3.8921759128570557, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8795657753944397, + "num_tokens": 474929344.0, + "step": 12449 + }, + { + "epoch": 1.5837679684518509, + "ewc_loss": 0.007844415493309498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.844415085855871e-05, + "grad_norm": 3.8746562004089355, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.88178551197052, + "num_tokens": 474968035.0, + "step": 12450 + }, + { + "epoch": 1.5838951787304414, + "ewc_loss": 0.007866366766393185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.866366649977863e-05, + "grad_norm": 3.9423775672912598, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8713109493255615, + "num_tokens": 475006189.0, + "step": 12451 + }, + { + "epoch": 1.584022389009032, + "ewc_loss": 0.007914899848401546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.914900197647512e-05, + "grad_norm": 3.953139543533325, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8706812858581543, + "num_tokens": 475039768.0, + "step": 12452 + }, + { + "epoch": 1.5841495992876224, + "ewc_loss": 0.007892275229096413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.892275607446209e-05, + "grad_norm": 3.8862998485565186, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.880591869354248, + "num_tokens": 475079938.0, + "step": 12453 + }, + { + "epoch": 1.584276809566213, + "ewc_loss": 0.007853765040636063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.853764691390097e-05, + "grad_norm": 3.9129812717437744, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8636751174926758, + "num_tokens": 475122405.0, + "step": 12454 + }, + { + "epoch": 1.5844040198448035, + "ewc_loss": 0.007898332551121712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.898332114564255e-05, + "grad_norm": 3.9539794921875, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8685275912284851, + "num_tokens": 475158237.0, + "step": 12455 + }, + { + "epoch": 1.5845312301233938, + "ewc_loss": 0.007901920937001705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90192061685957e-05, + "grad_norm": 3.9047088623046875, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8755072951316833, + "num_tokens": 475194966.0, + "step": 12456 + }, + { + "epoch": 1.5846584404019843, + "ewc_loss": 0.007865416817367077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86541640991345e-05, + "grad_norm": 3.909379243850708, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8755326867103577, + "num_tokens": 475234483.0, + "step": 12457 + }, + { + "epoch": 1.5847856506805749, + "ewc_loss": 0.007877007126808167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.877007010392845e-05, + "grad_norm": 3.891385316848755, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8707148432731628, + "num_tokens": 475275172.0, + "step": 12458 + }, + { + "epoch": 1.5849128609591654, + "ewc_loss": 0.007865333929657936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86533419159241e-05, + "grad_norm": 3.9114294052124023, + "learning_rate": 1e-06, + "loss": 0.3498, + 
"mean_token_accuracy": 0.8803375959396362, + "num_tokens": 475313550.0, + "step": 12459 + }, + { + "epoch": 1.585040071237756, + "ewc_loss": 0.007878423668444157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.878423639340326e-05, + "grad_norm": 3.832644462585449, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8814828395843506, + "num_tokens": 475357019.0, + "step": 12460 + }, + { + "epoch": 1.5851672815163464, + "ewc_loss": 0.00783135462552309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.831354741938412e-05, + "grad_norm": 3.9340271949768066, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8798091411590576, + "num_tokens": 475394788.0, + "step": 12461 + }, + { + "epoch": 1.585294491794937, + "ewc_loss": 0.007890183478593826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.890183769632131e-05, + "grad_norm": 3.9091947078704834, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8744553923606873, + "num_tokens": 475432354.0, + "step": 12462 + }, + { + "epoch": 1.5854217020735275, + "ewc_loss": 0.007860646583139896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.860646292101592e-05, + "grad_norm": 3.9465694427490234, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8784340620040894, + "num_tokens": 475467284.0, + "step": 12463 + }, + { + "epoch": 1.585548912352118, + "ewc_loss": 0.00786069966852665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.860700134187937e-05, + "grad_norm": 3.8817138671875, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8685169219970703, + "num_tokens": 475505946.0, + "step": 12464 + }, + { + "epoch": 1.5856761226307086, + "ewc_loss": 0.007818281650543213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.818281301297247e-05, + "grad_norm": 3.956430673599243, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8636683225631714, + "num_tokens": 475542799.0, + "step": 12465 + }, + { + "epoch": 
1.585803332909299, + "ewc_loss": 0.007885430008172989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.885430386522785e-05, + "grad_norm": 3.8789470195770264, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8765898942947388, + "num_tokens": 475582064.0, + "step": 12466 + }, + { + "epoch": 1.5859305431878896, + "ewc_loss": 0.007820325903594494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.820325845386833e-05, + "grad_norm": 4.0716705322265625, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8698432445526123, + "num_tokens": 475615521.0, + "step": 12467 + }, + { + "epoch": 1.5860577534664801, + "ewc_loss": 0.007964684627950191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.964684482431039e-05, + "grad_norm": 3.9138994216918945, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.883400559425354, + "num_tokens": 475650500.0, + "step": 12468 + }, + { + "epoch": 1.5861849637450707, + "ewc_loss": 0.007824174128472805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.824174099368975e-05, + "grad_norm": 3.851469039916992, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8751791715621948, + "num_tokens": 475695824.0, + "step": 12469 + }, + { + "epoch": 1.5863121740236612, + "ewc_loss": 0.007848778739571571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.848778477637097e-05, + "grad_norm": 3.9418249130249023, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.853607177734375, + "num_tokens": 475734015.0, + "step": 12470 + }, + { + "epoch": 1.5864393843022517, + "ewc_loss": 0.00791569147258997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.915691821835935e-05, + "grad_norm": 3.9172451496124268, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8730882406234741, + "num_tokens": 475768961.0, + "step": 12471 + }, + { + "epoch": 1.5865665945808423, + "ewc_loss": 0.007869986817240715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.869986438890919e-05, + "grad_norm": 3.8742587566375732, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8879502415657043, + "num_tokens": 475806551.0, + "step": 12472 + }, + { + "epoch": 1.5866938048594328, + "ewc_loss": 0.007858199067413807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.858199387555942e-05, + "grad_norm": 3.9268875122070312, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8680771589279175, + "num_tokens": 475844066.0, + "step": 12473 + }, + { + "epoch": 1.5868210151380233, + "ewc_loss": 0.007913296110928059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.913295848993585e-05, + "grad_norm": 3.9842443466186523, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8791617155075073, + "num_tokens": 475874554.0, + "step": 12474 + }, + { + "epoch": 1.5869482254166136, + "ewc_loss": 0.007932956330478191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.93295621406287e-05, + "grad_norm": 3.8722827434539795, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8822872638702393, + "num_tokens": 475915840.0, + "step": 12475 + }, + { + "epoch": 1.5870754356952042, + "ewc_loss": 0.007845748215913773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84574804129079e-05, + "grad_norm": 3.91853928565979, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8738943338394165, + "num_tokens": 475951516.0, + "step": 12476 + }, + { + "epoch": 1.5872026459737947, + "ewc_loss": 0.007927190512418747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.92719074524939e-05, + "grad_norm": 3.9622962474823, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8776302337646484, + "num_tokens": 475984224.0, + "step": 12477 + }, + { + "epoch": 1.5873298562523852, + "ewc_loss": 0.007939922623336315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.93992294347845e-05, + "grad_norm": 3.9780502319335938, + "learning_rate": 1e-06, + "loss": 0.4235, + 
"mean_token_accuracy": 0.8536999225616455, + "num_tokens": 476015758.0, + "step": 12478 + }, + { + "epoch": 1.5874570665309757, + "ewc_loss": 0.007952235639095306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952236046548933e-05, + "grad_norm": 3.927762031555176, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.873115062713623, + "num_tokens": 476051835.0, + "step": 12479 + }, + { + "epoch": 1.5875842768095663, + "ewc_loss": 0.007927975617349148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.927975821075961e-05, + "grad_norm": 3.9124276638031006, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8714768886566162, + "num_tokens": 476090667.0, + "step": 12480 + }, + { + "epoch": 1.5877114870881566, + "ewc_loss": 0.007953494787216187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953494787216187e-05, + "grad_norm": 3.874265432357788, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8730450868606567, + "num_tokens": 476130448.0, + "step": 12481 + }, + { + "epoch": 1.587838697366747, + "ewc_loss": 0.007932553067803383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932553126011044e-05, + "grad_norm": 3.8610737323760986, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8651210069656372, + "num_tokens": 476174495.0, + "step": 12482 + }, + { + "epoch": 1.5879659076453376, + "ewc_loss": 0.007941256277263165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94125662650913e-05, + "grad_norm": 3.889723539352417, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8841372728347778, + "num_tokens": 476213412.0, + "step": 12483 + }, + { + "epoch": 1.5880931179239282, + "ewc_loss": 0.007957241497933865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.957241905387491e-05, + "grad_norm": 3.8804104328155518, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8873347043991089, + "num_tokens": 476250441.0, + "step": 12484 + }, + { + "epoch": 
1.5882203282025187, + "ewc_loss": 0.007945907302200794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.945907418616116e-05, + "grad_norm": 3.8982105255126953, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8887053728103638, + "num_tokens": 476284466.0, + "step": 12485 + }, + { + "epoch": 1.5883475384811092, + "ewc_loss": 0.007971690036356449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.971689774421975e-05, + "grad_norm": 3.871621608734131, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8742626905441284, + "num_tokens": 476327760.0, + "step": 12486 + }, + { + "epoch": 1.5884747487596997, + "ewc_loss": 0.007933386601507664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.933386950753629e-05, + "grad_norm": 3.8829829692840576, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.870219349861145, + "num_tokens": 476369259.0, + "step": 12487 + }, + { + "epoch": 1.5886019590382903, + "ewc_loss": 0.007948788814246655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.948788697831333e-05, + "grad_norm": 3.9472885131835938, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8751683831214905, + "num_tokens": 476403915.0, + "step": 12488 + }, + { + "epoch": 1.5887291693168808, + "ewc_loss": 0.007976866327226162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976865890668705e-05, + "grad_norm": 3.886734962463379, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8734968900680542, + "num_tokens": 476440837.0, + "step": 12489 + }, + { + "epoch": 1.5888563795954713, + "ewc_loss": 0.00792510062456131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.925100362626836e-05, + "grad_norm": 3.9173054695129395, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8731018900871277, + "num_tokens": 476478478.0, + "step": 12490 + }, + { + "epoch": 1.5889835898740619, + "ewc_loss": 0.007960512302815914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.960512448335066e-05, + "grad_norm": 3.873276710510254, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8837160468101501, + "num_tokens": 476514675.0, + "step": 12491 + }, + { + "epoch": 1.5891108001526524, + "ewc_loss": 0.007915960624814034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.915960304671898e-05, + "grad_norm": 3.8726842403411865, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8724236488342285, + "num_tokens": 476559015.0, + "step": 12492 + }, + { + "epoch": 1.589238010431243, + "ewc_loss": 0.007925852201879025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.925851969048381e-05, + "grad_norm": 3.9022297859191895, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.859857976436615, + "num_tokens": 476601848.0, + "step": 12493 + }, + { + "epoch": 1.5893652207098334, + "ewc_loss": 0.007926370948553085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.926370744826272e-05, + "grad_norm": 3.9240453243255615, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8842155933380127, + "num_tokens": 476636794.0, + "step": 12494 + }, + { + "epoch": 1.589492430988424, + "ewc_loss": 0.007924867793917656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924867531983182e-05, + "grad_norm": 3.9477524757385254, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8709444999694824, + "num_tokens": 476671593.0, + "step": 12495 + }, + { + "epoch": 1.5896196412670145, + "ewc_loss": 0.007926525548100471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.926525722723454e-05, + "grad_norm": 3.8954122066497803, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8853375315666199, + "num_tokens": 476706776.0, + "step": 12496 + }, + { + "epoch": 1.589746851545605, + "ewc_loss": 0.007882592268288136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.882592035457492e-05, + "grad_norm": 3.900585412979126, + "learning_rate": 1e-06, + "loss": 0.3735, + 
"mean_token_accuracy": 0.873197078704834, + "num_tokens": 476746700.0, + "step": 12497 + }, + { + "epoch": 1.5898740618241956, + "ewc_loss": 0.007909860461950302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.909860141808167e-05, + "grad_norm": 3.9006755352020264, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8802766799926758, + "num_tokens": 476785837.0, + "step": 12498 + }, + { + "epoch": 1.5900012721027859, + "ewc_loss": 0.007907302118837833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90730191511102e-05, + "grad_norm": 3.925764560699463, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.880378007888794, + "num_tokens": 476818894.0, + "step": 12499 + }, + { + "epoch": 1.5901284823813764, + "ewc_loss": 0.007901444099843502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.901444041635841e-05, + "grad_norm": 3.897432327270508, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8581703901290894, + "num_tokens": 476861773.0, + "step": 12500 + }, + { + "epoch": 1.590255692659967, + "ewc_loss": 0.007906808517873287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.906808605184779e-05, + "grad_norm": 3.909156322479248, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8815267086029053, + "num_tokens": 476899522.0, + "step": 12501 + }, + { + "epoch": 1.5903829029385574, + "ewc_loss": 0.007914258167147636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91425773059018e-05, + "grad_norm": 3.8769376277923584, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8701035380363464, + "num_tokens": 476939923.0, + "step": 12502 + }, + { + "epoch": 1.590510113217148, + "ewc_loss": 0.007879450917243958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.879451004555449e-05, + "grad_norm": 3.9077794551849365, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8741308450698853, + "num_tokens": 476979867.0, + "step": 12503 + }, + { + "epoch": 
1.5906373234957385, + "ewc_loss": 0.007900667376816273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.900666969362646e-05, + "grad_norm": 3.90631365776062, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8826155662536621, + "num_tokens": 477016068.0, + "step": 12504 + }, + { + "epoch": 1.5907645337743288, + "ewc_loss": 0.007892407476902008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.892407302279025e-05, + "grad_norm": 3.9337573051452637, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8712828159332275, + "num_tokens": 477054526.0, + "step": 12505 + }, + { + "epoch": 1.5908917440529193, + "ewc_loss": 0.007907530292868614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.907530380180106e-05, + "grad_norm": 3.8771393299102783, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8695983290672302, + "num_tokens": 477098029.0, + "step": 12506 + }, + { + "epoch": 1.5910189543315099, + "ewc_loss": 0.007868497632443905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868497777963057e-05, + "grad_norm": 3.9026010036468506, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8821094036102295, + "num_tokens": 477137070.0, + "step": 12507 + }, + { + "epoch": 1.5911461646101004, + "ewc_loss": 0.007896815426647663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.896815804997459e-05, + "grad_norm": 3.8977787494659424, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8747764825820923, + "num_tokens": 477175640.0, + "step": 12508 + }, + { + "epoch": 1.591273374888691, + "ewc_loss": 0.007893031463027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.893031579442322e-05, + "grad_norm": 3.996433734893799, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8771256804466248, + "num_tokens": 477210935.0, + "step": 12509 + }, + { + "epoch": 1.5914005851672814, + "ewc_loss": 0.007937104441225529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.937104237498716e-05, + "grad_norm": 3.9096057415008545, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8813799619674683, + "num_tokens": 477251567.0, + "step": 12510 + }, + { + "epoch": 1.591527795445872, + "ewc_loss": 0.007846740074455738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.846740481909364e-05, + "grad_norm": 3.9341068267822266, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8784698843955994, + "num_tokens": 477287964.0, + "step": 12511 + }, + { + "epoch": 1.5916550057244625, + "ewc_loss": 0.007924290373921394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924290548544377e-05, + "grad_norm": 3.9473214149475098, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8495000600814819, + "num_tokens": 477328660.0, + "step": 12512 + }, + { + "epoch": 1.591782216003053, + "ewc_loss": 0.007886993698775768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.886993262218311e-05, + "grad_norm": 3.925670623779297, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8789457082748413, + "num_tokens": 477364111.0, + "step": 12513 + }, + { + "epoch": 1.5919094262816436, + "ewc_loss": 0.007867452688515186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86745295044966e-05, + "grad_norm": 3.8954148292541504, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8824477195739746, + "num_tokens": 477402691.0, + "step": 12514 + }, + { + "epoch": 1.592036636560234, + "ewc_loss": 0.007862946949899197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.862946949899197e-05, + "grad_norm": 3.903554916381836, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8896446228027344, + "num_tokens": 477437526.0, + "step": 12515 + }, + { + "epoch": 1.5921638468388246, + "ewc_loss": 0.007872934453189373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.872934656916186e-05, + "grad_norm": 3.89909029006958, + "learning_rate": 1e-06, + "loss": 0.3713, + 
"mean_token_accuracy": 0.8734039664268494, + "num_tokens": 477479175.0, + "step": 12516 + }, + { + "epoch": 1.5922910571174151, + "ewc_loss": 0.007854467257857323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.854466821299866e-05, + "grad_norm": 3.89601993560791, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8779213428497314, + "num_tokens": 477516896.0, + "step": 12517 + }, + { + "epoch": 1.5924182673960057, + "ewc_loss": 0.007862968370318413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.862968050176278e-05, + "grad_norm": 3.935070753097534, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8659741282463074, + "num_tokens": 477555543.0, + "step": 12518 + }, + { + "epoch": 1.5925454776745962, + "ewc_loss": 0.007876818999648094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.876818563090637e-05, + "grad_norm": 3.923496961593628, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8871670961380005, + "num_tokens": 477592251.0, + "step": 12519 + }, + { + "epoch": 1.5926726879531867, + "ewc_loss": 0.00786161795258522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.861618360038847e-05, + "grad_norm": 3.9478816986083984, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8622138500213623, + "num_tokens": 477630039.0, + "step": 12520 + }, + { + "epoch": 1.5927998982317773, + "ewc_loss": 0.007876159623265266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.876159361330792e-05, + "grad_norm": 3.9508466720581055, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8745054006576538, + "num_tokens": 477661028.0, + "step": 12521 + }, + { + "epoch": 1.5929271085103678, + "ewc_loss": 0.007869621738791466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.869621913414448e-05, + "grad_norm": 3.910627603530884, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8880126476287842, + "num_tokens": 477699612.0, + "step": 12522 + }, + { + "epoch": 
1.5930543187889583, + "ewc_loss": 0.007858045399188995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.858045864850283e-05, + "grad_norm": 3.9144816398620605, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.870678186416626, + "num_tokens": 477737300.0, + "step": 12523 + }, + { + "epoch": 1.5931815290675486, + "ewc_loss": 0.007870038971304893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.870038825785741e-05, + "grad_norm": 3.922537088394165, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8637259602546692, + "num_tokens": 477776031.0, + "step": 12524 + }, + { + "epoch": 1.5933087393461391, + "ewc_loss": 0.007888463325798512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.888463005656376e-05, + "grad_norm": 3.904204845428467, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8778835535049438, + "num_tokens": 477813640.0, + "step": 12525 + }, + { + "epoch": 1.5934359496247297, + "ewc_loss": 0.007867819629609585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.867819658713415e-05, + "grad_norm": 3.9478607177734375, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8712003827095032, + "num_tokens": 477848652.0, + "step": 12526 + }, + { + "epoch": 1.5935631599033202, + "ewc_loss": 0.007886471226811409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.886471576057374e-05, + "grad_norm": 3.892695426940918, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8712265491485596, + "num_tokens": 477883710.0, + "step": 12527 + }, + { + "epoch": 1.5936903701819107, + "ewc_loss": 0.007868118584156036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868118700571358e-05, + "grad_norm": 3.9434022903442383, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8679727911949158, + "num_tokens": 477920316.0, + "step": 12528 + }, + { + "epoch": 1.5938175804605013, + "ewc_loss": 0.00791126023977995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.911260036053136e-05, + "grad_norm": 3.8801703453063965, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8847442865371704, + "num_tokens": 477955523.0, + "step": 12529 + }, + { + "epoch": 1.5939447907390916, + "ewc_loss": 0.00787381362169981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.87381359259598e-05, + "grad_norm": 3.8965401649475098, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.866568922996521, + "num_tokens": 477995768.0, + "step": 12530 + }, + { + "epoch": 1.594072001017682, + "ewc_loss": 0.007908969186246395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90896883700043e-05, + "grad_norm": 3.890505075454712, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8758723735809326, + "num_tokens": 478037718.0, + "step": 12531 + }, + { + "epoch": 1.5941992112962726, + "ewc_loss": 0.007902862504124641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.902862853370607e-05, + "grad_norm": 3.8822176456451416, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.86275315284729, + "num_tokens": 478077388.0, + "step": 12532 + }, + { + "epoch": 1.5943264215748632, + "ewc_loss": 0.007900963537395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.900963828433305e-05, + "grad_norm": 3.9503915309906006, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8896013498306274, + "num_tokens": 478109494.0, + "step": 12533 + }, + { + "epoch": 1.5944536318534537, + "ewc_loss": 0.007952088490128517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952088344609365e-05, + "grad_norm": 3.9491448402404785, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8701180815696716, + "num_tokens": 478142420.0, + "step": 12534 + }, + { + "epoch": 1.5945808421320442, + "ewc_loss": 0.007939295843243599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.939295755932108e-05, + "grad_norm": 3.9216554164886475, + "learning_rate": 1e-06, + "loss": 0.3546, + 
"mean_token_accuracy": 0.8797855973243713, + "num_tokens": 478183848.0, + "step": 12535 + }, + { + "epoch": 1.5947080524106347, + "ewc_loss": 0.007916262373328209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.916262256912887e-05, + "grad_norm": 3.944394111633301, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8852229714393616, + "num_tokens": 478225242.0, + "step": 12536 + }, + { + "epoch": 1.5948352626892253, + "ewc_loss": 0.007939708419144154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.939708302728832e-05, + "grad_norm": 3.892265796661377, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.882603645324707, + "num_tokens": 478263167.0, + "step": 12537 + }, + { + "epoch": 1.5949624729678158, + "ewc_loss": 0.007913734763860703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91373458923772e-05, + "grad_norm": 4.002923965454102, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8750140070915222, + "num_tokens": 478297752.0, + "step": 12538 + }, + { + "epoch": 1.5950896832464063, + "ewc_loss": 0.007974185049533844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.97418542788364e-05, + "grad_norm": 3.970055103302002, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8755836486816406, + "num_tokens": 478332955.0, + "step": 12539 + }, + { + "epoch": 1.5952168935249968, + "ewc_loss": 0.007928796112537384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.928796549094841e-05, + "grad_norm": 3.9831135272979736, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8539943695068359, + "num_tokens": 478367298.0, + "step": 12540 + }, + { + "epoch": 1.5953441038035874, + "ewc_loss": 0.007945168763399124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.945168908918276e-05, + "grad_norm": 3.8427112102508545, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8787262439727783, + "num_tokens": 478411183.0, + "step": 12541 + }, + { + "epoch": 
1.595471314082178, + "ewc_loss": 0.007879217155277729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.879216718720272e-05, + "grad_norm": 3.9038026332855225, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8729103803634644, + "num_tokens": 478451184.0, + "step": 12542 + }, + { + "epoch": 1.5955985243607684, + "ewc_loss": 0.007953662425279617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953662861837074e-05, + "grad_norm": 3.916867256164551, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8719128370285034, + "num_tokens": 478490913.0, + "step": 12543 + }, + { + "epoch": 1.595725734639359, + "ewc_loss": 0.007934222929179668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.934222958283499e-05, + "grad_norm": 3.900238275527954, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8773993253707886, + "num_tokens": 478529769.0, + "step": 12544 + }, + { + "epoch": 1.5958529449179495, + "ewc_loss": 0.007923140190541744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.923140219645575e-05, + "grad_norm": 3.944932699203491, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8714300394058228, + "num_tokens": 478566665.0, + "step": 12545 + }, + { + "epoch": 1.59598015519654, + "ewc_loss": 0.007952051237225533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952051237225533e-05, + "grad_norm": 3.8698558807373047, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.9013605117797852, + "num_tokens": 478600814.0, + "step": 12546 + }, + { + "epoch": 1.5961073654751305, + "ewc_loss": 0.007905606180429459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.905606616986915e-05, + "grad_norm": 3.8916749954223633, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8774442672729492, + "num_tokens": 478641610.0, + "step": 12547 + }, + { + "epoch": 1.5962345757537209, + "ewc_loss": 0.007926829159259796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.926829130155966e-05, + "grad_norm": 3.8761966228485107, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8831275701522827, + "num_tokens": 478678183.0, + "step": 12548 + }, + { + "epoch": 1.5963617860323114, + "ewc_loss": 0.00791472289711237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.914722664281726e-05, + "grad_norm": 3.93740177154541, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8757773041725159, + "num_tokens": 478710881.0, + "step": 12549 + }, + { + "epoch": 1.596488996310902, + "ewc_loss": 0.007959583774209023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.959584036143497e-05, + "grad_norm": 3.8851654529571533, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8883554935455322, + "num_tokens": 478748550.0, + "step": 12550 + }, + { + "epoch": 1.5966162065894924, + "ewc_loss": 0.007894502021372318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.894502050476149e-05, + "grad_norm": 3.8994147777557373, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8650053143501282, + "num_tokens": 478790417.0, + "step": 12551 + }, + { + "epoch": 1.596743416868083, + "ewc_loss": 0.007926028221845627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.926028047222644e-05, + "grad_norm": 3.8814926147460938, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8821840286254883, + "num_tokens": 478828856.0, + "step": 12552 + }, + { + "epoch": 1.5968706271466735, + "ewc_loss": 0.007899080403149128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.899080810602754e-05, + "grad_norm": 3.867673635482788, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8719983100891113, + "num_tokens": 478870240.0, + "step": 12553 + }, + { + "epoch": 1.5969978374252638, + "ewc_loss": 0.00791300367563963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.913004083093256e-05, + "grad_norm": 3.927212715148926, + "learning_rate": 1e-06, + "loss": 0.3935, + 
"mean_token_accuracy": 0.8677318096160889, + "num_tokens": 478911920.0, + "step": 12554 + }, + { + "epoch": 1.5971250477038543, + "ewc_loss": 0.007934171706438065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9341720265802e-05, + "grad_norm": 3.9283413887023926, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8706013560295105, + "num_tokens": 478949083.0, + "step": 12555 + }, + { + "epoch": 1.5972522579824449, + "ewc_loss": 0.007903676480054855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.903676305431873e-05, + "grad_norm": 3.914513349533081, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8742817044258118, + "num_tokens": 478984087.0, + "step": 12556 + }, + { + "epoch": 1.5973794682610354, + "ewc_loss": 0.007905791513621807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.905791426310316e-05, + "grad_norm": 4.032298564910889, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.859098494052887, + "num_tokens": 479019364.0, + "step": 12557 + }, + { + "epoch": 1.597506678539626, + "ewc_loss": 0.007976100780069828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976101187523454e-05, + "grad_norm": 3.877277374267578, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8790422081947327, + "num_tokens": 479059002.0, + "step": 12558 + }, + { + "epoch": 1.5976338888182164, + "ewc_loss": 0.007849144749343395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.849145185900852e-05, + "grad_norm": 3.9598090648651123, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8750795722007751, + "num_tokens": 479092676.0, + "step": 12559 + }, + { + "epoch": 1.597761099096807, + "ewc_loss": 0.007981540635228157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.981540693435818e-05, + "grad_norm": 3.9549853801727295, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8798319101333618, + "num_tokens": 479125951.0, + "step": 12560 + }, + { + "epoch": 
1.5978883093753975, + "ewc_loss": 0.007939813658595085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.939813804114237e-05, + "grad_norm": 3.8570947647094727, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8901693224906921, + "num_tokens": 479166230.0, + "step": 12561 + }, + { + "epoch": 1.598015519653988, + "ewc_loss": 0.007887665182352066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.887665560701862e-05, + "grad_norm": 3.8518240451812744, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8717978596687317, + "num_tokens": 479205813.0, + "step": 12562 + }, + { + "epoch": 1.5981427299325786, + "ewc_loss": 0.007930540479719639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.93054059613496e-05, + "grad_norm": 3.887845754623413, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8722509741783142, + "num_tokens": 479244960.0, + "step": 12563 + }, + { + "epoch": 1.598269940211169, + "ewc_loss": 0.007954657077789307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.954657485242933e-05, + "grad_norm": 3.8675012588500977, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8810257911682129, + "num_tokens": 479289617.0, + "step": 12564 + }, + { + "epoch": 1.5983971504897596, + "ewc_loss": 0.007904681377112865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.904681115178391e-05, + "grad_norm": 3.91424298286438, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.875817596912384, + "num_tokens": 479326377.0, + "step": 12565 + }, + { + "epoch": 1.5985243607683501, + "ewc_loss": 0.007953450083732605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953449676278979e-05, + "grad_norm": 3.9359560012817383, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8748571276664734, + "num_tokens": 479360563.0, + "step": 12566 + }, + { + "epoch": 1.5986515710469407, + "ewc_loss": 0.007957208901643753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.957209163578227e-05, + "grad_norm": 3.8819785118103027, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8860703706741333, + "num_tokens": 479403042.0, + "step": 12567 + }, + { + "epoch": 1.5987787813255312, + "ewc_loss": 0.00790155865252018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.901559001766145e-05, + "grad_norm": 3.9079267978668213, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8822287917137146, + "num_tokens": 479437099.0, + "step": 12568 + }, + { + "epoch": 1.5989059916041217, + "ewc_loss": 0.00791837926954031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.918378832982853e-05, + "grad_norm": 3.909517526626587, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8738713264465332, + "num_tokens": 479479326.0, + "step": 12569 + }, + { + "epoch": 1.5990332018827123, + "ewc_loss": 0.007900211028754711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.900210766820237e-05, + "grad_norm": 3.968458652496338, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8585106134414673, + "num_tokens": 479515320.0, + "step": 12570 + }, + { + "epoch": 1.5991604121613028, + "ewc_loss": 0.007925131358206272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.925131649244577e-05, + "grad_norm": 3.90927791595459, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8647109270095825, + "num_tokens": 479559960.0, + "step": 12571 + }, + { + "epoch": 1.5992876224398933, + "ewc_loss": 0.007858967408537865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.858967001084238e-05, + "grad_norm": 3.899127244949341, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8751465082168579, + "num_tokens": 479599115.0, + "step": 12572 + }, + { + "epoch": 1.5994148327184836, + "ewc_loss": 0.007885720580816269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.885720697231591e-05, + "grad_norm": 3.9299299716949463, + "learning_rate": 1e-06, + "loss": 0.3325, + 
"mean_token_accuracy": 0.8826726675033569, + "num_tokens": 479636116.0, + "step": 12573 + }, + { + "epoch": 1.5995420429970741, + "ewc_loss": 0.007899422198534012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89942205301486e-05, + "grad_norm": 3.906322479248047, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8672752380371094, + "num_tokens": 479676880.0, + "step": 12574 + }, + { + "epoch": 1.5996692532756647, + "ewc_loss": 0.00787513516843319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.875135634094477e-05, + "grad_norm": 3.940120220184326, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8777379989624023, + "num_tokens": 479717424.0, + "step": 12575 + }, + { + "epoch": 1.5997964635542552, + "ewc_loss": 0.00789369735866785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89369732956402e-05, + "grad_norm": 3.890746831893921, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8732367753982544, + "num_tokens": 479756240.0, + "step": 12576 + }, + { + "epoch": 1.5999236738328457, + "ewc_loss": 0.007860155776143074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.860155892558396e-05, + "grad_norm": 4.280898571014404, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8592199087142944, + "num_tokens": 479791863.0, + "step": 12577 + }, + { + "epoch": 1.6000508841114363, + "ewc_loss": 0.00808035209774971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080352563410997e-05, + "grad_norm": 3.866413116455078, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8844559192657471, + "num_tokens": 479827252.0, + "step": 12578 + }, + { + "epoch": 1.6001780943900266, + "ewc_loss": 0.007737825158983469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.73782521719113e-05, + "grad_norm": 3.9062137603759766, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8717359900474548, + "num_tokens": 479863100.0, + "step": 12579 + }, + { + "epoch": 
1.600305304668617, + "ewc_loss": 0.00791750755161047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.917507173260674e-05, + "grad_norm": 3.9384207725524902, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8646089434623718, + "num_tokens": 479899684.0, + "step": 12580 + }, + { + "epoch": 1.6004325149472076, + "ewc_loss": 0.007905575446784496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.905575330369174e-05, + "grad_norm": 3.9149365425109863, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8763630986213684, + "num_tokens": 479937656.0, + "step": 12581 + }, + { + "epoch": 1.6005597252257981, + "ewc_loss": 0.007887360639870167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.887360698077828e-05, + "grad_norm": 3.951958179473877, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.871009349822998, + "num_tokens": 479973357.0, + "step": 12582 + }, + { + "epoch": 1.6006869355043887, + "ewc_loss": 0.007930577732622623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.930577703518793e-05, + "grad_norm": 3.8373219966888428, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8751029968261719, + "num_tokens": 480017862.0, + "step": 12583 + }, + { + "epoch": 1.6008141457829792, + "ewc_loss": 0.00787094421684742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.870943954912946e-05, + "grad_norm": 3.9060654640197754, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8811931610107422, + "num_tokens": 480057039.0, + "step": 12584 + }, + { + "epoch": 1.6009413560615697, + "ewc_loss": 0.007949876599013805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.949876453494653e-05, + "grad_norm": 3.9024040699005127, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8775248527526855, + "num_tokens": 480093396.0, + "step": 12585 + }, + { + "epoch": 1.6010685663401603, + "ewc_loss": 0.00793366227298975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.933662709547207e-05, + "grad_norm": 3.9324793815612793, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8719791173934937, + "num_tokens": 480125200.0, + "step": 12586 + }, + { + "epoch": 1.6011957766187508, + "ewc_loss": 0.007936538197100163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.936538167996332e-05, + "grad_norm": 3.9744150638580322, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8716540336608887, + "num_tokens": 480158085.0, + "step": 12587 + }, + { + "epoch": 1.6013229868973413, + "ewc_loss": 0.007982995361089706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.982995157362893e-05, + "grad_norm": 3.9382238388061523, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8900256752967834, + "num_tokens": 480192979.0, + "step": 12588 + }, + { + "epoch": 1.6014501971759318, + "ewc_loss": 0.007940849289298058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.940849172882736e-05, + "grad_norm": 3.922849416732788, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8843426704406738, + "num_tokens": 480227984.0, + "step": 12589 + }, + { + "epoch": 1.6015774074545224, + "ewc_loss": 0.00795139279216528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.951392763061449e-05, + "grad_norm": 3.901489496231079, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8917510509490967, + "num_tokens": 480264280.0, + "step": 12590 + }, + { + "epoch": 1.601704617733113, + "ewc_loss": 0.007924431003630161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924430974526331e-05, + "grad_norm": 3.975921869277954, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8629376888275146, + "num_tokens": 480300670.0, + "step": 12591 + }, + { + "epoch": 1.6018318280117034, + "ewc_loss": 0.007971586659550667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.971586455823854e-05, + "grad_norm": 3.9614593982696533, + "learning_rate": 1e-06, + "loss": 0.3605, + 
"mean_token_accuracy": 0.8741649389266968, + "num_tokens": 480334460.0, + "step": 12592 + }, + { + "epoch": 1.601959038290294, + "ewc_loss": 0.007945515215396881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.945515244500712e-05, + "grad_norm": 3.8711719512939453, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8844887614250183, + "num_tokens": 480376066.0, + "step": 12593 + }, + { + "epoch": 1.6020862485688845, + "ewc_loss": 0.00791150238364935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911502325441688e-05, + "grad_norm": 3.892761468887329, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8818339109420776, + "num_tokens": 480417575.0, + "step": 12594 + }, + { + "epoch": 1.602213458847475, + "ewc_loss": 0.007952984422445297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952984742587432e-05, + "grad_norm": 3.9308676719665527, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8645898103713989, + "num_tokens": 480454106.0, + "step": 12595 + }, + { + "epoch": 1.6023406691260655, + "ewc_loss": 0.007945333607494831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.945333345560357e-05, + "grad_norm": 3.9298088550567627, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8644202351570129, + "num_tokens": 480493031.0, + "step": 12596 + }, + { + "epoch": 1.6024678794046558, + "ewc_loss": 0.007948092184960842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.948092388687655e-05, + "grad_norm": 3.9148364067077637, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8776233196258545, + "num_tokens": 480528326.0, + "step": 12597 + }, + { + "epoch": 1.6025950896832464, + "ewc_loss": 0.007938283495604992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.938283670227975e-05, + "grad_norm": 3.9063491821289062, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8827813267707825, + "num_tokens": 480565399.0, + "step": 12598 + }, + { + "epoch": 
1.602722299961837, + "ewc_loss": 0.007934200577437878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.934200402814895e-05, + "grad_norm": 3.9059760570526123, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.891288161277771, + "num_tokens": 480600028.0, + "step": 12599 + }, + { + "epoch": 1.6028495102404274, + "ewc_loss": 0.00794962514191866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.949625432956964e-05, + "grad_norm": 3.9390926361083984, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.875686526298523, + "num_tokens": 480635759.0, + "step": 12600 + }, + { + "epoch": 1.602976720519018, + "ewc_loss": 0.007944001816213131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.944001845316961e-05, + "grad_norm": 3.957270622253418, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.88088059425354, + "num_tokens": 480669528.0, + "step": 12601 + }, + { + "epoch": 1.6031039307976085, + "ewc_loss": 0.007959498092532158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.959498179843649e-05, + "grad_norm": 3.9039499759674072, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8745368719100952, + "num_tokens": 480709425.0, + "step": 12602 + }, + { + "epoch": 1.6032311410761988, + "ewc_loss": 0.007931627333164215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.931626896606758e-05, + "grad_norm": 3.9225077629089355, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8618702292442322, + "num_tokens": 480745010.0, + "step": 12603 + }, + { + "epoch": 1.6033583513547893, + "ewc_loss": 0.007964341901242733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.964341784827411e-05, + "grad_norm": 3.9132938385009766, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8825284242630005, + "num_tokens": 480780320.0, + "step": 12604 + }, + { + "epoch": 1.6034855616333799, + "ewc_loss": 0.007938665337860584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.938664930406958e-05, + "grad_norm": 3.8328349590301514, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8825384974479675, + "num_tokens": 480819335.0, + "step": 12605 + }, + { + "epoch": 1.6036127719119704, + "ewc_loss": 0.007923566736280918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.923566590761766e-05, + "grad_norm": 3.9149749279022217, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8887580633163452, + "num_tokens": 480852702.0, + "step": 12606 + }, + { + "epoch": 1.603739982190561, + "ewc_loss": 0.00799511931836605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.995119085535407e-05, + "grad_norm": 3.947901487350464, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8712871670722961, + "num_tokens": 480890258.0, + "step": 12607 + }, + { + "epoch": 1.6038671924691514, + "ewc_loss": 0.00796213187277317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.962132076499984e-05, + "grad_norm": 3.904275417327881, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8751535415649414, + "num_tokens": 480929555.0, + "step": 12608 + }, + { + "epoch": 1.603994402747742, + "ewc_loss": 0.00792607106268406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.926070975372568e-05, + "grad_norm": 3.885577917098999, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8958669304847717, + "num_tokens": 480970295.0, + "step": 12609 + }, + { + "epoch": 1.6041216130263325, + "ewc_loss": 0.007923374883830547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.923375233076513e-05, + "grad_norm": 3.9482932090759277, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8757736682891846, + "num_tokens": 481011213.0, + "step": 12610 + }, + { + "epoch": 1.604248823304923, + "ewc_loss": 0.007954173721373081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.95417363406159e-05, + "grad_norm": 3.9304518699645996, + "learning_rate": 1e-06, + "loss": 0.373, + 
"mean_token_accuracy": 0.8711349368095398, + "num_tokens": 481045273.0, + "step": 12611 + }, + { + "epoch": 1.6043760335835135, + "ewc_loss": 0.007910213433206081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.910213753348216e-05, + "grad_norm": 3.921761989593506, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8561494946479797, + "num_tokens": 481084567.0, + "step": 12612 + }, + { + "epoch": 1.604503243862104, + "ewc_loss": 0.007932445034384727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932444714242592e-05, + "grad_norm": 3.911243200302124, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8781039714813232, + "num_tokens": 481119455.0, + "step": 12613 + }, + { + "epoch": 1.6046304541406946, + "ewc_loss": 0.007950845174491405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.950845611048862e-05, + "grad_norm": 3.9439609050750732, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8596636652946472, + "num_tokens": 481154718.0, + "step": 12614 + }, + { + "epoch": 1.6047576644192851, + "ewc_loss": 0.007955744862556458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.955744513310492e-05, + "grad_norm": 3.881013870239258, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8697791695594788, + "num_tokens": 481193716.0, + "step": 12615 + }, + { + "epoch": 1.6048848746978757, + "ewc_loss": 0.007923655211925507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.923655357444659e-05, + "grad_norm": 3.9737370014190674, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8520735502243042, + "num_tokens": 481231378.0, + "step": 12616 + }, + { + "epoch": 1.6050120849764662, + "ewc_loss": 0.007988963276147842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.988963625393808e-05, + "grad_norm": 3.8652424812316895, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8738070726394653, + "num_tokens": 481267907.0, + "step": 12617 + }, + { + "epoch": 
1.6051392952550567, + "ewc_loss": 0.007903221063315868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.903220830485225e-05, + "grad_norm": 3.8680827617645264, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8769351243972778, + "num_tokens": 481309899.0, + "step": 12618 + }, + { + "epoch": 1.6052665055336472, + "ewc_loss": 0.007944988086819649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.944988465169445e-05, + "grad_norm": 3.873345136642456, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8847780227661133, + "num_tokens": 481348845.0, + "step": 12619 + }, + { + "epoch": 1.6053937158122378, + "ewc_loss": 0.007938395254313946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.938394992379472e-05, + "grad_norm": 3.918774366378784, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8651005029678345, + "num_tokens": 481384999.0, + "step": 12620 + }, + { + "epoch": 1.6055209260908283, + "ewc_loss": 0.007982996292412281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.982995884958655e-05, + "grad_norm": 3.94606351852417, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8770884275436401, + "num_tokens": 481417287.0, + "step": 12621 + }, + { + "epoch": 1.6056481363694186, + "ewc_loss": 0.00796782597899437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.967826240928844e-05, + "grad_norm": 3.8975226879119873, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8779557943344116, + "num_tokens": 481455943.0, + "step": 12622 + }, + { + "epoch": 1.6057753466480091, + "ewc_loss": 0.007910777814686298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.910777640063316e-05, + "grad_norm": 3.879814863204956, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8896315693855286, + "num_tokens": 481492017.0, + "step": 12623 + }, + { + "epoch": 1.6059025569265997, + "ewc_loss": 0.007929801009595394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.92980135884136e-05, + "grad_norm": 3.881098747253418, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8693828582763672, + "num_tokens": 481535115.0, + "step": 12624 + }, + { + "epoch": 1.6060297672051902, + "ewc_loss": 0.007929660379886627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.929660205263644e-05, + "grad_norm": 3.860499620437622, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8617034554481506, + "num_tokens": 481580048.0, + "step": 12625 + }, + { + "epoch": 1.6061569774837807, + "ewc_loss": 0.007914036512374878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.914036541478708e-05, + "grad_norm": 3.908512830734253, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8863775730133057, + "num_tokens": 481616748.0, + "step": 12626 + }, + { + "epoch": 1.6062841877623713, + "ewc_loss": 0.007955944165587425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.95594387454912e-05, + "grad_norm": 3.9292142391204834, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8481918573379517, + "num_tokens": 481659123.0, + "step": 12627 + }, + { + "epoch": 1.6064113980409616, + "ewc_loss": 0.007935861125588417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935861503938213e-05, + "grad_norm": 3.869452714920044, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8654488325119019, + "num_tokens": 481704791.0, + "step": 12628 + }, + { + "epoch": 1.606538608319552, + "ewc_loss": 0.007885916158556938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.885915692895651e-05, + "grad_norm": 3.932553768157959, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8635534048080444, + "num_tokens": 481742913.0, + "step": 12629 + }, + { + "epoch": 1.6066658185981426, + "ewc_loss": 0.0079539455473423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953945168992504e-05, + "grad_norm": 3.904527187347412, + "learning_rate": 1e-06, + "loss": 0.3964, + 
"mean_token_accuracy": 0.862916111946106, + "num_tokens": 481783549.0, + "step": 12630 + }, + { + "epoch": 1.6067930288767331, + "ewc_loss": 0.007903527468442917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.903527148300782e-05, + "grad_norm": 3.9033427238464355, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8807435035705566, + "num_tokens": 481818486.0, + "step": 12631 + }, + { + "epoch": 1.6069202391553237, + "ewc_loss": 0.007917960174381733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.917960465420038e-05, + "grad_norm": 3.965754508972168, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8749520182609558, + "num_tokens": 481849969.0, + "step": 12632 + }, + { + "epoch": 1.6070474494339142, + "ewc_loss": 0.00796037819236517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.960378570714965e-05, + "grad_norm": 3.958966016769409, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8590455055236816, + "num_tokens": 481886108.0, + "step": 12633 + }, + { + "epoch": 1.6071746597125047, + "ewc_loss": 0.007920135743916035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.920135976746678e-05, + "grad_norm": 3.8792684078216553, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8866943120956421, + "num_tokens": 481919389.0, + "step": 12634 + }, + { + "epoch": 1.6073018699910953, + "ewc_loss": 0.007913606241345406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.913605804787949e-05, + "grad_norm": 3.8632192611694336, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8885015249252319, + "num_tokens": 481958531.0, + "step": 12635 + }, + { + "epoch": 1.6074290802696858, + "ewc_loss": 0.007924892008304596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924891542643309e-05, + "grad_norm": 3.876572370529175, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8705247044563293, + "num_tokens": 481999864.0, + "step": 12636 + }, + { + "epoch": 
1.6075562905482763, + "ewc_loss": 0.007940351963043213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.940351497381926e-05, + "grad_norm": 3.9089488983154297, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8972381949424744, + "num_tokens": 482033890.0, + "step": 12637 + }, + { + "epoch": 1.6076835008268668, + "ewc_loss": 0.0079440176486969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.944017852423713e-05, + "grad_norm": 3.8937430381774902, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8781232237815857, + "num_tokens": 482071186.0, + "step": 12638 + }, + { + "epoch": 1.6078107111054574, + "ewc_loss": 0.007915315218269825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91531492723152e-05, + "grad_norm": 3.903886556625366, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8725561499595642, + "num_tokens": 482107218.0, + "step": 12639 + }, + { + "epoch": 1.607937921384048, + "ewc_loss": 0.007942344062030315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94234438217245e-05, + "grad_norm": 3.984395742416382, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8716443777084351, + "num_tokens": 482146101.0, + "step": 12640 + }, + { + "epoch": 1.6080651316626384, + "ewc_loss": 0.007986707612872124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.986708078533411e-05, + "grad_norm": 3.908613443374634, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.887108325958252, + "num_tokens": 482178531.0, + "step": 12641 + }, + { + "epoch": 1.608192341941229, + "ewc_loss": 0.007893409579992294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89340992923826e-05, + "grad_norm": 3.8838322162628174, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8837955594062805, + "num_tokens": 482217019.0, + "step": 12642 + }, + { + "epoch": 1.6083195522198195, + "ewc_loss": 0.007909822277724743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.909822306828573e-05, + "grad_norm": 3.921435594558716, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8759642243385315, + "num_tokens": 482250547.0, + "step": 12643 + }, + { + "epoch": 1.60844676249841, + "ewc_loss": 0.007935372181236744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.93537255958654e-05, + "grad_norm": 3.9395596981048584, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.871722936630249, + "num_tokens": 482287009.0, + "step": 12644 + }, + { + "epoch": 1.6085739727770005, + "ewc_loss": 0.007938224822282791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9382247349713e-05, + "grad_norm": 3.9082772731781006, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8620860576629639, + "num_tokens": 482327373.0, + "step": 12645 + }, + { + "epoch": 1.6087011830555908, + "ewc_loss": 0.007906299084424973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.906299288151786e-05, + "grad_norm": 3.943530559539795, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.862015962600708, + "num_tokens": 482363409.0, + "step": 12646 + }, + { + "epoch": 1.6088283933341814, + "ewc_loss": 0.007959156297147274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.959156209835783e-05, + "grad_norm": 3.990699529647827, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8667291402816772, + "num_tokens": 482396499.0, + "step": 12647 + }, + { + "epoch": 1.608955603612772, + "ewc_loss": 0.007963685318827629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.963685493450612e-05, + "grad_norm": 3.8760292530059814, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8771566152572632, + "num_tokens": 482435059.0, + "step": 12648 + }, + { + "epoch": 1.6090828138913624, + "ewc_loss": 0.007912078872323036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.912078581284732e-05, + "grad_norm": 3.963589668273926, + "learning_rate": 1e-06, + "loss": 0.3837, + 
"mean_token_accuracy": 0.8696667551994324, + "num_tokens": 482472793.0, + "step": 12649 + }, + { + "epoch": 1.609210024169953, + "ewc_loss": 0.007991653867065907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.991653546923772e-05, + "grad_norm": 3.9118049144744873, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8769487142562866, + "num_tokens": 482508857.0, + "step": 12650 + }, + { + "epoch": 1.6093372344485435, + "ewc_loss": 0.00794626958668232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.946269761305302e-05, + "grad_norm": 3.983933210372925, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8599799871444702, + "num_tokens": 482548526.0, + "step": 12651 + }, + { + "epoch": 1.6094644447271338, + "ewc_loss": 0.00800560787320137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.005608106032014e-05, + "grad_norm": 3.945255994796753, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8676244616508484, + "num_tokens": 482586461.0, + "step": 12652 + }, + { + "epoch": 1.6095916550057243, + "ewc_loss": 0.007956246845424175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.956246554385871e-05, + "grad_norm": 3.9457452297210693, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8687924742698669, + "num_tokens": 482624642.0, + "step": 12653 + }, + { + "epoch": 1.6097188652843148, + "ewc_loss": 0.00797434151172638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.974341133376583e-05, + "grad_norm": 3.8760101795196533, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8750175833702087, + "num_tokens": 482664096.0, + "step": 12654 + }, + { + "epoch": 1.6098460755629054, + "ewc_loss": 0.007935043424367905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935042958706617e-05, + "grad_norm": 3.9503705501556396, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8849435448646545, + "num_tokens": 482701230.0, + "step": 12655 + }, + { + "epoch": 
1.609973285841496, + "ewc_loss": 0.007993734441697598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.993734470801428e-05, + "grad_norm": 3.906527042388916, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8753857612609863, + "num_tokens": 482741611.0, + "step": 12656 + }, + { + "epoch": 1.6101004961200864, + "ewc_loss": 0.007925630547106266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.92563077993691e-05, + "grad_norm": 3.906641721725464, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8858118653297424, + "num_tokens": 482779482.0, + "step": 12657 + }, + { + "epoch": 1.610227706398677, + "ewc_loss": 0.007955515757203102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.955515320645645e-05, + "grad_norm": 3.9110374450683594, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8823286294937134, + "num_tokens": 482816279.0, + "step": 12658 + }, + { + "epoch": 1.6103549166772675, + "ewc_loss": 0.007940491661429405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94049192336388e-05, + "grad_norm": 3.8831698894500732, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8658007979393005, + "num_tokens": 482859941.0, + "step": 12659 + }, + { + "epoch": 1.610482126955858, + "ewc_loss": 0.007924509234726429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924509554868564e-05, + "grad_norm": 3.945415496826172, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8798525333404541, + "num_tokens": 482894348.0, + "step": 12660 + }, + { + "epoch": 1.6106093372344485, + "ewc_loss": 0.007948074489831924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94807419879362e-05, + "grad_norm": 3.99245023727417, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8699397444725037, + "num_tokens": 482930882.0, + "step": 12661 + }, + { + "epoch": 1.610736547513039, + "ewc_loss": 0.007940559647977352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.940559589769691e-05, + "grad_norm": 3.9523096084594727, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8576820492744446, + "num_tokens": 482969515.0, + "step": 12662 + }, + { + "epoch": 1.6108637577916296, + "ewc_loss": 0.007896313443779945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.896313036326319e-05, + "grad_norm": 3.9068989753723145, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.871807336807251, + "num_tokens": 483008074.0, + "step": 12663 + }, + { + "epoch": 1.6109909680702201, + "ewc_loss": 0.007890546694397926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.890546839917079e-05, + "grad_norm": 3.983682632446289, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.86795973777771, + "num_tokens": 483038920.0, + "step": 12664 + }, + { + "epoch": 1.6111181783488107, + "ewc_loss": 0.007966984063386917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.966983685037121e-05, + "grad_norm": 3.861511707305908, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8861516118049622, + "num_tokens": 483075559.0, + "step": 12665 + }, + { + "epoch": 1.6112453886274012, + "ewc_loss": 0.00786449946463108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.864499639254063e-05, + "grad_norm": 3.91440486907959, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8777207136154175, + "num_tokens": 483114378.0, + "step": 12666 + }, + { + "epoch": 1.6113725989059917, + "ewc_loss": 0.007951224222779274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9512239608448e-05, + "grad_norm": 3.9421608448028564, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8804064989089966, + "num_tokens": 483147911.0, + "step": 12667 + }, + { + "epoch": 1.6114998091845822, + "ewc_loss": 0.007956947200000286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.956947229104117e-05, + "grad_norm": 3.859287738800049, + "learning_rate": 1e-06, + "loss": 0.3153, + 
"mean_token_accuracy": 0.8890149593353271, + "num_tokens": 483188275.0, + "step": 12668 + }, + { + "epoch": 1.6116270194631728, + "ewc_loss": 0.00788490753620863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.884907245170325e-05, + "grad_norm": 3.837280035018921, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8868087530136108, + "num_tokens": 483230804.0, + "step": 12669 + }, + { + "epoch": 1.6117542297417633, + "ewc_loss": 0.007928412407636642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.928412378532812e-05, + "grad_norm": 3.989985466003418, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8782232999801636, + "num_tokens": 483262368.0, + "step": 12670 + }, + { + "epoch": 1.6118814400203536, + "ewc_loss": 0.008002514019608498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.002514368854463e-05, + "grad_norm": 3.921444892883301, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8632590174674988, + "num_tokens": 483300894.0, + "step": 12671 + }, + { + "epoch": 1.6120086502989441, + "ewc_loss": 0.00790082011371851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.900820492068306e-05, + "grad_norm": 3.923008680343628, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8800531625747681, + "num_tokens": 483337377.0, + "step": 12672 + }, + { + "epoch": 1.6121358605775347, + "ewc_loss": 0.007943042553961277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94304214650765e-05, + "grad_norm": 3.8999640941619873, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.874988317489624, + "num_tokens": 483376572.0, + "step": 12673 + }, + { + "epoch": 1.6122630708561252, + "ewc_loss": 0.007926017977297306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.926017860881984e-05, + "grad_norm": 3.9166855812072754, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8630038499832153, + "num_tokens": 483415584.0, + "step": 12674 + }, + { + "epoch": 
1.6123902811347157, + "ewc_loss": 0.00794217549264431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.942175579955801e-05, + "grad_norm": 3.866900682449341, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8766365647315979, + "num_tokens": 483459584.0, + "step": 12675 + }, + { + "epoch": 1.6125174914133062, + "ewc_loss": 0.007909983396530151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.909983105491847e-05, + "grad_norm": 3.870856761932373, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8703383803367615, + "num_tokens": 483504260.0, + "step": 12676 + }, + { + "epoch": 1.6126447016918966, + "ewc_loss": 0.007932014763355255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932014705147594e-05, + "grad_norm": 3.86669921875, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8784242868423462, + "num_tokens": 483547271.0, + "step": 12677 + }, + { + "epoch": 1.612771911970487, + "ewc_loss": 0.007911043241620064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911043212516233e-05, + "grad_norm": 3.951321601867676, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8775231838226318, + "num_tokens": 483582255.0, + "step": 12678 + }, + { + "epoch": 1.6128991222490776, + "ewc_loss": 0.00795307382941246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953074236866087e-05, + "grad_norm": 3.897138833999634, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.870798647403717, + "num_tokens": 483622290.0, + "step": 12679 + }, + { + "epoch": 1.6130263325276681, + "ewc_loss": 0.007892080582678318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.892080611782148e-05, + "grad_norm": 3.8674862384796143, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8864678144454956, + "num_tokens": 483657253.0, + "step": 12680 + }, + { + "epoch": 1.6131535428062587, + "ewc_loss": 0.007896269671618938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.896269380580634e-05, + "grad_norm": 3.9175004959106445, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8794313669204712, + "num_tokens": 483693062.0, + "step": 12681 + }, + { + "epoch": 1.6132807530848492, + "ewc_loss": 0.007935757748782635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935758185340092e-05, + "grad_norm": 3.880307912826538, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8711600303649902, + "num_tokens": 483735845.0, + "step": 12682 + }, + { + "epoch": 1.6134079633634397, + "ewc_loss": 0.007876148447394371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.876148447394371e-05, + "grad_norm": 3.8158655166625977, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8793045282363892, + "num_tokens": 483783833.0, + "step": 12683 + }, + { + "epoch": 1.6135351736420303, + "ewc_loss": 0.007859566248953342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.859566539991647e-05, + "grad_norm": 3.9748833179473877, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8743900656700134, + "num_tokens": 483820175.0, + "step": 12684 + }, + { + "epoch": 1.6136623839206208, + "ewc_loss": 0.007964633405208588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96463355072774e-05, + "grad_norm": 3.8437063694000244, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8858629465103149, + "num_tokens": 483863454.0, + "step": 12685 + }, + { + "epoch": 1.6137895941992113, + "ewc_loss": 0.007813768461346626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.81376802478917e-05, + "grad_norm": 3.8815057277679443, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8938349485397339, + "num_tokens": 483900657.0, + "step": 12686 + }, + { + "epoch": 1.6139168044778018, + "ewc_loss": 0.007879790849983692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.879790791776031e-05, + "grad_norm": 3.9440925121307373, + "learning_rate": 1e-06, + "loss": 0.3768, + 
"mean_token_accuracy": 0.8684927225112915, + "num_tokens": 483935912.0, + "step": 12687 + }, + { + "epoch": 1.6140440147563924, + "ewc_loss": 0.00788284931331873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.882848876761273e-05, + "grad_norm": 3.963106632232666, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8752713203430176, + "num_tokens": 483968594.0, + "step": 12688 + }, + { + "epoch": 1.614171225034983, + "ewc_loss": 0.007884891703724861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.884891238063574e-05, + "grad_norm": 3.8598012924194336, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8843086957931519, + "num_tokens": 484010866.0, + "step": 12689 + }, + { + "epoch": 1.6142984353135734, + "ewc_loss": 0.007822781801223755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.82278148108162e-05, + "grad_norm": 3.9280893802642822, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.869774341583252, + "num_tokens": 484051355.0, + "step": 12690 + }, + { + "epoch": 1.614425645592164, + "ewc_loss": 0.007889007218182087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.889007247285917e-05, + "grad_norm": 3.884230852127075, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.887019157409668, + "num_tokens": 484087049.0, + "step": 12691 + }, + { + "epoch": 1.6145528558707545, + "ewc_loss": 0.007820045575499535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.820045721018687e-05, + "grad_norm": 3.944650411605835, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8643783330917358, + "num_tokens": 484124914.0, + "step": 12692 + }, + { + "epoch": 1.614680066149345, + "ewc_loss": 0.007894700393080711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.894699956523255e-05, + "grad_norm": 3.942300796508789, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8914097547531128, + "num_tokens": 484154524.0, + "step": 12693 + }, + { + "epoch": 
1.6148072764279355, + "ewc_loss": 0.007858225144445896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.858225581003353e-05, + "grad_norm": 3.9092752933502197, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8735837936401367, + "num_tokens": 484195091.0, + "step": 12694 + }, + { + "epoch": 1.6149344867065258, + "ewc_loss": 0.00784887932240963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.848878885852173e-05, + "grad_norm": 3.8789618015289307, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8773746490478516, + "num_tokens": 484236787.0, + "step": 12695 + }, + { + "epoch": 1.6150616969851164, + "ewc_loss": 0.00782941747456789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.829417154425755e-05, + "grad_norm": 3.8852920532226562, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8718730211257935, + "num_tokens": 484281748.0, + "step": 12696 + }, + { + "epoch": 1.615188907263707, + "ewc_loss": 0.007843833416700363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.843833009246737e-05, + "grad_norm": 3.8674421310424805, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8827630877494812, + "num_tokens": 484321181.0, + "step": 12697 + }, + { + "epoch": 1.6153161175422974, + "ewc_loss": 0.007836339063942432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.836339500499889e-05, + "grad_norm": 3.902940273284912, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8736968040466309, + "num_tokens": 484360476.0, + "step": 12698 + }, + { + "epoch": 1.615443327820888, + "ewc_loss": 0.00786794163286686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.867941167205572e-05, + "grad_norm": 3.9498209953308105, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8773418664932251, + "num_tokens": 484395065.0, + "step": 12699 + }, + { + "epoch": 1.6155705380994785, + "ewc_loss": 0.007874307222664356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.874307630117983e-05, + "grad_norm": 3.877255439758301, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.877035915851593, + "num_tokens": 484439309.0, + "step": 12700 + }, + { + "epoch": 1.6156977483780688, + "ewc_loss": 0.007793245371431112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.793245458742604e-05, + "grad_norm": 3.9525322914123535, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8757991790771484, + "num_tokens": 484473018.0, + "step": 12701 + }, + { + "epoch": 1.6158249586566593, + "ewc_loss": 0.007878909818828106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.878909673308954e-05, + "grad_norm": 3.90262508392334, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.88199782371521, + "num_tokens": 484518306.0, + "step": 12702 + }, + { + "epoch": 1.6159521689352498, + "ewc_loss": 0.007817520759999752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.817520963726565e-05, + "grad_norm": 3.944793462753296, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8623411655426025, + "num_tokens": 484556348.0, + "step": 12703 + }, + { + "epoch": 1.6160793792138404, + "ewc_loss": 0.007861444726586342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.861445192247629e-05, + "grad_norm": 3.912137269973755, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8867111206054688, + "num_tokens": 484593136.0, + "step": 12704 + }, + { + "epoch": 1.616206589492431, + "ewc_loss": 0.007835605181753635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.835605356376618e-05, + "grad_norm": 3.9431328773498535, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8865031003952026, + "num_tokens": 484628025.0, + "step": 12705 + }, + { + "epoch": 1.6163337997710214, + "ewc_loss": 0.00785642210394144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.856421871110797e-05, + "grad_norm": 3.905003786087036, + "learning_rate": 1e-06, + "loss": 0.347, + 
"mean_token_accuracy": 0.8819270133972168, + "num_tokens": 484666703.0, + "step": 12706 + }, + { + "epoch": 1.616461010049612, + "ewc_loss": 0.0078219473361969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.821946928743273e-05, + "grad_norm": 3.906346321105957, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8838963508605957, + "num_tokens": 484703417.0, + "step": 12707 + }, + { + "epoch": 1.6165882203282025, + "ewc_loss": 0.007837790064513683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.837790326448157e-05, + "grad_norm": 3.9731788635253906, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8888654112815857, + "num_tokens": 484741830.0, + "step": 12708 + }, + { + "epoch": 1.616715430606793, + "ewc_loss": 0.007873413152992725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8734134149272e-05, + "grad_norm": 3.972705841064453, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.873998761177063, + "num_tokens": 484780140.0, + "step": 12709 + }, + { + "epoch": 1.6168426408853835, + "ewc_loss": 0.007831632159650326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.831631955923513e-05, + "grad_norm": 3.9349794387817383, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8738671541213989, + "num_tokens": 484813856.0, + "step": 12710 + }, + { + "epoch": 1.616969851163974, + "ewc_loss": 0.007818082347512245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.818081940058619e-05, + "grad_norm": 3.9067742824554443, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8837884664535522, + "num_tokens": 484851383.0, + "step": 12711 + }, + { + "epoch": 1.6170970614425646, + "ewc_loss": 0.007830404676496983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.830404501874e-05, + "grad_norm": 3.9399259090423584, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8847172260284424, + "num_tokens": 484884361.0, + "step": 12712 + }, + { + "epoch": 
1.6172242717211551, + "ewc_loss": 0.007872423157095909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.872423157095909e-05, + "grad_norm": 3.933932304382324, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8559285402297974, + "num_tokens": 484922106.0, + "step": 12713 + }, + { + "epoch": 1.6173514819997457, + "ewc_loss": 0.007871461100876331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.871461275499314e-05, + "grad_norm": 3.9380288124084473, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8754041194915771, + "num_tokens": 484963100.0, + "step": 12714 + }, + { + "epoch": 1.6174786922783362, + "ewc_loss": 0.007862589322030544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86258970038034e-05, + "grad_norm": 3.8411805629730225, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8857079744338989, + "num_tokens": 485006896.0, + "step": 12715 + }, + { + "epoch": 1.6176059025569267, + "ewc_loss": 0.007809191942214966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.80919217504561e-05, + "grad_norm": 3.9028432369232178, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8748295903205872, + "num_tokens": 485045968.0, + "step": 12716 + }, + { + "epoch": 1.6177331128355172, + "ewc_loss": 0.007876187562942505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.876187737565488e-05, + "grad_norm": 3.8817062377929688, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8733069896697998, + "num_tokens": 485084736.0, + "step": 12717 + }, + { + "epoch": 1.6178603231141078, + "ewc_loss": 0.007826663553714752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.826663932064548e-05, + "grad_norm": 3.8988912105560303, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8842489719390869, + "num_tokens": 485123130.0, + "step": 12718 + }, + { + "epoch": 1.6179875333926983, + "ewc_loss": 0.007865961641073227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.865962106734514e-05, + "grad_norm": 3.9470407962799072, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8724607229232788, + "num_tokens": 485158072.0, + "step": 12719 + }, + { + "epoch": 1.6181147436712886, + "ewc_loss": 0.007871815003454685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.871814887039363e-05, + "grad_norm": 3.9289968013763428, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8830468654632568, + "num_tokens": 485196783.0, + "step": 12720 + }, + { + "epoch": 1.6182419539498791, + "ewc_loss": 0.007868860848248005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868860848248005e-05, + "grad_norm": 3.8961539268493652, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8823661804199219, + "num_tokens": 485231433.0, + "step": 12721 + }, + { + "epoch": 1.6183691642284697, + "ewc_loss": 0.007870794273912907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.870794070186093e-05, + "grad_norm": 3.9586801528930664, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8756293058395386, + "num_tokens": 485269016.0, + "step": 12722 + }, + { + "epoch": 1.6184963745070602, + "ewc_loss": 0.007901571691036224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90157137089409e-05, + "grad_norm": 3.8856451511383057, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8664016723632812, + "num_tokens": 485308293.0, + "step": 12723 + }, + { + "epoch": 1.6186235847856507, + "ewc_loss": 0.007855921983718872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.855922012822703e-05, + "grad_norm": 3.9420037269592285, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8852484822273254, + "num_tokens": 485340758.0, + "step": 12724 + }, + { + "epoch": 1.6187507950642412, + "ewc_loss": 0.007916365750133991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.916365575511009e-05, + "grad_norm": 3.8223752975463867, + "learning_rate": 1e-06, + "loss": 0.3104, + 
"mean_token_accuracy": 0.8901093006134033, + "num_tokens": 485384047.0, + "step": 12725 + }, + { + "epoch": 1.6188780053428315, + "ewc_loss": 0.007831740193068981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.831740367691964e-05, + "grad_norm": 3.9541702270507812, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8793675899505615, + "num_tokens": 485416909.0, + "step": 12726 + }, + { + "epoch": 1.619005215621422, + "ewc_loss": 0.007966604083776474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.966603880049661e-05, + "grad_norm": 3.980240821838379, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8777940273284912, + "num_tokens": 485449532.0, + "step": 12727 + }, + { + "epoch": 1.6191324259000126, + "ewc_loss": 0.007917727343738079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.917726907180622e-05, + "grad_norm": 3.8533971309661865, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8868243098258972, + "num_tokens": 485491171.0, + "step": 12728 + }, + { + "epoch": 1.6192596361786031, + "ewc_loss": 0.007853811606764793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.85381198511459e-05, + "grad_norm": 3.963742256164551, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8710284233093262, + "num_tokens": 485525511.0, + "step": 12729 + }, + { + "epoch": 1.6193868464571937, + "ewc_loss": 0.00796572770923376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.965727854752913e-05, + "grad_norm": 3.909008502960205, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.873782217502594, + "num_tokens": 485562517.0, + "step": 12730 + }, + { + "epoch": 1.6195140567357842, + "ewc_loss": 0.007882772013545036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.882771751610562e-05, + "grad_norm": 3.905762195587158, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.87745600938797, + "num_tokens": 485601058.0, + "step": 12731 + }, + { + "epoch": 
1.6196412670143747, + "ewc_loss": 0.00791975762695074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.919757626950741e-05, + "grad_norm": 3.943438768386841, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8898261785507202, + "num_tokens": 485634737.0, + "step": 12732 + }, + { + "epoch": 1.6197684772929652, + "ewc_loss": 0.007944174110889435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.944174285512418e-05, + "grad_norm": 3.8863189220428467, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.87980055809021, + "num_tokens": 485671300.0, + "step": 12733 + }, + { + "epoch": 1.6198956875715558, + "ewc_loss": 0.007889376021921635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.889376138336957e-05, + "grad_norm": 3.8981308937072754, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8842368125915527, + "num_tokens": 485708712.0, + "step": 12734 + }, + { + "epoch": 1.6200228978501463, + "ewc_loss": 0.007928316481411457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.928316335892305e-05, + "grad_norm": 3.873084306716919, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.888900637626648, + "num_tokens": 485746820.0, + "step": 12735 + }, + { + "epoch": 1.6201501081287368, + "ewc_loss": 0.007890509441494942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.890509732533246e-05, + "grad_norm": 3.9182558059692383, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8767088651657104, + "num_tokens": 485785921.0, + "step": 12736 + }, + { + "epoch": 1.6202773184073274, + "ewc_loss": 0.007924331352114677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924331293907017e-05, + "grad_norm": 3.891110420227051, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8768975138664246, + "num_tokens": 485825766.0, + "step": 12737 + }, + { + "epoch": 1.6204045286859179, + "ewc_loss": 0.007901683449745178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.901683420641348e-05, + "grad_norm": 3.912743330001831, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8714334964752197, + "num_tokens": 485862288.0, + "step": 12738 + }, + { + "epoch": 1.6205317389645084, + "ewc_loss": 0.007924704812467098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924704550532624e-05, + "grad_norm": 3.874386787414551, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8716673851013184, + "num_tokens": 485903582.0, + "step": 12739 + }, + { + "epoch": 1.620658949243099, + "ewc_loss": 0.007882324978709221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.882325007813051e-05, + "grad_norm": 3.8582892417907715, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8728481531143188, + "num_tokens": 485950931.0, + "step": 12740 + }, + { + "epoch": 1.6207861595216895, + "ewc_loss": 0.00790179893374443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.901799108367413e-05, + "grad_norm": 3.9530189037323, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8788890242576599, + "num_tokens": 485984185.0, + "step": 12741 + }, + { + "epoch": 1.62091336980028, + "ewc_loss": 0.007952926680445671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952926534926519e-05, + "grad_norm": 3.9036002159118652, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8830348253250122, + "num_tokens": 486018796.0, + "step": 12742 + }, + { + "epoch": 1.6210405800788705, + "ewc_loss": 0.00789730530232191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.897305476944894e-05, + "grad_norm": 3.9604570865631104, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.863226592540741, + "num_tokens": 486055328.0, + "step": 12743 + }, + { + "epoch": 1.6211677903574608, + "ewc_loss": 0.00793471559882164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.934715540613979e-05, + "grad_norm": 3.9133777618408203, + "learning_rate": 1e-06, + "loss": 0.3749, + 
"mean_token_accuracy": 0.8722997903823853, + "num_tokens": 486092340.0, + "step": 12744 + }, + { + "epoch": 1.6212950006360514, + "ewc_loss": 0.007878749631345272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.878749602241442e-05, + "grad_norm": 3.9847583770751953, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8631153106689453, + "num_tokens": 486123195.0, + "step": 12745 + }, + { + "epoch": 1.621422210914642, + "ewc_loss": 0.007957002148032188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.957001798786223e-05, + "grad_norm": 3.866727113723755, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8852872848510742, + "num_tokens": 486167145.0, + "step": 12746 + }, + { + "epoch": 1.6215494211932324, + "ewc_loss": 0.007859766483306885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.859766628826037e-05, + "grad_norm": 3.8952672481536865, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8683660626411438, + "num_tokens": 486207925.0, + "step": 12747 + }, + { + "epoch": 1.621676631471823, + "ewc_loss": 0.007945004850625992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.945005199871957e-05, + "grad_norm": 3.9455668926239014, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8806179761886597, + "num_tokens": 486245446.0, + "step": 12748 + }, + { + "epoch": 1.6218038417504135, + "ewc_loss": 0.007941205985844135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.941206422401592e-05, + "grad_norm": 3.9572272300720215, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8774152398109436, + "num_tokens": 486280461.0, + "step": 12749 + }, + { + "epoch": 1.6219310520290038, + "ewc_loss": 0.007904648780822754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.904649100964889e-05, + "grad_norm": 3.8754384517669678, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8651673793792725, + "num_tokens": 486323926.0, + "step": 12750 + }, + { + "epoch": 
1.6220582623075943, + "ewc_loss": 0.00788366049528122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.883660146035254e-05, + "grad_norm": 3.922081470489502, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8693563938140869, + "num_tokens": 486360016.0, + "step": 12751 + }, + { + "epoch": 1.6221854725861848, + "ewc_loss": 0.007939688861370087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.939688657643273e-05, + "grad_norm": 3.911592483520508, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8801721334457397, + "num_tokens": 486394994.0, + "step": 12752 + }, + { + "epoch": 1.6223126828647754, + "ewc_loss": 0.00790648628026247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90648628026247e-05, + "grad_norm": 3.8736677169799805, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8764591217041016, + "num_tokens": 486436909.0, + "step": 12753 + }, + { + "epoch": 1.622439893143366, + "ewc_loss": 0.00789718795567751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.897187606431544e-05, + "grad_norm": 3.9183802604675293, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.882671058177948, + "num_tokens": 486473201.0, + "step": 12754 + }, + { + "epoch": 1.6225671034219564, + "ewc_loss": 0.007944482378661633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.944482786115259e-05, + "grad_norm": 3.936729669570923, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8591645359992981, + "num_tokens": 486510733.0, + "step": 12755 + }, + { + "epoch": 1.622694313700547, + "ewc_loss": 0.007928002625703812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.928002742119133e-05, + "grad_norm": 3.9030799865722656, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8896371126174927, + "num_tokens": 486546666.0, + "step": 12756 + }, + { + "epoch": 1.6228215239791375, + "ewc_loss": 0.007922044955193996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.92204518802464e-05, + "grad_norm": 3.9642040729522705, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8573369383811951, + "num_tokens": 486585639.0, + "step": 12757 + }, + { + "epoch": 1.622948734257728, + "ewc_loss": 0.007952322252094746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952322630444542e-05, + "grad_norm": 3.887894630432129, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8833372592926025, + "num_tokens": 486623028.0, + "step": 12758 + }, + { + "epoch": 1.6230759445363185, + "ewc_loss": 0.007877345196902752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.877345342421904e-05, + "grad_norm": 3.969003438949585, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8729208111763, + "num_tokens": 486655906.0, + "step": 12759 + }, + { + "epoch": 1.623203154814909, + "ewc_loss": 0.007979015819728374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.979015936143696e-05, + "grad_norm": 3.953080415725708, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8719849586486816, + "num_tokens": 486689461.0, + "step": 12760 + }, + { + "epoch": 1.6233303650934996, + "ewc_loss": 0.007942549884319305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94254956417717e-05, + "grad_norm": 3.9525020122528076, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8622370958328247, + "num_tokens": 486728373.0, + "step": 12761 + }, + { + "epoch": 1.6234575753720901, + "ewc_loss": 0.007940595969557762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.940595969557762e-05, + "grad_norm": 3.83872127532959, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8851360082626343, + "num_tokens": 486769955.0, + "step": 12762 + }, + { + "epoch": 1.6235847856506807, + "ewc_loss": 0.007895532995462418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.895533053670079e-05, + "grad_norm": 3.949646234512329, + "learning_rate": 1e-06, + "loss": 0.3515, + 
"mean_token_accuracy": 0.8793306350708008, + "num_tokens": 486806468.0, + "step": 12763 + }, + { + "epoch": 1.6237119959292712, + "ewc_loss": 0.008003709837794304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003709808690473e-05, + "grad_norm": 3.9176840782165527, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8638803958892822, + "num_tokens": 486846526.0, + "step": 12764 + }, + { + "epoch": 1.6238392062078617, + "ewc_loss": 0.007926716469228268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.926716352812946e-05, + "grad_norm": 3.869971513748169, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8925917148590088, + "num_tokens": 486886323.0, + "step": 12765 + }, + { + "epoch": 1.6239664164864522, + "ewc_loss": 0.007909231819212437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.909231499070302e-05, + "grad_norm": 3.9895145893096924, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8594554662704468, + "num_tokens": 486918099.0, + "step": 12766 + }, + { + "epoch": 1.6240936267650428, + "ewc_loss": 0.007997148670256138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997148350114003e-05, + "grad_norm": 3.8786256313323975, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8901231288909912, + "num_tokens": 486953789.0, + "step": 12767 + }, + { + "epoch": 1.6242208370436333, + "ewc_loss": 0.007906974293291569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.906974497018382e-05, + "grad_norm": 3.87616229057312, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8752173185348511, + "num_tokens": 486996165.0, + "step": 12768 + }, + { + "epoch": 1.6243480473222236, + "ewc_loss": 0.007941502146422863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94150255387649e-05, + "grad_norm": 3.9392197132110596, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8876684308052063, + "num_tokens": 487028249.0, + "step": 12769 + }, + { + "epoch": 
1.6244752576008141, + "ewc_loss": 0.00797637552022934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976375491125509e-05, + "grad_norm": 3.8877687454223633, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8764613270759583, + "num_tokens": 487070099.0, + "step": 12770 + }, + { + "epoch": 1.6246024678794047, + "ewc_loss": 0.00790358241647482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90358244557865e-05, + "grad_norm": 3.9060299396514893, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8795280456542969, + "num_tokens": 487107117.0, + "step": 12771 + }, + { + "epoch": 1.6247296781579952, + "ewc_loss": 0.007960805669426918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.960805669426918e-05, + "grad_norm": 3.9294564723968506, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.865888237953186, + "num_tokens": 487146024.0, + "step": 12772 + }, + { + "epoch": 1.6248568884365857, + "ewc_loss": 0.007926744408905506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.926744729047641e-05, + "grad_norm": 3.9037628173828125, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8653538227081299, + "num_tokens": 487189864.0, + "step": 12773 + }, + { + "epoch": 1.6249840987151762, + "ewc_loss": 0.007918219082057476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.918218761915341e-05, + "grad_norm": 3.9060823917388916, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8796856999397278, + "num_tokens": 487230307.0, + "step": 12774 + }, + { + "epoch": 1.6251113089937665, + "ewc_loss": 0.007930828258395195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.930828724056482e-05, + "grad_norm": 3.9802968502044678, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8810848593711853, + "num_tokens": 487261387.0, + "step": 12775 + }, + { + "epoch": 1.625238519272357, + "ewc_loss": 0.007949030958116055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.949030987219885e-05, + "grad_norm": 3.9342668056488037, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8657851219177246, + "num_tokens": 487298611.0, + "step": 12776 + }, + { + "epoch": 1.6253657295509476, + "ewc_loss": 0.007907839491963387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.907839608378708e-05, + "grad_norm": 3.8962292671203613, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8756041526794434, + "num_tokens": 487338172.0, + "step": 12777 + }, + { + "epoch": 1.6254929398295381, + "ewc_loss": 0.007910976186394691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.910976273706183e-05, + "grad_norm": 3.9102163314819336, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8771900534629822, + "num_tokens": 487375126.0, + "step": 12778 + }, + { + "epoch": 1.6256201501081287, + "ewc_loss": 0.007919007912278175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91900820331648e-05, + "grad_norm": 3.868703842163086, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8879837989807129, + "num_tokens": 487416539.0, + "step": 12779 + }, + { + "epoch": 1.6257473603867192, + "ewc_loss": 0.00790863111615181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.908631232567132e-05, + "grad_norm": 3.9896862506866455, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8788483142852783, + "num_tokens": 487448429.0, + "step": 12780 + }, + { + "epoch": 1.6258745706653097, + "ewc_loss": 0.007991332560777664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.991332677192986e-05, + "grad_norm": 3.946607828140259, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8764238357543945, + "num_tokens": 487486054.0, + "step": 12781 + }, + { + "epoch": 1.6260017809439002, + "ewc_loss": 0.007914167828857899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.914167508715764e-05, + "grad_norm": 3.8743088245391846, + "learning_rate": 1e-06, + "loss": 0.3369, + 
"mean_token_accuracy": 0.8821307420730591, + "num_tokens": 487523771.0, + "step": 12782 + }, + { + "epoch": 1.6261289912224908, + "ewc_loss": 0.007920481264591217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.92048085713759e-05, + "grad_norm": 3.9109396934509277, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8821552991867065, + "num_tokens": 487561334.0, + "step": 12783 + }, + { + "epoch": 1.6262562015010813, + "ewc_loss": 0.007948038168251514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94803854660131e-05, + "grad_norm": 3.846492290496826, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.877851665019989, + "num_tokens": 487606382.0, + "step": 12784 + }, + { + "epoch": 1.6263834117796718, + "ewc_loss": 0.0079084113240242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.908411498647183e-05, + "grad_norm": 3.920985698699951, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8736109733581543, + "num_tokens": 487647640.0, + "step": 12785 + }, + { + "epoch": 1.6265106220582624, + "ewc_loss": 0.007958629168570042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.958629430504516e-05, + "grad_norm": 3.9430558681488037, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8734005689620972, + "num_tokens": 487682604.0, + "step": 12786 + }, + { + "epoch": 1.6266378323368529, + "ewc_loss": 0.007934032008051872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.934031600598246e-05, + "grad_norm": 3.8972671031951904, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8803439140319824, + "num_tokens": 487718271.0, + "step": 12787 + }, + { + "epoch": 1.6267650426154434, + "ewc_loss": 0.00791690219193697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.916901813587174e-05, + "grad_norm": 3.9517269134521484, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8635554313659668, + "num_tokens": 487752872.0, + "step": 12788 + }, + { + "epoch": 
1.626892252894034, + "ewc_loss": 0.007944968529045582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.944968820083886e-05, + "grad_norm": 3.871185064315796, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8681768178939819, + "num_tokens": 487798309.0, + "step": 12789 + }, + { + "epoch": 1.6270194631726245, + "ewc_loss": 0.007884630002081394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.884630031185225e-05, + "grad_norm": 3.9607954025268555, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8664214015007019, + "num_tokens": 487834052.0, + "step": 12790 + }, + { + "epoch": 1.627146673451215, + "ewc_loss": 0.007973713800311089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.973713945830241e-05, + "grad_norm": 3.9628045558929443, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8654149770736694, + "num_tokens": 487868662.0, + "step": 12791 + }, + { + "epoch": 1.6272738837298055, + "ewc_loss": 0.007931014522910118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.931014988571405e-05, + "grad_norm": 3.9391977787017822, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8740195035934448, + "num_tokens": 487907425.0, + "step": 12792 + }, + { + "epoch": 1.6274010940083958, + "ewc_loss": 0.007922517135739326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.922516670078039e-05, + "grad_norm": 3.8894388675689697, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8849352598190308, + "num_tokens": 487942979.0, + "step": 12793 + }, + { + "epoch": 1.6275283042869864, + "ewc_loss": 0.007911264896392822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911265129223466e-05, + "grad_norm": 3.9124746322631836, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8808046579360962, + "num_tokens": 487982402.0, + "step": 12794 + }, + { + "epoch": 1.6276555145655769, + "ewc_loss": 0.007936566136777401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.936566544231027e-05, + "grad_norm": 3.9350297451019287, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.884211540222168, + "num_tokens": 488020071.0, + "step": 12795 + }, + { + "epoch": 1.6277827248441674, + "ewc_loss": 0.007947046309709549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.947046105982736e-05, + "grad_norm": 3.930595636367798, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8639564514160156, + "num_tokens": 488057457.0, + "step": 12796 + }, + { + "epoch": 1.627909935122758, + "ewc_loss": 0.00792330875992775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.923308294266462e-05, + "grad_norm": 3.938746452331543, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8725646138191223, + "num_tokens": 488094002.0, + "step": 12797 + }, + { + "epoch": 1.6280371454013485, + "ewc_loss": 0.007939750328660011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.939750503282994e-05, + "grad_norm": 3.902794361114502, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8627809882164001, + "num_tokens": 488134995.0, + "step": 12798 + }, + { + "epoch": 1.6281643556799388, + "ewc_loss": 0.007915686815977097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.915686728665605e-05, + "grad_norm": 3.948901653289795, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8800495862960815, + "num_tokens": 488169942.0, + "step": 12799 + }, + { + "epoch": 1.6282915659585293, + "ewc_loss": 0.00796005129814148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.960051152622327e-05, + "grad_norm": 3.9009222984313965, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8777599334716797, + "num_tokens": 488210852.0, + "step": 12800 + }, + { + "epoch": 1.6284187762371198, + "ewc_loss": 0.007909824140369892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.909824489615858e-05, + "grad_norm": 3.893216609954834, + "learning_rate": 1e-06, + "loss": 0.3555, + 
"mean_token_accuracy": 0.8788461089134216, + "num_tokens": 488253067.0, + "step": 12801 + }, + { + "epoch": 1.6285459865157104, + "ewc_loss": 0.007932181470096111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932181324576959e-05, + "grad_norm": 3.9294443130493164, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8859190344810486, + "num_tokens": 488293498.0, + "step": 12802 + }, + { + "epoch": 1.628673196794301, + "ewc_loss": 0.00792567990720272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.925679528852925e-05, + "grad_norm": 3.8964662551879883, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8763251304626465, + "num_tokens": 488333931.0, + "step": 12803 + }, + { + "epoch": 1.6288004070728914, + "ewc_loss": 0.007911310531198978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911310967756435e-05, + "grad_norm": 3.9328722953796387, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8803147077560425, + "num_tokens": 488371724.0, + "step": 12804 + }, + { + "epoch": 1.628927617351482, + "ewc_loss": 0.007930736988782883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.930737046990544e-05, + "grad_norm": 3.8734288215637207, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8719706535339355, + "num_tokens": 488415171.0, + "step": 12805 + }, + { + "epoch": 1.6290548276300725, + "ewc_loss": 0.00786836352199316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868363900342956e-05, + "grad_norm": 3.9738786220550537, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8706499934196472, + "num_tokens": 488451672.0, + "step": 12806 + }, + { + "epoch": 1.629182037908663, + "ewc_loss": 0.00793458241969347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.93458239058964e-05, + "grad_norm": 3.8897647857666016, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8901517391204834, + "num_tokens": 488492657.0, + "step": 12807 + }, + { + "epoch": 
1.6293092481872535, + "ewc_loss": 0.007850264199078083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.850264228181913e-05, + "grad_norm": 3.8814358711242676, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8839404582977295, + "num_tokens": 488530681.0, + "step": 12808 + }, + { + "epoch": 1.629436458465844, + "ewc_loss": 0.007879680953919888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.879680924816057e-05, + "grad_norm": 3.9376230239868164, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8866211175918579, + "num_tokens": 488560542.0, + "step": 12809 + }, + { + "epoch": 1.6295636687444346, + "ewc_loss": 0.007905879989266396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.905880192993209e-05, + "grad_norm": 3.9662814140319824, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8775191307067871, + "num_tokens": 488597491.0, + "step": 12810 + }, + { + "epoch": 1.6296908790230251, + "ewc_loss": 0.007899625226855278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.899625052232295e-05, + "grad_norm": 3.894810438156128, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8830149173736572, + "num_tokens": 488634556.0, + "step": 12811 + }, + { + "epoch": 1.6298180893016156, + "ewc_loss": 0.007852229289710522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.852229464333504e-05, + "grad_norm": 4.0032758712768555, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8724644184112549, + "num_tokens": 488668174.0, + "step": 12812 + }, + { + "epoch": 1.6299452995802062, + "ewc_loss": 0.007938025519251823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.938025373732671e-05, + "grad_norm": 3.9049196243286133, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8779195547103882, + "num_tokens": 488705994.0, + "step": 12813 + }, + { + "epoch": 1.6300725098587967, + "ewc_loss": 0.007853241637349129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.853241550037637e-05, + "grad_norm": 3.906313896179199, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8666878938674927, + "num_tokens": 488746908.0, + "step": 12814 + }, + { + "epoch": 1.6301997201373872, + "ewc_loss": 0.00787876732647419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.878767064539716e-05, + "grad_norm": 3.8844878673553467, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8772578835487366, + "num_tokens": 488785607.0, + "step": 12815 + }, + { + "epoch": 1.6303269304159778, + "ewc_loss": 0.007889491505920887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.88949109846726e-05, + "grad_norm": 4.003657817840576, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8756576776504517, + "num_tokens": 488823831.0, + "step": 12816 + }, + { + "epoch": 1.630454140694568, + "ewc_loss": 0.007949399761855602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.949399878270924e-05, + "grad_norm": 3.919842004776001, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.883274257183075, + "num_tokens": 488859944.0, + "step": 12817 + }, + { + "epoch": 1.6305813509731586, + "ewc_loss": 0.007871602661907673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.871602429077029e-05, + "grad_norm": 3.866063356399536, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.862092137336731, + "num_tokens": 488903764.0, + "step": 12818 + }, + { + "epoch": 1.6307085612517491, + "ewc_loss": 0.007862112484872341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86211239756085e-05, + "grad_norm": 3.9170801639556885, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8650729060173035, + "num_tokens": 488946890.0, + "step": 12819 + }, + { + "epoch": 1.6308357715303397, + "ewc_loss": 0.007911432534456253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911432476248592e-05, + "grad_norm": 3.950977325439453, + "learning_rate": 1e-06, + "loss": 0.3607, + 
"mean_token_accuracy": 0.8729247450828552, + "num_tokens": 488981901.0, + "step": 12820 + }, + { + "epoch": 1.6309629818089302, + "ewc_loss": 0.00792631320655346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.92631326476112e-05, + "grad_norm": 3.8618688583374023, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8785110116004944, + "num_tokens": 489024180.0, + "step": 12821 + }, + { + "epoch": 1.6310901920875207, + "ewc_loss": 0.007851880043745041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.851880218368024e-05, + "grad_norm": 3.8263137340545654, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8793267607688904, + "num_tokens": 489070971.0, + "step": 12822 + }, + { + "epoch": 1.6312174023661112, + "ewc_loss": 0.007880284450948238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.880284101702273e-05, + "grad_norm": 3.883068561553955, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8850216865539551, + "num_tokens": 489111636.0, + "step": 12823 + }, + { + "epoch": 1.6313446126447015, + "ewc_loss": 0.007894966751337051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.894966984167695e-05, + "grad_norm": 3.9130587577819824, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8741347789764404, + "num_tokens": 489148677.0, + "step": 12824 + }, + { + "epoch": 1.631471822923292, + "ewc_loss": 0.007899889722466469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.899889897089452e-05, + "grad_norm": 3.936354637145996, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8660327792167664, + "num_tokens": 489186222.0, + "step": 12825 + }, + { + "epoch": 1.6315990332018826, + "ewc_loss": 0.007899775169789791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.899774936959147e-05, + "grad_norm": 3.96710205078125, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8515337705612183, + "num_tokens": 489223173.0, + "step": 12826 + }, + { + "epoch": 
1.6317262434804731, + "ewc_loss": 0.007903838530182838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.903838559286669e-05, + "grad_norm": 3.8894224166870117, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8766786456108093, + "num_tokens": 489259236.0, + "step": 12827 + }, + { + "epoch": 1.6318534537590637, + "ewc_loss": 0.007857143878936768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.857143646106124e-05, + "grad_norm": 3.88942551612854, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8684561252593994, + "num_tokens": 489298449.0, + "step": 12828 + }, + { + "epoch": 1.6319806640376542, + "ewc_loss": 0.007894164882600307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89416444604285e-05, + "grad_norm": 3.9633240699768066, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8584463596343994, + "num_tokens": 489334800.0, + "step": 12829 + }, + { + "epoch": 1.6321078743162447, + "ewc_loss": 0.007943278178572655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.943278615130112e-05, + "grad_norm": 3.8542590141296387, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8825243711471558, + "num_tokens": 489378510.0, + "step": 12830 + }, + { + "epoch": 1.6322350845948352, + "ewc_loss": 0.007863507606089115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86350792623125e-05, + "grad_norm": 3.8787922859191895, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8654522895812988, + "num_tokens": 489419267.0, + "step": 12831 + }, + { + "epoch": 1.6323622948734258, + "ewc_loss": 0.007918777875602245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.918778283055872e-05, + "grad_norm": 3.9051942825317383, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8782939910888672, + "num_tokens": 489454772.0, + "step": 12832 + }, + { + "epoch": 1.6324895051520163, + "ewc_loss": 0.007925920188426971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.925920363049954e-05, + "grad_norm": 3.9675192832946777, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8719768524169922, + "num_tokens": 489490146.0, + "step": 12833 + }, + { + "epoch": 1.6326167154306068, + "ewc_loss": 0.007946544326841831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.946544064907357e-05, + "grad_norm": 3.8833022117614746, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8463459014892578, + "num_tokens": 489532754.0, + "step": 12834 + }, + { + "epoch": 1.6327439257091974, + "ewc_loss": 0.007894056849181652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.894056761870161e-05, + "grad_norm": 3.9458906650543213, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8733928203582764, + "num_tokens": 489570378.0, + "step": 12835 + }, + { + "epoch": 1.6328711359877879, + "ewc_loss": 0.007957967929542065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.957968045957386e-05, + "grad_norm": 3.914862632751465, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8697855472564697, + "num_tokens": 489604562.0, + "step": 12836 + }, + { + "epoch": 1.6329983462663784, + "ewc_loss": 0.007943551056087017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.943550735944882e-05, + "grad_norm": 3.9268624782562256, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8848662376403809, + "num_tokens": 489644060.0, + "step": 12837 + }, + { + "epoch": 1.633125556544969, + "ewc_loss": 0.007960584945976734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.960585207911208e-05, + "grad_norm": 3.9218087196350098, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8776756525039673, + "num_tokens": 489682693.0, + "step": 12838 + }, + { + "epoch": 1.6332527668235595, + "ewc_loss": 0.007948934100568295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.948934216983616e-05, + "grad_norm": 3.913515329360962, + "learning_rate": 1e-06, + "loss": 0.3389, + 
"mean_token_accuracy": 0.8845735788345337, + "num_tokens": 489720786.0, + "step": 12839 + }, + { + "epoch": 1.63337997710215, + "ewc_loss": 0.00795008521527052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.95008527347818e-05, + "grad_norm": 3.9087634086608887, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8814303874969482, + "num_tokens": 489758773.0, + "step": 12840 + }, + { + "epoch": 1.6335071873807405, + "ewc_loss": 0.007929548621177673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.929548883112147e-05, + "grad_norm": 3.944720983505249, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8784560561180115, + "num_tokens": 489793436.0, + "step": 12841 + }, + { + "epoch": 1.6336343976593308, + "ewc_loss": 0.007966269738972187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96626991359517e-05, + "grad_norm": 3.8746511936187744, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8945000767707825, + "num_tokens": 489828634.0, + "step": 12842 + }, + { + "epoch": 1.6337616079379214, + "ewc_loss": 0.007911783643066883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911783905001357e-05, + "grad_norm": 3.9103963375091553, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8810067176818848, + "num_tokens": 489866335.0, + "step": 12843 + }, + { + "epoch": 1.6338888182165119, + "ewc_loss": 0.007958196103572845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.95819578343071e-05, + "grad_norm": 3.931497097015381, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8854731321334839, + "num_tokens": 489903059.0, + "step": 12844 + }, + { + "epoch": 1.6340160284951024, + "ewc_loss": 0.007965045981109142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.965046097524464e-05, + "grad_norm": 3.8980259895324707, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8819026947021484, + "num_tokens": 489940186.0, + "step": 12845 + }, + { + "epoch": 
1.634143238773693, + "ewc_loss": 0.007932055741548538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932055450510234e-05, + "grad_norm": 3.842142343521118, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8866806626319885, + "num_tokens": 489984732.0, + "step": 12846 + }, + { + "epoch": 1.6342704490522835, + "ewc_loss": 0.00790493842214346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.904938684077933e-05, + "grad_norm": 3.9700403213500977, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8778056502342224, + "num_tokens": 490016699.0, + "step": 12847 + }, + { + "epoch": 1.6343976593308738, + "ewc_loss": 0.007995067164301872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.995067426236346e-05, + "grad_norm": 3.967231512069702, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8677963018417358, + "num_tokens": 490051351.0, + "step": 12848 + }, + { + "epoch": 1.6345248696094643, + "ewc_loss": 0.007954970002174377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.954970351420343e-05, + "grad_norm": 3.9044995307922363, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8854961395263672, + "num_tokens": 490089668.0, + "step": 12849 + }, + { + "epoch": 1.6346520798880548, + "ewc_loss": 0.007916628383100033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91662823758088e-05, + "grad_norm": 3.875387191772461, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8770942687988281, + "num_tokens": 490129651.0, + "step": 12850 + }, + { + "epoch": 1.6347792901666454, + "ewc_loss": 0.007935886271297932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935886242194101e-05, + "grad_norm": 3.9163119792938232, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8706929683685303, + "num_tokens": 490168441.0, + "step": 12851 + }, + { + "epoch": 1.6349065004452359, + "ewc_loss": 0.007950556464493275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.950556027935818e-05, + "grad_norm": 3.908106803894043, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8875080347061157, + "num_tokens": 490204754.0, + "step": 12852 + }, + { + "epoch": 1.6350337107238264, + "ewc_loss": 0.007925368845462799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.925368845462799e-05, + "grad_norm": 3.875241756439209, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8905298709869385, + "num_tokens": 490244178.0, + "step": 12853 + }, + { + "epoch": 1.635160921002417, + "ewc_loss": 0.00791468471288681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.914684829302132e-05, + "grad_norm": 3.935332775115967, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8708599805831909, + "num_tokens": 490281161.0, + "step": 12854 + }, + { + "epoch": 1.6352881312810075, + "ewc_loss": 0.00796105619519949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.961055962368846e-05, + "grad_norm": 3.9531478881835938, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8781682848930359, + "num_tokens": 490314892.0, + "step": 12855 + }, + { + "epoch": 1.635415341559598, + "ewc_loss": 0.007941785268485546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94178486103192e-05, + "grad_norm": 3.964970350265503, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8668875098228455, + "num_tokens": 490352600.0, + "step": 12856 + }, + { + "epoch": 1.6355425518381885, + "ewc_loss": 0.00794555339962244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.945553079480305e-05, + "grad_norm": 3.9855053424835205, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8887319564819336, + "num_tokens": 490384487.0, + "step": 12857 + }, + { + "epoch": 1.635669762116779, + "ewc_loss": 0.00793963111937046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.939631177578121e-05, + "grad_norm": 3.9325103759765625, + "learning_rate": 1e-06, + "loss": 0.4251, + 
"mean_token_accuracy": 0.8575578927993774, + "num_tokens": 490422199.0, + "step": 12858 + }, + { + "epoch": 1.6357969723953696, + "ewc_loss": 0.007929879240691662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.929879211587831e-05, + "grad_norm": 3.952253818511963, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8701603412628174, + "num_tokens": 490458058.0, + "step": 12859 + }, + { + "epoch": 1.6359241826739601, + "ewc_loss": 0.007956286892294884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.95628729974851e-05, + "grad_norm": 3.8763251304626465, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8872678875923157, + "num_tokens": 490498447.0, + "step": 12860 + }, + { + "epoch": 1.6360513929525506, + "ewc_loss": 0.007923291064798832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.923290831968188e-05, + "grad_norm": 3.908954381942749, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8643199801445007, + "num_tokens": 490538502.0, + "step": 12861 + }, + { + "epoch": 1.6361786032311412, + "ewc_loss": 0.00796188972890377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.961889787111431e-05, + "grad_norm": 3.9129862785339355, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8808331489562988, + "num_tokens": 490575752.0, + "step": 12862 + }, + { + "epoch": 1.6363058135097317, + "ewc_loss": 0.007957165129482746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.95716478023678e-05, + "grad_norm": 3.9372034072875977, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8774045705795288, + "num_tokens": 490613502.0, + "step": 12863 + }, + { + "epoch": 1.6364330237883222, + "ewc_loss": 0.007971713319420815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.971713785082102e-05, + "grad_norm": 3.903062105178833, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8711232542991638, + "num_tokens": 490653820.0, + "step": 12864 + }, + { + "epoch": 
1.6365602340669128, + "ewc_loss": 0.007955358363687992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.955358159961179e-05, + "grad_norm": 3.9076812267303467, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.861190915107727, + "num_tokens": 490693332.0, + "step": 12865 + }, + { + "epoch": 1.636687444345503, + "ewc_loss": 0.007978331297636032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.978331268532202e-05, + "grad_norm": 3.8992760181427, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.874476432800293, + "num_tokens": 490733135.0, + "step": 12866 + }, + { + "epoch": 1.6368146546240936, + "ewc_loss": 0.007952714338898659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952714076964185e-05, + "grad_norm": 3.8743715286254883, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8803876638412476, + "num_tokens": 490775505.0, + "step": 12867 + }, + { + "epoch": 1.6369418649026841, + "ewc_loss": 0.007932531647384167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932532025733963e-05, + "grad_norm": 3.870908498764038, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8754525780677795, + "num_tokens": 490817783.0, + "step": 12868 + }, + { + "epoch": 1.6370690751812746, + "ewc_loss": 0.007944975048303604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.944975368445739e-05, + "grad_norm": 3.996141195297241, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8709225654602051, + "num_tokens": 490849464.0, + "step": 12869 + }, + { + "epoch": 1.6371962854598652, + "ewc_loss": 0.007998534478247166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.998534420039505e-05, + "grad_norm": 3.907599925994873, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8702456951141357, + "num_tokens": 490891925.0, + "step": 12870 + }, + { + "epoch": 1.6373234957384557, + "ewc_loss": 0.007911808788776398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.911808643257245e-05, + "grad_norm": 3.9363741874694824, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8697424530982971, + "num_tokens": 490926221.0, + "step": 12871 + }, + { + "epoch": 1.6374507060170462, + "ewc_loss": 0.00796471070498228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96471067587845e-05, + "grad_norm": 3.9125819206237793, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8674871921539307, + "num_tokens": 490964141.0, + "step": 12872 + }, + { + "epoch": 1.6375779162956365, + "ewc_loss": 0.007942291907966137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.942291995277628e-05, + "grad_norm": 3.9655635356903076, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8718414902687073, + "num_tokens": 490997917.0, + "step": 12873 + }, + { + "epoch": 1.637705126574227, + "ewc_loss": 0.007969661615788937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96966123743914e-05, + "grad_norm": 3.9567084312438965, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8886191844940186, + "num_tokens": 491029244.0, + "step": 12874 + }, + { + "epoch": 1.6378323368528176, + "ewc_loss": 0.007967932149767876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.967931742314249e-05, + "grad_norm": 3.928028106689453, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8757076263427734, + "num_tokens": 491066591.0, + "step": 12875 + }, + { + "epoch": 1.6379595471314081, + "ewc_loss": 0.007961088791489601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96108870417811e-05, + "grad_norm": 3.8510642051696777, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8743394613265991, + "num_tokens": 491106546.0, + "step": 12876 + }, + { + "epoch": 1.6380867574099987, + "ewc_loss": 0.007939786650240421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.939786883071065e-05, + "grad_norm": 3.9310195446014404, + "learning_rate": 1e-06, + "loss": 0.3681, + 
"mean_token_accuracy": 0.8698709011077881, + "num_tokens": 491145512.0, + "step": 12877 + }, + { + "epoch": 1.6382139676885892, + "ewc_loss": 0.00800381787121296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003817492863163e-05, + "grad_norm": 3.8738303184509277, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8742396235466003, + "num_tokens": 491185624.0, + "step": 12878 + }, + { + "epoch": 1.6383411779671797, + "ewc_loss": 0.007935582660138607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.93558283476159e-05, + "grad_norm": 3.9144487380981445, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8699607849121094, + "num_tokens": 491220982.0, + "step": 12879 + }, + { + "epoch": 1.6384683882457702, + "ewc_loss": 0.007995924912393093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.995925261639059e-05, + "grad_norm": 3.919520616531372, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8783407211303711, + "num_tokens": 491262501.0, + "step": 12880 + }, + { + "epoch": 1.6385955985243608, + "ewc_loss": 0.007980706170201302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.980706141097471e-05, + "grad_norm": 3.9049952030181885, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8871469497680664, + "num_tokens": 491298078.0, + "step": 12881 + }, + { + "epoch": 1.6387228088029513, + "ewc_loss": 0.007974907755851746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.974907930474728e-05, + "grad_norm": 3.912497043609619, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8821530342102051, + "num_tokens": 491336203.0, + "step": 12882 + }, + { + "epoch": 1.6388500190815418, + "ewc_loss": 0.007989884354174137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.989884034032002e-05, + "grad_norm": 3.9270434379577637, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8664358854293823, + "num_tokens": 491374723.0, + "step": 12883 + }, + { + "epoch": 
1.6389772293601323, + "ewc_loss": 0.007967210374772549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.967209967318922e-05, + "grad_norm": 3.928192138671875, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8779204487800598, + "num_tokens": 491410546.0, + "step": 12884 + }, + { + "epoch": 1.6391044396387229, + "ewc_loss": 0.007951537147164345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.951537554617971e-05, + "grad_norm": 3.8917477130889893, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8723856806755066, + "num_tokens": 491450642.0, + "step": 12885 + }, + { + "epoch": 1.6392316499173134, + "ewc_loss": 0.007936288602650166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.936288602650166e-05, + "grad_norm": 3.8882055282592773, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8735105395317078, + "num_tokens": 491489893.0, + "step": 12886 + }, + { + "epoch": 1.639358860195904, + "ewc_loss": 0.007937575690448284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.937575719552115e-05, + "grad_norm": 3.8674449920654297, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8668462038040161, + "num_tokens": 491535870.0, + "step": 12887 + }, + { + "epoch": 1.6394860704744945, + "ewc_loss": 0.007914578542113304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.914578600320965e-05, + "grad_norm": 3.9349589347839355, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8757292032241821, + "num_tokens": 491575988.0, + "step": 12888 + }, + { + "epoch": 1.639613280753085, + "ewc_loss": 0.00794151145964861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.941511285025626e-05, + "grad_norm": 3.8860297203063965, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8778625726699829, + "num_tokens": 491616822.0, + "step": 12889 + }, + { + "epoch": 1.6397404910316755, + "ewc_loss": 0.007896363735198975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.896363968029618e-05, + "grad_norm": 3.9235117435455322, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8946598768234253, + "num_tokens": 491654132.0, + "step": 12890 + }, + { + "epoch": 1.6398677013102658, + "ewc_loss": 0.007908929139375687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.908928819233552e-05, + "grad_norm": 3.914984941482544, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8636517524719238, + "num_tokens": 491697215.0, + "step": 12891 + }, + { + "epoch": 1.6399949115888564, + "ewc_loss": 0.007893354631960392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.893354631960392e-05, + "grad_norm": 4.006612300872803, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8550679683685303, + "num_tokens": 491731338.0, + "step": 12892 + }, + { + "epoch": 1.6401221218674469, + "ewc_loss": 0.007942022755742073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.942022784845904e-05, + "grad_norm": 3.906709909439087, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.87443608045578, + "num_tokens": 491770808.0, + "step": 12893 + }, + { + "epoch": 1.6402493321460374, + "ewc_loss": 0.007832443341612816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.832443225197494e-05, + "grad_norm": 3.9541702270507812, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8837671279907227, + "num_tokens": 491804167.0, + "step": 12894 + }, + { + "epoch": 1.640376542424628, + "ewc_loss": 0.007924993522465229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924993406049907e-05, + "grad_norm": 3.9715306758880615, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8768137693405151, + "num_tokens": 491840094.0, + "step": 12895 + }, + { + "epoch": 1.6405037527032185, + "ewc_loss": 0.007894893176853657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89489276940003e-05, + "grad_norm": 3.9185144901275635, + "learning_rate": 1e-06, + "loss": 0.4022, + 
"mean_token_accuracy": 0.8627950549125671, + "num_tokens": 491881643.0, + "step": 12896 + }, + { + "epoch": 1.6406309629818088, + "ewc_loss": 0.00787610374391079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.876103336457163e-05, + "grad_norm": 3.9304392337799072, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.86405348777771, + "num_tokens": 491920951.0, + "step": 12897 + }, + { + "epoch": 1.6407581732603993, + "ewc_loss": 0.00788888055831194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.88888064562343e-05, + "grad_norm": 3.93530535697937, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8726464509963989, + "num_tokens": 491954599.0, + "step": 12898 + }, + { + "epoch": 1.6408853835389898, + "ewc_loss": 0.007888463325798512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.888463733252138e-05, + "grad_norm": 3.9175496101379395, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8751969337463379, + "num_tokens": 491995375.0, + "step": 12899 + }, + { + "epoch": 1.6410125938175804, + "ewc_loss": 0.007888411171734333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.888411346357316e-05, + "grad_norm": 3.8930020332336426, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.871479868888855, + "num_tokens": 492035297.0, + "step": 12900 + }, + { + "epoch": 1.6411398040961709, + "ewc_loss": 0.007889538072049618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.889538392191753e-05, + "grad_norm": 3.948869228363037, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8764936923980713, + "num_tokens": 492071003.0, + "step": 12901 + }, + { + "epoch": 1.6412670143747614, + "ewc_loss": 0.007944853976368904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.944853859953582e-05, + "grad_norm": 3.9165191650390625, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8757554292678833, + "num_tokens": 492106046.0, + "step": 12902 + }, + { + "epoch": 
1.641394224653352, + "ewc_loss": 0.007902256213128567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.902256038505584e-05, + "grad_norm": 3.8803324699401855, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8871225118637085, + "num_tokens": 492147252.0, + "step": 12903 + }, + { + "epoch": 1.6415214349319425, + "ewc_loss": 0.007913108915090561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9131088568829e-05, + "grad_norm": 3.882852792739868, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8672332167625427, + "num_tokens": 492188185.0, + "step": 12904 + }, + { + "epoch": 1.641648645210533, + "ewc_loss": 0.007918721064925194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.918721530586481e-05, + "grad_norm": 3.9698638916015625, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8813425898551941, + "num_tokens": 492219630.0, + "step": 12905 + }, + { + "epoch": 1.6417758554891235, + "ewc_loss": 0.007986405864357948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.986405398696661e-05, + "grad_norm": 3.999112129211426, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8766165375709534, + "num_tokens": 492251955.0, + "step": 12906 + }, + { + "epoch": 1.641903065767714, + "ewc_loss": 0.007970916107296944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.970916340127587e-05, + "grad_norm": 3.8903982639312744, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8768303990364075, + "num_tokens": 492293484.0, + "step": 12907 + }, + { + "epoch": 1.6420302760463046, + "ewc_loss": 0.007907605729997158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.907605322543532e-05, + "grad_norm": 3.9344871044158936, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8614298701286316, + "num_tokens": 492331169.0, + "step": 12908 + }, + { + "epoch": 1.642157486324895, + "ewc_loss": 0.00795993022620678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.959930371725932e-05, + "grad_norm": 3.8791375160217285, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8830110430717468, + "num_tokens": 492366071.0, + "step": 12909 + }, + { + "epoch": 1.6422846966034856, + "ewc_loss": 0.007906459271907806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.906459359219298e-05, + "grad_norm": 3.9535505771636963, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8962185382843018, + "num_tokens": 492396820.0, + "step": 12910 + }, + { + "epoch": 1.6424119068820762, + "ewc_loss": 0.00797754805535078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977547647897154e-05, + "grad_norm": 3.9006776809692383, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8826866149902344, + "num_tokens": 492435282.0, + "step": 12911 + }, + { + "epoch": 1.6425391171606667, + "ewc_loss": 0.007928171195089817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.928170816740021e-05, + "grad_norm": 3.9170472621917725, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8843865394592285, + "num_tokens": 492470195.0, + "step": 12912 + }, + { + "epoch": 1.6426663274392572, + "ewc_loss": 0.007953669875860214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953670137794688e-05, + "grad_norm": 3.8905954360961914, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.869684100151062, + "num_tokens": 492514265.0, + "step": 12913 + }, + { + "epoch": 1.6427935377178478, + "ewc_loss": 0.00791797786951065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.917977927718312e-05, + "grad_norm": 3.913323163986206, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8768850564956665, + "num_tokens": 492550348.0, + "step": 12914 + }, + { + "epoch": 1.642920747996438, + "ewc_loss": 0.007938987575471401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.938987982925028e-05, + "grad_norm": 3.8985326290130615, + "learning_rate": 1e-06, + "loss": 0.3457, + 
"mean_token_accuracy": 0.8789573907852173, + "num_tokens": 492590648.0, + "step": 12915 + }, + { + "epoch": 1.6430479582750286, + "ewc_loss": 0.007916063070297241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.916062895674258e-05, + "grad_norm": 3.959846258163452, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.876677393913269, + "num_tokens": 492629523.0, + "step": 12916 + }, + { + "epoch": 1.6431751685536191, + "ewc_loss": 0.007952798157930374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952797750476748e-05, + "grad_norm": 3.9030873775482178, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8710008263587952, + "num_tokens": 492668392.0, + "step": 12917 + }, + { + "epoch": 1.6433023788322096, + "ewc_loss": 0.007907641120254993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.907640974735841e-05, + "grad_norm": 4.076961994171143, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8654007911682129, + "num_tokens": 492696073.0, + "step": 12918 + }, + { + "epoch": 1.6434295891108002, + "ewc_loss": 0.008034665137529373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.034665370360017e-05, + "grad_norm": 3.9597785472869873, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.878301739692688, + "num_tokens": 492728590.0, + "step": 12919 + }, + { + "epoch": 1.6435567993893907, + "ewc_loss": 0.00791564118117094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.915640890132636e-05, + "grad_norm": 3.919748306274414, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8690849542617798, + "num_tokens": 492767290.0, + "step": 12920 + }, + { + "epoch": 1.6436840096679812, + "ewc_loss": 0.007937665097415447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.937665213830769e-05, + "grad_norm": 3.8991620540618896, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8862438201904297, + "num_tokens": 492801019.0, + "step": 12921 + }, + { + "epoch": 
1.6438112199465715, + "ewc_loss": 0.007962045259773731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.962045492604375e-05, + "grad_norm": 3.884366750717163, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8755521774291992, + "num_tokens": 492841220.0, + "step": 12922 + }, + { + "epoch": 1.643938430225162, + "ewc_loss": 0.007956866174936295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.956865738378838e-05, + "grad_norm": 3.868541717529297, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8655735850334167, + "num_tokens": 492885076.0, + "step": 12923 + }, + { + "epoch": 1.6440656405037526, + "ewc_loss": 0.007970811799168587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.970811566337943e-05, + "grad_norm": 3.9428932666778564, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8712680339813232, + "num_tokens": 492918554.0, + "step": 12924 + }, + { + "epoch": 1.6441928507823431, + "ewc_loss": 0.008021268993616104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.021268877200782e-05, + "grad_norm": 3.8241758346557617, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8919697999954224, + "num_tokens": 492961890.0, + "step": 12925 + }, + { + "epoch": 1.6443200610609336, + "ewc_loss": 0.007922729477286339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.922729855636135e-05, + "grad_norm": 3.90901517868042, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.885385274887085, + "num_tokens": 492997768.0, + "step": 12926 + }, + { + "epoch": 1.6444472713395242, + "ewc_loss": 0.008035647682845592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035647624637932e-05, + "grad_norm": 3.8710718154907227, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.8992587924003601, + "num_tokens": 493037017.0, + "step": 12927 + }, + { + "epoch": 1.6445744816181147, + "ewc_loss": 0.007950977422297001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.950977305881679e-05, + "grad_norm": 3.9085752964019775, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8846334218978882, + "num_tokens": 493074579.0, + "step": 12928 + }, + { + "epoch": 1.6447016918967052, + "ewc_loss": 0.007996480911970139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.996481144800782e-05, + "grad_norm": 3.9343619346618652, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8768860101699829, + "num_tokens": 493111249.0, + "step": 12929 + }, + { + "epoch": 1.6448289021752958, + "ewc_loss": 0.00798935629427433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.989356527104974e-05, + "grad_norm": 3.95117449760437, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8603688478469849, + "num_tokens": 493150723.0, + "step": 12930 + }, + { + "epoch": 1.6449561124538863, + "ewc_loss": 0.007976977154612541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976977212820202e-05, + "grad_norm": 3.8799588680267334, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8753290176391602, + "num_tokens": 493189530.0, + "step": 12931 + }, + { + "epoch": 1.6450833227324768, + "ewc_loss": 0.007946012541651726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94601219240576e-05, + "grad_norm": 3.8539793491363525, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8833662271499634, + "num_tokens": 493231369.0, + "step": 12932 + }, + { + "epoch": 1.6452105330110673, + "ewc_loss": 0.007949192076921463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.949191785883158e-05, + "grad_norm": 3.8974921703338623, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8623694181442261, + "num_tokens": 493278430.0, + "step": 12933 + }, + { + "epoch": 1.6453377432896579, + "ewc_loss": 0.007988550700247288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.988550351001322e-05, + "grad_norm": 3.9362406730651855, + "learning_rate": 1e-06, + "loss": 0.3428, + 
"mean_token_accuracy": 0.8818098902702332, + "num_tokens": 493312258.0, + "step": 12934 + }, + { + "epoch": 1.6454649535682484, + "ewc_loss": 0.007979589514434338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.979589281603694e-05, + "grad_norm": 3.918898105621338, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8753081560134888, + "num_tokens": 493347725.0, + "step": 12935 + }, + { + "epoch": 1.645592163846839, + "ewc_loss": 0.007976164110004902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976164488354698e-05, + "grad_norm": 3.8854196071624756, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8791813850402832, + "num_tokens": 493388119.0, + "step": 12936 + }, + { + "epoch": 1.6457193741254295, + "ewc_loss": 0.007929958403110504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.929958519525826e-05, + "grad_norm": 3.9059526920318604, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.877876877784729, + "num_tokens": 493424607.0, + "step": 12937 + }, + { + "epoch": 1.64584658440402, + "ewc_loss": 0.007976816035807133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976815686561167e-05, + "grad_norm": 3.995389938354492, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8524482250213623, + "num_tokens": 493459546.0, + "step": 12938 + }, + { + "epoch": 1.6459737946826105, + "ewc_loss": 0.007993561215698719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99356130301021e-05, + "grad_norm": 3.908306121826172, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8799489736557007, + "num_tokens": 493494162.0, + "step": 12939 + }, + { + "epoch": 1.6461010049612008, + "ewc_loss": 0.007931157015264034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.931156869744882e-05, + "grad_norm": 3.945265531539917, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.874842643737793, + "num_tokens": 493537523.0, + "step": 12940 + }, + { + "epoch": 
1.6462282152397913, + "ewc_loss": 0.007972614839673042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.972614548634738e-05, + "grad_norm": 3.879201889038086, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.874962329864502, + "num_tokens": 493580065.0, + "step": 12941 + }, + { + "epoch": 1.6463554255183819, + "ewc_loss": 0.00792943499982357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.929435378173366e-05, + "grad_norm": 3.8597633838653564, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8825956583023071, + "num_tokens": 493626717.0, + "step": 12942 + }, + { + "epoch": 1.6464826357969724, + "ewc_loss": 0.007912395521998405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91239581303671e-05, + "grad_norm": 3.9141640663146973, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8813292384147644, + "num_tokens": 493666266.0, + "step": 12943 + }, + { + "epoch": 1.646609846075563, + "ewc_loss": 0.00794882420450449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.948824350023642e-05, + "grad_norm": 3.929103374481201, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8668064475059509, + "num_tokens": 493703184.0, + "step": 12944 + }, + { + "epoch": 1.6467370563541535, + "ewc_loss": 0.007933717221021652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.933717279229313e-05, + "grad_norm": 3.8962604999542236, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8906207084655762, + "num_tokens": 493744061.0, + "step": 12945 + }, + { + "epoch": 1.6468642666327438, + "ewc_loss": 0.007907029241323471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.907029066700488e-05, + "grad_norm": 3.9419972896575928, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8755491971969604, + "num_tokens": 493783585.0, + "step": 12946 + }, + { + "epoch": 1.6469914769113343, + "ewc_loss": 0.007932743057608604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.932743028504774e-05, + "grad_norm": 3.8794474601745605, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8816345930099487, + "num_tokens": 493827781.0, + "step": 12947 + }, + { + "epoch": 1.6471186871899248, + "ewc_loss": 0.007866926491260529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.866926898714155e-05, + "grad_norm": 3.93526554107666, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8808057308197021, + "num_tokens": 493864418.0, + "step": 12948 + }, + { + "epoch": 1.6472458974685154, + "ewc_loss": 0.007927519269287586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.927519618533552e-05, + "grad_norm": 3.980595111846924, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8747279644012451, + "num_tokens": 493895226.0, + "step": 12949 + }, + { + "epoch": 1.6473731077471059, + "ewc_loss": 0.00790171418339014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.901713979663327e-05, + "grad_norm": 3.903984546661377, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8668124675750732, + "num_tokens": 493935210.0, + "step": 12950 + }, + { + "epoch": 1.6475003180256964, + "ewc_loss": 0.007880588062107563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.880588236730546e-05, + "grad_norm": 3.9695310592651367, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8752162456512451, + "num_tokens": 493966368.0, + "step": 12951 + }, + { + "epoch": 1.647627528304287, + "ewc_loss": 0.00794968195259571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.949682185426354e-05, + "grad_norm": 3.9271934032440186, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8713152408599854, + "num_tokens": 494004304.0, + "step": 12952 + }, + { + "epoch": 1.6477547385828775, + "ewc_loss": 0.00788987334817648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.889873086242005e-05, + "grad_norm": 3.872633934020996, + "learning_rate": 1e-06, + "loss": 0.3424, + 
"mean_token_accuracy": 0.882441520690918, + "num_tokens": 494044758.0, + "step": 12953 + }, + { + "epoch": 1.647881948861468, + "ewc_loss": 0.007895822636783123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.895822636783123e-05, + "grad_norm": 3.9651496410369873, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8611247539520264, + "num_tokens": 494080624.0, + "step": 12954 + }, + { + "epoch": 1.6480091591400585, + "ewc_loss": 0.007963900454342365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.963900861795992e-05, + "grad_norm": 4.0016608238220215, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.882912278175354, + "num_tokens": 494116215.0, + "step": 12955 + }, + { + "epoch": 1.648136369418649, + "ewc_loss": 0.007938225753605366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.938225462567061e-05, + "grad_norm": 3.9656190872192383, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8748266696929932, + "num_tokens": 494150317.0, + "step": 12956 + }, + { + "epoch": 1.6482635796972396, + "ewc_loss": 0.007917708717286587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.917708717286587e-05, + "grad_norm": 3.8921496868133545, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8896934390068054, + "num_tokens": 494187273.0, + "step": 12957 + }, + { + "epoch": 1.64839078997583, + "ewc_loss": 0.007890033535659313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.890033157309517e-05, + "grad_norm": 3.922886610031128, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8721020817756653, + "num_tokens": 494226921.0, + "step": 12958 + }, + { + "epoch": 1.6485180002544206, + "ewc_loss": 0.00792840775102377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.928408012958243e-05, + "grad_norm": 3.9008288383483887, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8829985857009888, + "num_tokens": 494263141.0, + "step": 12959 + }, + { + "epoch": 
1.6486452105330112, + "ewc_loss": 0.007915196940302849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91519705671817e-05, + "grad_norm": 3.923088312149048, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8724372982978821, + "num_tokens": 494307925.0, + "step": 12960 + }, + { + "epoch": 1.6487724208116017, + "ewc_loss": 0.007937271147966385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.93727085692808e-05, + "grad_norm": 3.8758633136749268, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8695939183235168, + "num_tokens": 494353428.0, + "step": 12961 + }, + { + "epoch": 1.6488996310901922, + "ewc_loss": 0.00789546873420477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.895469025243074e-05, + "grad_norm": 4.074102401733398, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8652605414390564, + "num_tokens": 494383559.0, + "step": 12962 + }, + { + "epoch": 1.6490268413687827, + "ewc_loss": 0.008046438917517662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.046439324971288e-05, + "grad_norm": 3.890611410140991, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8787486553192139, + "num_tokens": 494419564.0, + "step": 12963 + }, + { + "epoch": 1.649154051647373, + "ewc_loss": 0.00786546990275383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.865469524404034e-05, + "grad_norm": 3.9237470626831055, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8768085837364197, + "num_tokens": 494466886.0, + "step": 12964 + }, + { + "epoch": 1.6492812619259636, + "ewc_loss": 0.00797257199883461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.972572348080575e-05, + "grad_norm": 3.953036069869995, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8594894409179688, + "num_tokens": 494506672.0, + "step": 12965 + }, + { + "epoch": 1.649408472204554, + "ewc_loss": 0.00796310231089592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.963102689245716e-05, + "grad_norm": 3.9520347118377686, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8761401176452637, + "num_tokens": 494541881.0, + "step": 12966 + }, + { + "epoch": 1.6495356824831446, + "ewc_loss": 0.00794780720025301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94780717114918e-05, + "grad_norm": 3.861004590988159, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8808205127716064, + "num_tokens": 494583546.0, + "step": 12967 + }, + { + "epoch": 1.6496628927617352, + "ewc_loss": 0.007897676900029182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.897676550783217e-05, + "grad_norm": 3.996450424194336, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8642942905426025, + "num_tokens": 494616625.0, + "step": 12968 + }, + { + "epoch": 1.6497901030403257, + "ewc_loss": 0.008013603277504444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01360365585424e-05, + "grad_norm": 3.9149997234344482, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8894518613815308, + "num_tokens": 494654590.0, + "step": 12969 + }, + { + "epoch": 1.6499173133189162, + "ewc_loss": 0.007909631356596947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90963094914332e-05, + "grad_norm": 3.876370429992676, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.889641523361206, + "num_tokens": 494692471.0, + "step": 12970 + }, + { + "epoch": 1.6500445235975065, + "ewc_loss": 0.007935305126011372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935304893180728e-05, + "grad_norm": 3.923832893371582, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8690261840820312, + "num_tokens": 494730347.0, + "step": 12971 + }, + { + "epoch": 1.650171733876097, + "ewc_loss": 0.0079519497230649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.951950101414695e-05, + "grad_norm": 3.9635627269744873, + "learning_rate": 1e-06, + "loss": 0.3909, + 
"mean_token_accuracy": 0.8650763034820557, + "num_tokens": 494763753.0, + "step": 12972 + }, + { + "epoch": 1.6502989441546876, + "ewc_loss": 0.007966520264744759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.966520206537098e-05, + "grad_norm": 3.9093055725097656, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8723207116127014, + "num_tokens": 494800416.0, + "step": 12973 + }, + { + "epoch": 1.6504261544332781, + "ewc_loss": 0.007915305905044079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.915305468486622e-05, + "grad_norm": 3.8835816383361816, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8751283884048462, + "num_tokens": 494842733.0, + "step": 12974 + }, + { + "epoch": 1.6505533647118686, + "ewc_loss": 0.007934940047562122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.934939640108496e-05, + "grad_norm": 3.9567275047302246, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8703325986862183, + "num_tokens": 494878661.0, + "step": 12975 + }, + { + "epoch": 1.6506805749904592, + "ewc_loss": 0.007976789958775043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976790220709518e-05, + "grad_norm": 3.887392520904541, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8685644865036011, + "num_tokens": 494918142.0, + "step": 12976 + }, + { + "epoch": 1.6508077852690497, + "ewc_loss": 0.007911941036581993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911941065685824e-05, + "grad_norm": 3.917825222015381, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8744850754737854, + "num_tokens": 494956947.0, + "step": 12977 + }, + { + "epoch": 1.6509349955476402, + "ewc_loss": 0.007961129769682884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.961129449540749e-05, + "grad_norm": 3.9502320289611816, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8716914057731628, + "num_tokens": 494989850.0, + "step": 12978 + }, + { + "epoch": 
1.6510622058262308, + "ewc_loss": 0.007966551929712296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9665522207506e-05, + "grad_norm": 3.952467679977417, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.868603527545929, + "num_tokens": 495026770.0, + "step": 12979 + }, + { + "epoch": 1.6511894161048213, + "ewc_loss": 0.00796719454228878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.967194687807932e-05, + "grad_norm": 3.894653558731079, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8866507411003113, + "num_tokens": 495065009.0, + "step": 12980 + }, + { + "epoch": 1.6513166263834118, + "ewc_loss": 0.007920336909592152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.92033679317683e-05, + "grad_norm": 3.955763339996338, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8775265216827393, + "num_tokens": 495096914.0, + "step": 12981 + }, + { + "epoch": 1.6514438366620023, + "ewc_loss": 0.007997753098607063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997752982191741e-05, + "grad_norm": 3.889533281326294, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8801312446594238, + "num_tokens": 495138348.0, + "step": 12982 + }, + { + "epoch": 1.6515710469405929, + "ewc_loss": 0.007935313507914543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935313624329865e-05, + "grad_norm": 3.9093947410583496, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8687475919723511, + "num_tokens": 495177026.0, + "step": 12983 + }, + { + "epoch": 1.6516982572191834, + "ewc_loss": 0.007973643019795418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.973642641445622e-05, + "grad_norm": 3.9394795894622803, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8790389895439148, + "num_tokens": 495213850.0, + "step": 12984 + }, + { + "epoch": 1.651825467497774, + "ewc_loss": 0.007974290288984776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.974290201673284e-05, + "grad_norm": 3.8694818019866943, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8852472901344299, + "num_tokens": 495257213.0, + "step": 12985 + }, + { + "epoch": 1.6519526777763645, + "ewc_loss": 0.007916895672678947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.916895265225321e-05, + "grad_norm": 3.9088308811187744, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8657222390174866, + "num_tokens": 495300018.0, + "step": 12986 + }, + { + "epoch": 1.652079888054955, + "ewc_loss": 0.007942667230963707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94266743469052e-05, + "grad_norm": 3.9462761878967285, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8740429282188416, + "num_tokens": 495336471.0, + "step": 12987 + }, + { + "epoch": 1.6522070983335455, + "ewc_loss": 0.007955702021718025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.955701585160568e-05, + "grad_norm": 3.9106338024139404, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8791128396987915, + "num_tokens": 495374680.0, + "step": 12988 + }, + { + "epoch": 1.6523343086121358, + "ewc_loss": 0.007932155393064022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932155131129548e-05, + "grad_norm": 3.8988044261932373, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.877710223197937, + "num_tokens": 495414096.0, + "step": 12989 + }, + { + "epoch": 1.6524615188907263, + "ewc_loss": 0.007928083650767803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.928083505248651e-05, + "grad_norm": 3.9596946239471436, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8616538047790527, + "num_tokens": 495452955.0, + "step": 12990 + }, + { + "epoch": 1.6525887291693169, + "ewc_loss": 0.00797299575060606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.972995808813721e-05, + "grad_norm": 3.925128221511841, + "learning_rate": 1e-06, + "loss": 0.3631, + 
"mean_token_accuracy": 0.8727459907531738, + "num_tokens": 495490091.0, + "step": 12991 + }, + { + "epoch": 1.6527159394479074, + "ewc_loss": 0.007913254201412201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.913254376035184e-05, + "grad_norm": 3.9286649227142334, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8849973082542419, + "num_tokens": 495524543.0, + "step": 12992 + }, + { + "epoch": 1.652843149726498, + "ewc_loss": 0.007945427671074867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94542720541358e-05, + "grad_norm": 3.958988904953003, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8670564889907837, + "num_tokens": 495562068.0, + "step": 12993 + }, + { + "epoch": 1.6529703600050885, + "ewc_loss": 0.007953580468893051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953580643516034e-05, + "grad_norm": 3.9015750885009766, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8833798170089722, + "num_tokens": 495598355.0, + "step": 12994 + }, + { + "epoch": 1.6530975702836788, + "ewc_loss": 0.007917793467640877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.917793845990673e-05, + "grad_norm": 3.9378254413604736, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8628644943237305, + "num_tokens": 495634735.0, + "step": 12995 + }, + { + "epoch": 1.6532247805622693, + "ewc_loss": 0.00796798150986433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.967981218826026e-05, + "grad_norm": 3.989487648010254, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8762863874435425, + "num_tokens": 495668804.0, + "step": 12996 + }, + { + "epoch": 1.6533519908408598, + "ewc_loss": 0.007963585667312145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.963585812831298e-05, + "grad_norm": 3.9212050437927246, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8798898458480835, + "num_tokens": 495707129.0, + "step": 12997 + }, + { + "epoch": 
1.6534792011194503, + "ewc_loss": 0.007924718782305717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924718374852091e-05, + "grad_norm": 3.9403748512268066, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8566271066665649, + "num_tokens": 495746501.0, + "step": 12998 + }, + { + "epoch": 1.6536064113980409, + "ewc_loss": 0.007954086177051067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.95408632257022e-05, + "grad_norm": 3.877894401550293, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8922487497329712, + "num_tokens": 495789723.0, + "step": 12999 + }, + { + "epoch": 1.6537336216766314, + "ewc_loss": 0.007914121262729168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.914121670182794e-05, + "grad_norm": 3.9956204891204834, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8644469380378723, + "num_tokens": 495827073.0, + "step": 13000 + }, + { + "epoch": 1.653860831955222, + "ewc_loss": 0.008005990646779537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00599082140252e-05, + "grad_norm": 3.9780359268188477, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8663202524185181, + "num_tokens": 495862362.0, + "step": 13001 + }, + { + "epoch": 1.6539880422338125, + "ewc_loss": 0.007957328110933304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9573284892831e-05, + "grad_norm": 3.866086006164551, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8773166537284851, + "num_tokens": 495905546.0, + "step": 13002 + }, + { + "epoch": 1.654115252512403, + "ewc_loss": 0.007903040386736393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.903040386736393e-05, + "grad_norm": 3.919985055923462, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.874579668045044, + "num_tokens": 495942291.0, + "step": 13003 + }, + { + "epoch": 1.6542424627909935, + "ewc_loss": 0.007993146777153015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.993146573426202e-05, + "grad_norm": 3.964094877243042, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8662461638450623, + "num_tokens": 495978631.0, + "step": 13004 + }, + { + "epoch": 1.654369673069584, + "ewc_loss": 0.00799499824643135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.994998304639012e-05, + "grad_norm": 3.932593584060669, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.871152400970459, + "num_tokens": 496016626.0, + "step": 13005 + }, + { + "epoch": 1.6544968833481746, + "ewc_loss": 0.007966159842908382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.966160046635196e-05, + "grad_norm": 3.9457168579101562, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8691666126251221, + "num_tokens": 496051949.0, + "step": 13006 + }, + { + "epoch": 1.654624093626765, + "ewc_loss": 0.00798574648797512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.985746196936816e-05, + "grad_norm": 3.9025425910949707, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8819009065628052, + "num_tokens": 496092693.0, + "step": 13007 + }, + { + "epoch": 1.6547513039053556, + "ewc_loss": 0.007949837483465672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.949837890919298e-05, + "grad_norm": 3.9038169384002686, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8704837560653687, + "num_tokens": 496132853.0, + "step": 13008 + }, + { + "epoch": 1.6548785141839462, + "ewc_loss": 0.007984022609889507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.984022522578016e-05, + "grad_norm": 3.9815030097961426, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8616917133331299, + "num_tokens": 496168303.0, + "step": 13009 + }, + { + "epoch": 1.6550057244625367, + "ewc_loss": 0.008024661801755428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.024661656236276e-05, + "grad_norm": 3.90966534614563, + "learning_rate": 1e-06, + "loss": 0.3993, + 
"mean_token_accuracy": 0.864054799079895, + "num_tokens": 496206132.0, + "step": 13010 + }, + { + "epoch": 1.6551329347411272, + "ewc_loss": 0.007972405292093754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.97240572865121e-05, + "grad_norm": 3.9420766830444336, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8695577383041382, + "num_tokens": 496247542.0, + "step": 13011 + }, + { + "epoch": 1.6552601450197177, + "ewc_loss": 0.008021139539778233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02113936515525e-05, + "grad_norm": 3.9082162380218506, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.884251058101654, + "num_tokens": 496285698.0, + "step": 13012 + }, + { + "epoch": 1.655387355298308, + "ewc_loss": 0.007973747327923775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.973747415235266e-05, + "grad_norm": 3.9343695640563965, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8768728971481323, + "num_tokens": 496322216.0, + "step": 13013 + }, + { + "epoch": 1.6555145655768986, + "ewc_loss": 0.008006690070033073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.006690040929243e-05, + "grad_norm": 3.9777896404266357, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8738794326782227, + "num_tokens": 496357687.0, + "step": 13014 + }, + { + "epoch": 1.655641775855489, + "ewc_loss": 0.008017439395189285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.017439540708438e-05, + "grad_norm": 3.8753128051757812, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8877289891242981, + "num_tokens": 496400875.0, + "step": 13015 + }, + { + "epoch": 1.6557689861340796, + "ewc_loss": 0.00793043989688158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.930440187919885e-05, + "grad_norm": 4.020273208618164, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8791857957839966, + "num_tokens": 496432076.0, + "step": 13016 + }, + { + "epoch": 
1.6558961964126702, + "ewc_loss": 0.008055687882006168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055687794694677e-05, + "grad_norm": 3.971235752105713, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8725625872612, + "num_tokens": 496466852.0, + "step": 13017 + }, + { + "epoch": 1.6560234066912607, + "ewc_loss": 0.007953195832669735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953195745358244e-05, + "grad_norm": 3.9037280082702637, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8739190101623535, + "num_tokens": 496508890.0, + "step": 13018 + }, + { + "epoch": 1.6561506169698512, + "ewc_loss": 0.00794405210763216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.944052049424499e-05, + "grad_norm": 4.063209056854248, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8908549547195435, + "num_tokens": 496537964.0, + "step": 13019 + }, + { + "epoch": 1.6562778272484415, + "ewc_loss": 0.008073664270341396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.07366450317204e-05, + "grad_norm": 4.013474941253662, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8597321510314941, + "num_tokens": 496571263.0, + "step": 13020 + }, + { + "epoch": 1.656405037527032, + "ewc_loss": 0.007969561964273453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.969561556819826e-05, + "grad_norm": 3.9540324211120605, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.883097767829895, + "num_tokens": 496610058.0, + "step": 13021 + }, + { + "epoch": 1.6565322478056226, + "ewc_loss": 0.007966643199324608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.966643170220777e-05, + "grad_norm": 4.025059700012207, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8682882785797119, + "num_tokens": 496640542.0, + "step": 13022 + }, + { + "epoch": 1.656659458084213, + "ewc_loss": 0.008036350831389427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.036350482143462e-05, + "grad_norm": 3.9475882053375244, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8736370801925659, + "num_tokens": 496678030.0, + "step": 13023 + }, + { + "epoch": 1.6567866683628036, + "ewc_loss": 0.007959340699017048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.959340291563421e-05, + "grad_norm": 3.914766550064087, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.870540976524353, + "num_tokens": 496714057.0, + "step": 13024 + }, + { + "epoch": 1.6569138786413942, + "ewc_loss": 0.007979188114404678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.979188376339152e-05, + "grad_norm": 3.947160243988037, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8907527923583984, + "num_tokens": 496744501.0, + "step": 13025 + }, + { + "epoch": 1.6570410889199847, + "ewc_loss": 0.008015140891075134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.015141065698117e-05, + "grad_norm": 3.941537857055664, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8804941177368164, + "num_tokens": 496780233.0, + "step": 13026 + }, + { + "epoch": 1.6571682991985752, + "ewc_loss": 0.008014081977307796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.014082413865253e-05, + "grad_norm": 4.0051374435424805, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8753073215484619, + "num_tokens": 496809645.0, + "step": 13027 + }, + { + "epoch": 1.6572955094771658, + "ewc_loss": 0.008069467730820179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069467730820179e-05, + "grad_norm": 3.900770664215088, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8768895864486694, + "num_tokens": 496854871.0, + "step": 13028 + }, + { + "epoch": 1.6574227197557563, + "ewc_loss": 0.007979176007211208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.979176007211208e-05, + "grad_norm": 3.903841495513916, + "learning_rate": 1e-06, + "loss": 0.3158, + 
"mean_token_accuracy": 0.8909874558448792, + "num_tokens": 496891958.0, + "step": 13029 + }, + { + "epoch": 1.6575499300343468, + "ewc_loss": 0.008028095588088036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02809590823017e-05, + "grad_norm": 3.9586963653564453, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8670191764831543, + "num_tokens": 496928550.0, + "step": 13030 + }, + { + "epoch": 1.6576771403129373, + "ewc_loss": 0.00804258231073618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.04258233984001e-05, + "grad_norm": 3.872490644454956, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8742781281471252, + "num_tokens": 496972009.0, + "step": 13031 + }, + { + "epoch": 1.6578043505915279, + "ewc_loss": 0.00798303633928299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.983036630321294e-05, + "grad_norm": 3.918590545654297, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8737370371818542, + "num_tokens": 497013605.0, + "step": 13032 + }, + { + "epoch": 1.6579315608701184, + "ewc_loss": 0.008026129566133022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026129216887057e-05, + "grad_norm": 3.9660065174102783, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8773097991943359, + "num_tokens": 497048712.0, + "step": 13033 + }, + { + "epoch": 1.658058771148709, + "ewc_loss": 0.008033184334635735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03318471298553e-05, + "grad_norm": 3.8876607418060303, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8882344961166382, + "num_tokens": 497089161.0, + "step": 13034 + }, + { + "epoch": 1.6581859814272994, + "ewc_loss": 0.007960401475429535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96040112618357e-05, + "grad_norm": 4.005512714385986, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8471996784210205, + "num_tokens": 497127253.0, + "step": 13035 + }, + { + "epoch": 
1.65831319170589, + "ewc_loss": 0.00806216150522232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.062161941779777e-05, + "grad_norm": 3.8854339122772217, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8743351697921753, + "num_tokens": 497167637.0, + "step": 13036 + }, + { + "epoch": 1.6584404019844805, + "ewc_loss": 0.007928609848022461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.928610284579918e-05, + "grad_norm": 3.910162925720215, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8748056888580322, + "num_tokens": 497205663.0, + "step": 13037 + }, + { + "epoch": 1.6585676122630708, + "ewc_loss": 0.007994935847818851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99493573140353e-05, + "grad_norm": 3.9240102767944336, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8924067616462708, + "num_tokens": 497242146.0, + "step": 13038 + }, + { + "epoch": 1.6586948225416613, + "ewc_loss": 0.0079865213483572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.986521814018488e-05, + "grad_norm": 3.9587154388427734, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8785048723220825, + "num_tokens": 497279339.0, + "step": 13039 + }, + { + "epoch": 1.6588220328202519, + "ewc_loss": 0.007980200462043285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.980200462043285e-05, + "grad_norm": 3.9230844974517822, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8666045069694519, + "num_tokens": 497318530.0, + "step": 13040 + }, + { + "epoch": 1.6589492430988424, + "ewc_loss": 0.007953094318509102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953094609547406e-05, + "grad_norm": 3.9954097270965576, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8687682151794434, + "num_tokens": 497352943.0, + "step": 13041 + }, + { + "epoch": 1.659076453377433, + "ewc_loss": 0.008022619411349297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.022619294933975e-05, + "grad_norm": 3.9950482845306396, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8697953224182129, + "num_tokens": 497387729.0, + "step": 13042 + }, + { + "epoch": 1.6592036636560235, + "ewc_loss": 0.007969684898853302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.969685248099267e-05, + "grad_norm": 3.8998022079467773, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8765751719474792, + "num_tokens": 497427875.0, + "step": 13043 + }, + { + "epoch": 1.6593308739346138, + "ewc_loss": 0.007917620241641998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.917620678199455e-05, + "grad_norm": 3.929360866546631, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8694939613342285, + "num_tokens": 497469076.0, + "step": 13044 + }, + { + "epoch": 1.6594580842132043, + "ewc_loss": 0.007971318438649178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.97131797298789e-05, + "grad_norm": 3.8800160884857178, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8767408132553101, + "num_tokens": 497509556.0, + "step": 13045 + }, + { + "epoch": 1.6595852944917948, + "ewc_loss": 0.007937201298773289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.937201007734984e-05, + "grad_norm": 3.971729278564453, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8684340715408325, + "num_tokens": 497548151.0, + "step": 13046 + }, + { + "epoch": 1.6597125047703853, + "ewc_loss": 0.008007245138287544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007245196495205e-05, + "grad_norm": 3.98330020904541, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8501110076904297, + "num_tokens": 497589088.0, + "step": 13047 + }, + { + "epoch": 1.6598397150489759, + "ewc_loss": 0.007979823276400566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.979822839843109e-05, + "grad_norm": 3.8615925312042236, + "learning_rate": 1e-06, + "loss": 0.3453, + 
"mean_token_accuracy": 0.8831005096435547, + "num_tokens": 497632095.0, + "step": 13048 + }, + { + "epoch": 1.6599669253275664, + "ewc_loss": 0.007919988594949245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.919989002402872e-05, + "grad_norm": 3.9476418495178223, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8861805200576782, + "num_tokens": 497665148.0, + "step": 13049 + }, + { + "epoch": 1.660094135606157, + "ewc_loss": 0.008011817000806332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.011817408259958e-05, + "grad_norm": 4.025676727294922, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.869086742401123, + "num_tokens": 497700134.0, + "step": 13050 + }, + { + "epoch": 1.6602213458847475, + "ewc_loss": 0.008013738319277763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.013738261070102e-05, + "grad_norm": 3.8854448795318604, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8804917335510254, + "num_tokens": 497743101.0, + "step": 13051 + }, + { + "epoch": 1.660348556163338, + "ewc_loss": 0.007915590889751911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.915590686025098e-05, + "grad_norm": 3.999803304672241, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8678197860717773, + "num_tokens": 497775112.0, + "step": 13052 + }, + { + "epoch": 1.6604757664419285, + "ewc_loss": 0.00803576223552227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035761857172474e-05, + "grad_norm": 3.875574827194214, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8862594962120056, + "num_tokens": 497813320.0, + "step": 13053 + }, + { + "epoch": 1.660602976720519, + "ewc_loss": 0.007915315218269825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.915315654827282e-05, + "grad_norm": 3.9415814876556396, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8699340224266052, + "num_tokens": 497852780.0, + "step": 13054 + }, + { + "epoch": 
1.6607301869991096, + "ewc_loss": 0.007990183308720589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.990183075889945e-05, + "grad_norm": 3.9924497604370117, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8761231303215027, + "num_tokens": 497886313.0, + "step": 13055 + }, + { + "epoch": 1.6608573972777, + "ewc_loss": 0.00799417681992054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99417684902437e-05, + "grad_norm": 3.921022415161133, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8725112676620483, + "num_tokens": 497924108.0, + "step": 13056 + }, + { + "epoch": 1.6609846075562906, + "ewc_loss": 0.007938436232507229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.938436465337873e-05, + "grad_norm": 3.929154872894287, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8649716973304749, + "num_tokens": 497964656.0, + "step": 13057 + }, + { + "epoch": 1.6611118178348812, + "ewc_loss": 0.007953446358442307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953446765895933e-05, + "grad_norm": 3.928117513656616, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8620567321777344, + "num_tokens": 498005436.0, + "step": 13058 + }, + { + "epoch": 1.6612390281134717, + "ewc_loss": 0.007962092757225037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.962092786328867e-05, + "grad_norm": 3.958885431289673, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8720257878303528, + "num_tokens": 498042237.0, + "step": 13059 + }, + { + "epoch": 1.6613662383920622, + "ewc_loss": 0.007971452549099922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.971452578203753e-05, + "grad_norm": 3.8865246772766113, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8747756481170654, + "num_tokens": 498083008.0, + "step": 13060 + }, + { + "epoch": 1.6614934486706527, + "ewc_loss": 0.00792554672807455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.925546378828585e-05, + "grad_norm": 4.033590793609619, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8529856204986572, + "num_tokens": 498120550.0, + "step": 13061 + }, + { + "epoch": 1.661620658949243, + "ewc_loss": 0.008028505370020866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.028505544643849e-05, + "grad_norm": 3.9237780570983887, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8738117814064026, + "num_tokens": 498153033.0, + "step": 13062 + }, + { + "epoch": 1.6617478692278336, + "ewc_loss": 0.007914042100310326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.914041634649038e-05, + "grad_norm": 3.9119482040405273, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8780957460403442, + "num_tokens": 498190491.0, + "step": 13063 + }, + { + "epoch": 1.661875079506424, + "ewc_loss": 0.007961345836520195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96134554548189e-05, + "grad_norm": 3.876030206680298, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8683922290802002, + "num_tokens": 498235889.0, + "step": 13064 + }, + { + "epoch": 1.6620022897850146, + "ewc_loss": 0.007955058477818966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.955058390507475e-05, + "grad_norm": 3.920863151550293, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8725126385688782, + "num_tokens": 498277191.0, + "step": 13065 + }, + { + "epoch": 1.6621295000636052, + "ewc_loss": 0.007972299121320248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.972299499670044e-05, + "grad_norm": 3.937537908554077, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.865059494972229, + "num_tokens": 498318443.0, + "step": 13066 + }, + { + "epoch": 1.6622567103421957, + "ewc_loss": 0.0079438341781497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.943833770696074e-05, + "grad_norm": 3.9775402545928955, + "learning_rate": 1e-06, + "loss": 0.3969, + 
"mean_token_accuracy": 0.8635588884353638, + "num_tokens": 498355955.0, + "step": 13067 + }, + { + "epoch": 1.662383920620786, + "ewc_loss": 0.007961937226355076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.961937080835924e-05, + "grad_norm": 3.9562904834747314, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8754795789718628, + "num_tokens": 498388588.0, + "step": 13068 + }, + { + "epoch": 1.6625111308993765, + "ewc_loss": 0.007925248704850674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.925248792162165e-05, + "grad_norm": 3.937596559524536, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8737123608589172, + "num_tokens": 498430661.0, + "step": 13069 + }, + { + "epoch": 1.662638341177967, + "ewc_loss": 0.007931364700198174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.931364234536886e-05, + "grad_norm": 3.9273860454559326, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8640392422676086, + "num_tokens": 498470428.0, + "step": 13070 + }, + { + "epoch": 1.6627655514565576, + "ewc_loss": 0.007931497879326344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.931498112156987e-05, + "grad_norm": 3.910456657409668, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8740425109863281, + "num_tokens": 498512031.0, + "step": 13071 + }, + { + "epoch": 1.662892761735148, + "ewc_loss": 0.007911675609648228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911675493232906e-05, + "grad_norm": 3.9515984058380127, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8955299854278564, + "num_tokens": 498548344.0, + "step": 13072 + }, + { + "epoch": 1.6630199720137386, + "ewc_loss": 0.007942548021674156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.942548108985648e-05, + "grad_norm": 3.8737382888793945, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.888291597366333, + "num_tokens": 498590115.0, + "step": 13073 + }, + { + "epoch": 
1.6631471822923292, + "ewc_loss": 0.007876724004745483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.876723975641653e-05, + "grad_norm": 3.8850982189178467, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8761650323867798, + "num_tokens": 498634034.0, + "step": 13074 + }, + { + "epoch": 1.6632743925709197, + "ewc_loss": 0.00791056640446186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.910566637292504e-05, + "grad_norm": 3.949061632156372, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8665483593940735, + "num_tokens": 498675086.0, + "step": 13075 + }, + { + "epoch": 1.6634016028495102, + "ewc_loss": 0.007938137277960777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.93813742347993e-05, + "grad_norm": 3.9207000732421875, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8819675445556641, + "num_tokens": 498711374.0, + "step": 13076 + }, + { + "epoch": 1.6635288131281007, + "ewc_loss": 0.00787733867764473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.877338794060051e-05, + "grad_norm": 3.8919854164123535, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8796828389167786, + "num_tokens": 498753050.0, + "step": 13077 + }, + { + "epoch": 1.6636560234066913, + "ewc_loss": 0.007876496762037277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.87649696576409e-05, + "grad_norm": 3.944753408432007, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8770627975463867, + "num_tokens": 498790406.0, + "step": 13078 + }, + { + "epoch": 1.6637832336852818, + "ewc_loss": 0.007908281870186329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.908281986601651e-05, + "grad_norm": 3.912487030029297, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8725305795669556, + "num_tokens": 498827712.0, + "step": 13079 + }, + { + "epoch": 1.6639104439638723, + "ewc_loss": 0.007854669354856014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.85466909292154e-05, + "grad_norm": 3.9784908294677734, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8795187473297119, + "num_tokens": 498862283.0, + "step": 13080 + }, + { + "epoch": 1.6640376542424629, + "ewc_loss": 0.007919029332697392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.919029303593561e-05, + "grad_norm": 3.872872829437256, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8772829174995422, + "num_tokens": 498902686.0, + "step": 13081 + }, + { + "epoch": 1.6641648645210534, + "ewc_loss": 0.00783499889075756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.834999269107357e-05, + "grad_norm": 3.9482502937316895, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8648776412010193, + "num_tokens": 498942785.0, + "step": 13082 + }, + { + "epoch": 1.664292074799644, + "ewc_loss": 0.007911317050457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911317516118288e-05, + "grad_norm": 3.91033935546875, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8799996376037598, + "num_tokens": 498982761.0, + "step": 13083 + }, + { + "epoch": 1.6644192850782344, + "ewc_loss": 0.007881567813456059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.881567580625415e-05, + "grad_norm": 3.907268524169922, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8566721081733704, + "num_tokens": 499025762.0, + "step": 13084 + }, + { + "epoch": 1.664546495356825, + "ewc_loss": 0.007887275889515877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.887275569373742e-05, + "grad_norm": 3.9497885704040527, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8661767840385437, + "num_tokens": 499063394.0, + "step": 13085 + }, + { + "epoch": 1.6646737056354155, + "ewc_loss": 0.007905540987849236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.905541133368388e-05, + "grad_norm": 3.845088481903076, + "learning_rate": 1e-06, + "loss": 0.3359, + 
"mean_token_accuracy": 0.8842589259147644, + "num_tokens": 499105824.0, + "step": 13086 + }, + { + "epoch": 1.6648009159140058, + "ewc_loss": 0.007842675782740116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.84267540439032e-05, + "grad_norm": 3.958827495574951, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8840861320495605, + "num_tokens": 499140935.0, + "step": 13087 + }, + { + "epoch": 1.6649281261925963, + "ewc_loss": 0.007948373444378376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.948373240651563e-05, + "grad_norm": 3.928924560546875, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8733364343643188, + "num_tokens": 499179492.0, + "step": 13088 + }, + { + "epoch": 1.6650553364711869, + "ewc_loss": 0.00786742102354765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.867420936236158e-05, + "grad_norm": 3.9317667484283447, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8852822780609131, + "num_tokens": 499214716.0, + "step": 13089 + }, + { + "epoch": 1.6651825467497774, + "ewc_loss": 0.007901345379650593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90134581620805e-05, + "grad_norm": 3.9206953048706055, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8820027112960815, + "num_tokens": 499250337.0, + "step": 13090 + }, + { + "epoch": 1.665309757028368, + "ewc_loss": 0.007886615581810474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.886615640018135e-05, + "grad_norm": 3.969378709793091, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8793364763259888, + "num_tokens": 499283675.0, + "step": 13091 + }, + { + "epoch": 1.6654369673069584, + "ewc_loss": 0.007919605821371078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.919606287032366e-05, + "grad_norm": 3.948908805847168, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8802968263626099, + "num_tokens": 499319153.0, + "step": 13092 + }, + { + "epoch": 
1.6655641775855488, + "ewc_loss": 0.007892048917710781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.892049325164407e-05, + "grad_norm": 3.9373950958251953, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8726477026939392, + "num_tokens": 499356675.0, + "step": 13093 + }, + { + "epoch": 1.6656913878641393, + "ewc_loss": 0.007920322008430958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.920322241261601e-05, + "grad_norm": 3.9380412101745605, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8890743255615234, + "num_tokens": 499392129.0, + "step": 13094 + }, + { + "epoch": 1.6658185981427298, + "ewc_loss": 0.007915761321783066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91576094343327e-05, + "grad_norm": 3.8970565795898438, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.885261058807373, + "num_tokens": 499424958.0, + "step": 13095 + }, + { + "epoch": 1.6659458084213203, + "ewc_loss": 0.007912960834801197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.912961154943332e-05, + "grad_norm": 3.898686408996582, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.882121205329895, + "num_tokens": 499467769.0, + "step": 13096 + }, + { + "epoch": 1.6660730186999109, + "ewc_loss": 0.007914776913821697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.914776506368071e-05, + "grad_norm": 3.896805763244629, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8671177625656128, + "num_tokens": 499513828.0, + "step": 13097 + }, + { + "epoch": 1.6662002289785014, + "ewc_loss": 0.007912401109933853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91240090620704e-05, + "grad_norm": 3.9757940769195557, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8799037933349609, + "num_tokens": 499548219.0, + "step": 13098 + }, + { + "epoch": 1.666327439257092, + "ewc_loss": 0.007941152900457382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.941152580315247e-05, + "grad_norm": 3.953127145767212, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8591272234916687, + "num_tokens": 499587158.0, + "step": 13099 + }, + { + "epoch": 1.6664546495356825, + "ewc_loss": 0.007912473753094673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.912473665783182e-05, + "grad_norm": 3.9328513145446777, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8768829107284546, + "num_tokens": 499625235.0, + "step": 13100 + }, + { + "epoch": 1.666581859814273, + "ewc_loss": 0.007906001061201096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.906000973889604e-05, + "grad_norm": 3.955430269241333, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8577638864517212, + "num_tokens": 499662842.0, + "step": 13101 + }, + { + "epoch": 1.6667090700928635, + "ewc_loss": 0.00792845617979765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.928456034278497e-05, + "grad_norm": 3.9478952884674072, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8529860377311707, + "num_tokens": 499703464.0, + "step": 13102 + }, + { + "epoch": 1.666836280371454, + "ewc_loss": 0.007931690663099289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.931690925033763e-05, + "grad_norm": 3.9571633338928223, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8728095889091492, + "num_tokens": 499739160.0, + "step": 13103 + }, + { + "epoch": 1.6669634906500446, + "ewc_loss": 0.007957645691931248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.957645721035078e-05, + "grad_norm": 3.9779930114746094, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8626178503036499, + "num_tokens": 499775024.0, + "step": 13104 + }, + { + "epoch": 1.667090700928635, + "ewc_loss": 0.007958618924021721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.958618516568094e-05, + "grad_norm": 3.9239935874938965, + "learning_rate": 1e-06, + "loss": 0.3685, + 
"mean_token_accuracy": 0.8772826790809631, + "num_tokens": 499816479.0, + "step": 13105 + }, + { + "epoch": 1.6672179112072256, + "ewc_loss": 0.007925909012556076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.925909449113533e-05, + "grad_norm": 3.9911911487579346, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8762952089309692, + "num_tokens": 499848790.0, + "step": 13106 + }, + { + "epoch": 1.6673451214858162, + "ewc_loss": 0.007992465980350971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992465543793514e-05, + "grad_norm": 3.8469409942626953, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8934892416000366, + "num_tokens": 499890955.0, + "step": 13107 + }, + { + "epoch": 1.6674723317644067, + "ewc_loss": 0.007899888791143894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.899888441897929e-05, + "grad_norm": 3.960172653198242, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8842476606369019, + "num_tokens": 499924813.0, + "step": 13108 + }, + { + "epoch": 1.6675995420429972, + "ewc_loss": 0.00802087876945734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.020878885872662e-05, + "grad_norm": 3.9530696868896484, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8713693618774414, + "num_tokens": 499960032.0, + "step": 13109 + }, + { + "epoch": 1.6677267523215877, + "ewc_loss": 0.007975318469107151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.975318294484168e-05, + "grad_norm": 3.915447235107422, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8894568085670471, + "num_tokens": 499995868.0, + "step": 13110 + }, + { + "epoch": 1.667853962600178, + "ewc_loss": 0.007951238192617893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.951238512760028e-05, + "grad_norm": 3.923581123352051, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8792425394058228, + "num_tokens": 500031662.0, + "step": 13111 + }, + { + "epoch": 
1.6679811728787686, + "ewc_loss": 0.007974017411470413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.974017353262752e-05, + "grad_norm": 3.955474853515625, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8659778833389282, + "num_tokens": 500070189.0, + "step": 13112 + }, + { + "epoch": 1.668108383157359, + "ewc_loss": 0.007986302487552166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.98630208009854e-05, + "grad_norm": 3.9062018394470215, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8731138706207275, + "num_tokens": 500109999.0, + "step": 13113 + }, + { + "epoch": 1.6682355934359496, + "ewc_loss": 0.007938441820442677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.938441558508202e-05, + "grad_norm": 3.936758041381836, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.859387993812561, + "num_tokens": 500151473.0, + "step": 13114 + }, + { + "epoch": 1.6683628037145402, + "ewc_loss": 0.008003388531506062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003388938959688e-05, + "grad_norm": 3.9481048583984375, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8747831583023071, + "num_tokens": 500187436.0, + "step": 13115 + }, + { + "epoch": 1.6684900139931307, + "ewc_loss": 0.007976680994033813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976681081345305e-05, + "grad_norm": 3.960495948791504, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8751963973045349, + "num_tokens": 500222433.0, + "step": 13116 + }, + { + "epoch": 1.668617224271721, + "ewc_loss": 0.00797817949205637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.978179928613827e-05, + "grad_norm": 3.944812536239624, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8755237460136414, + "num_tokens": 500257390.0, + "step": 13117 + }, + { + "epoch": 1.6687444345503115, + "ewc_loss": 0.00798480212688446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.984801777638495e-05, + "grad_norm": 3.925305128097534, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8751102685928345, + "num_tokens": 500301260.0, + "step": 13118 + }, + { + "epoch": 1.668871644828902, + "ewc_loss": 0.007942679338157177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.942679076222703e-05, + "grad_norm": 3.910667657852173, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8799072504043579, + "num_tokens": 500343453.0, + "step": 13119 + }, + { + "epoch": 1.6689988551074926, + "ewc_loss": 0.007953395135700703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953395106596872e-05, + "grad_norm": 3.9930601119995117, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8821649551391602, + "num_tokens": 500375133.0, + "step": 13120 + }, + { + "epoch": 1.669126065386083, + "ewc_loss": 0.008000727742910385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00072739366442e-05, + "grad_norm": 3.964279890060425, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8709539175033569, + "num_tokens": 500408540.0, + "step": 13121 + }, + { + "epoch": 1.6692532756646736, + "ewc_loss": 0.007954606786370277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.954606553539634e-05, + "grad_norm": 3.895073175430298, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8873685598373413, + "num_tokens": 500446243.0, + "step": 13122 + }, + { + "epoch": 1.6693804859432642, + "ewc_loss": 0.007918449118733406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91844940977171e-05, + "grad_norm": 3.9725582599639893, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8496447205543518, + "num_tokens": 500484313.0, + "step": 13123 + }, + { + "epoch": 1.6695076962218547, + "ewc_loss": 0.008005276322364807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.005276322364807e-05, + "grad_norm": 3.9041872024536133, + "learning_rate": 1e-06, + "loss": 0.3346, + 
"mean_token_accuracy": 0.8873664140701294, + "num_tokens": 500525566.0, + "step": 13124 + }, + { + "epoch": 1.6696349065004452, + "ewc_loss": 0.007924845442175865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924844976514578e-05, + "grad_norm": 3.893960952758789, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8810942769050598, + "num_tokens": 500566274.0, + "step": 13125 + }, + { + "epoch": 1.6697621167790357, + "ewc_loss": 0.007932811044156551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932810694910586e-05, + "grad_norm": 3.948115587234497, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8690096139907837, + "num_tokens": 500602769.0, + "step": 13126 + }, + { + "epoch": 1.6698893270576263, + "ewc_loss": 0.00796282384544611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.962824020069093e-05, + "grad_norm": 3.94939923286438, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8595603704452515, + "num_tokens": 500642895.0, + "step": 13127 + }, + { + "epoch": 1.6700165373362168, + "ewc_loss": 0.007933486253023148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.933486631372944e-05, + "grad_norm": 3.9270782470703125, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8739398717880249, + "num_tokens": 500681461.0, + "step": 13128 + }, + { + "epoch": 1.6701437476148073, + "ewc_loss": 0.007953640073537827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.95364030636847e-05, + "grad_norm": 3.960310220718384, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8590730428695679, + "num_tokens": 500723395.0, + "step": 13129 + }, + { + "epoch": 1.6702709578933979, + "ewc_loss": 0.007949414663016796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.949414430186152e-05, + "grad_norm": 3.8902134895324707, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8773093223571777, + "num_tokens": 500760571.0, + "step": 13130 + }, + { + "epoch": 
1.6703981681719884, + "ewc_loss": 0.007904063910245895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.904064113972709e-05, + "grad_norm": 3.9462454319000244, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8681669235229492, + "num_tokens": 500797858.0, + "step": 13131 + }, + { + "epoch": 1.670525378450579, + "ewc_loss": 0.007976633496582508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976633060025051e-05, + "grad_norm": 3.8721842765808105, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8806817531585693, + "num_tokens": 500840199.0, + "step": 13132 + }, + { + "epoch": 1.6706525887291694, + "ewc_loss": 0.007924052886664867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924052624730393e-05, + "grad_norm": 3.919006824493408, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8705346584320068, + "num_tokens": 500883290.0, + "step": 13133 + }, + { + "epoch": 1.67077979900776, + "ewc_loss": 0.007956355810165405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.956355693750083e-05, + "grad_norm": 4.001486778259277, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8783976435661316, + "num_tokens": 500923072.0, + "step": 13134 + }, + { + "epoch": 1.6709070092863505, + "ewc_loss": 0.007977678440511227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.97767861513421e-05, + "grad_norm": 3.942281484603882, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8800482153892517, + "num_tokens": 500958726.0, + "step": 13135 + }, + { + "epoch": 1.6710342195649408, + "ewc_loss": 0.007918450981378555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.918450864963233e-05, + "grad_norm": 3.9454288482666016, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8733206987380981, + "num_tokens": 500996746.0, + "step": 13136 + }, + { + "epoch": 1.6711614298435313, + "ewc_loss": 0.007929040119051933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.929040293674916e-05, + "grad_norm": 3.9353623390197754, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8684581518173218, + "num_tokens": 501034135.0, + "step": 13137 + }, + { + "epoch": 1.6712886401221219, + "ewc_loss": 0.007911719381809235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911719148978591e-05, + "grad_norm": 3.9421064853668213, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8778811693191528, + "num_tokens": 501069810.0, + "step": 13138 + }, + { + "epoch": 1.6714158504007124, + "ewc_loss": 0.007935795933008194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935796020319685e-05, + "grad_norm": 3.928194999694824, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8696374893188477, + "num_tokens": 501110597.0, + "step": 13139 + }, + { + "epoch": 1.671543060679303, + "ewc_loss": 0.007912235334515572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.912235741969198e-05, + "grad_norm": 3.968216896057129, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8651249408721924, + "num_tokens": 501145927.0, + "step": 13140 + }, + { + "epoch": 1.6716702709578934, + "ewc_loss": 0.007981544360518456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.981544331414625e-05, + "grad_norm": 3.953526258468628, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8566883206367493, + "num_tokens": 501188473.0, + "step": 13141 + }, + { + "epoch": 1.6717974812364838, + "ewc_loss": 0.007940884679555893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.940884825075045e-05, + "grad_norm": 3.9019789695739746, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8864723443984985, + "num_tokens": 501226143.0, + "step": 13142 + }, + { + "epoch": 1.6719246915150743, + "ewc_loss": 0.007933801971375942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.933801680337638e-05, + "grad_norm": 3.934119939804077, + "learning_rate": 1e-06, + "loss": 0.3903, + 
"mean_token_accuracy": 0.8653614521026611, + "num_tokens": 501266980.0, + "step": 13143 + }, + { + "epoch": 1.6720519017936648, + "ewc_loss": 0.007970599457621574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.970599108375609e-05, + "grad_norm": 3.9518353939056396, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8719977736473083, + "num_tokens": 501305414.0, + "step": 13144 + }, + { + "epoch": 1.6721791120722553, + "ewc_loss": 0.007951534353196621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.951533916639164e-05, + "grad_norm": 3.977107048034668, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8669599294662476, + "num_tokens": 501343009.0, + "step": 13145 + }, + { + "epoch": 1.6723063223508459, + "ewc_loss": 0.007977033965289593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977033965289593e-05, + "grad_norm": 3.909198045730591, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8802920579910278, + "num_tokens": 501383571.0, + "step": 13146 + }, + { + "epoch": 1.6724335326294364, + "ewc_loss": 0.007941307500004768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.941307558212429e-05, + "grad_norm": 3.923524856567383, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8766399025917053, + "num_tokens": 501425444.0, + "step": 13147 + }, + { + "epoch": 1.672560742908027, + "ewc_loss": 0.0079531604424119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953160093165934e-05, + "grad_norm": 3.8916406631469727, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8851886987686157, + "num_tokens": 501466536.0, + "step": 13148 + }, + { + "epoch": 1.6726879531866174, + "ewc_loss": 0.00793752446770668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.937524787848815e-05, + "grad_norm": 3.977008819580078, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8769155144691467, + "num_tokens": 501507346.0, + "step": 13149 + }, + { + "epoch": 
1.672815163465208, + "ewc_loss": 0.007993797771632671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.993797771632671e-05, + "grad_norm": 3.9361672401428223, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8823121786117554, + "num_tokens": 501545515.0, + "step": 13150 + }, + { + "epoch": 1.6729423737437985, + "ewc_loss": 0.007955146953463554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.955147157190368e-05, + "grad_norm": 3.937446117401123, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8679784536361694, + "num_tokens": 501588793.0, + "step": 13151 + }, + { + "epoch": 1.673069584022389, + "ewc_loss": 0.007938782684504986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.938782800920308e-05, + "grad_norm": 3.940671682357788, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8730307221412659, + "num_tokens": 501624384.0, + "step": 13152 + }, + { + "epoch": 1.6731967943009796, + "ewc_loss": 0.007944772019982338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.944772369228303e-05, + "grad_norm": 3.988481044769287, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8747996687889099, + "num_tokens": 501663150.0, + "step": 13153 + }, + { + "epoch": 1.67332400457957, + "ewc_loss": 0.00796547718346119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.965477561810985e-05, + "grad_norm": 3.887763261795044, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.867713451385498, + "num_tokens": 501706177.0, + "step": 13154 + }, + { + "epoch": 1.6734512148581606, + "ewc_loss": 0.007897443138062954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.897442992543802e-05, + "grad_norm": 3.893362045288086, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8869293928146362, + "num_tokens": 501745267.0, + "step": 13155 + }, + { + "epoch": 1.6735784251367511, + "ewc_loss": 0.00793522223830223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.935222674859688e-05, + "grad_norm": 3.957864284515381, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8701881170272827, + "num_tokens": 501783591.0, + "step": 13156 + }, + { + "epoch": 1.6737056354153417, + "ewc_loss": 0.007955619134008884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.955619366839528e-05, + "grad_norm": 3.941162347793579, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8663164377212524, + "num_tokens": 501825190.0, + "step": 13157 + }, + { + "epoch": 1.6738328456939322, + "ewc_loss": 0.00791907124221325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.919070776551962e-05, + "grad_norm": 4.004361629486084, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8759765625, + "num_tokens": 501855813.0, + "step": 13158 + }, + { + "epoch": 1.6739600559725227, + "ewc_loss": 0.007958666421473026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.958666537888348e-05, + "grad_norm": 3.9653937816619873, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8690200448036194, + "num_tokens": 501890106.0, + "step": 13159 + }, + { + "epoch": 1.674087266251113, + "ewc_loss": 0.007936349138617516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.936348993098363e-05, + "grad_norm": 3.9131572246551514, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8613998293876648, + "num_tokens": 501935437.0, + "step": 13160 + }, + { + "epoch": 1.6742144765297036, + "ewc_loss": 0.007909848354756832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.909848500275984e-05, + "grad_norm": 3.8980562686920166, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8624637126922607, + "num_tokens": 501980134.0, + "step": 13161 + }, + { + "epoch": 1.674341686808294, + "ewc_loss": 0.00791008397936821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.910084241302684e-05, + "grad_norm": 3.9571170806884766, + "learning_rate": 1e-06, + "loss": 0.3822, + 
"mean_token_accuracy": 0.868720293045044, + "num_tokens": 502015977.0, + "step": 13162 + }, + { + "epoch": 1.6744688970868846, + "ewc_loss": 0.007964197546243668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96419772086665e-05, + "grad_norm": 3.8947904109954834, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8781134486198425, + "num_tokens": 502061717.0, + "step": 13163 + }, + { + "epoch": 1.6745961073654752, + "ewc_loss": 0.007891670800745487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89167097536847e-05, + "grad_norm": 3.9282429218292236, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8730606436729431, + "num_tokens": 502101594.0, + "step": 13164 + }, + { + "epoch": 1.6747233176440657, + "ewc_loss": 0.007922177202999592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.922177610453218e-05, + "grad_norm": 3.9352431297302246, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8743770122528076, + "num_tokens": 502136521.0, + "step": 13165 + }, + { + "epoch": 1.674850527922656, + "ewc_loss": 0.007932741194963455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.93274084571749e-05, + "grad_norm": 3.937025308609009, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8573460578918457, + "num_tokens": 502174821.0, + "step": 13166 + }, + { + "epoch": 1.6749777382012465, + "ewc_loss": 0.007928119972348213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.928119885036722e-05, + "grad_norm": 3.9551424980163574, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8679477572441101, + "num_tokens": 502210615.0, + "step": 13167 + }, + { + "epoch": 1.675104948479837, + "ewc_loss": 0.007952908053994179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952907617436722e-05, + "grad_norm": 3.922811269760132, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8745763301849365, + "num_tokens": 502248220.0, + "step": 13168 + }, + { + "epoch": 
1.6752321587584276, + "ewc_loss": 0.007935085333883762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.93508515926078e-05, + "grad_norm": 3.890423536300659, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8742071390151978, + "num_tokens": 502290686.0, + "step": 13169 + }, + { + "epoch": 1.675359369037018, + "ewc_loss": 0.007913063280284405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91306301834993e-05, + "grad_norm": 3.9188406467437744, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8630083203315735, + "num_tokens": 502331789.0, + "step": 13170 + }, + { + "epoch": 1.6754865793156086, + "ewc_loss": 0.007971866987645626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.971867307787761e-05, + "grad_norm": 3.935944080352783, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8807830214500427, + "num_tokens": 502371490.0, + "step": 13171 + }, + { + "epoch": 1.6756137895941992, + "ewc_loss": 0.007946077734231949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.946077676024288e-05, + "grad_norm": 3.8883261680603027, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8734942674636841, + "num_tokens": 502415606.0, + "step": 13172 + }, + { + "epoch": 1.6757409998727897, + "ewc_loss": 0.007929093204438686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9290934081655e-05, + "grad_norm": 3.9269542694091797, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8731786012649536, + "num_tokens": 502456203.0, + "step": 13173 + }, + { + "epoch": 1.6758682101513802, + "ewc_loss": 0.00795069895684719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.950699364300817e-05, + "grad_norm": 3.916348934173584, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8913337588310242, + "num_tokens": 502491508.0, + "step": 13174 + }, + { + "epoch": 1.6759954204299707, + "ewc_loss": 0.007923237048089504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.923236989881843e-05, + "grad_norm": 3.930917978286743, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.863426923751831, + "num_tokens": 502533408.0, + "step": 13175 + }, + { + "epoch": 1.6761226307085613, + "ewc_loss": 0.007955229841172695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.955230103107169e-05, + "grad_norm": 3.9896292686462402, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8679488897323608, + "num_tokens": 502569824.0, + "step": 13176 + }, + { + "epoch": 1.6762498409871518, + "ewc_loss": 0.007956631481647491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.956631452543661e-05, + "grad_norm": 3.9494714736938477, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8822126984596252, + "num_tokens": 502604776.0, + "step": 13177 + }, + { + "epoch": 1.6763770512657423, + "ewc_loss": 0.007924912497401237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.92491264292039e-05, + "grad_norm": 3.9374661445617676, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8563088178634644, + "num_tokens": 502645732.0, + "step": 13178 + }, + { + "epoch": 1.6765042615443329, + "ewc_loss": 0.007928074337542057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.928074774099514e-05, + "grad_norm": 3.951073169708252, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8773003220558167, + "num_tokens": 502686044.0, + "step": 13179 + }, + { + "epoch": 1.6766314718229234, + "ewc_loss": 0.007948128506541252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.948128768475726e-05, + "grad_norm": 3.9127917289733887, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.865983247756958, + "num_tokens": 502728288.0, + "step": 13180 + }, + { + "epoch": 1.676758682101514, + "ewc_loss": 0.007902299053966999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.902298966655508e-05, + "grad_norm": 3.9670398235321045, + "learning_rate": 1e-06, + "loss": 0.3955, + 
"mean_token_accuracy": 0.8653532266616821, + "num_tokens": 502760772.0, + "step": 13181 + }, + { + "epoch": 1.6768858923801044, + "ewc_loss": 0.007958351634442806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.958351488923654e-05, + "grad_norm": 3.9635465145111084, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8756003975868225, + "num_tokens": 502791585.0, + "step": 13182 + }, + { + "epoch": 1.677013102658695, + "ewc_loss": 0.007946348749101162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.946349069243297e-05, + "grad_norm": 3.9135899543762207, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8918048143386841, + "num_tokens": 502826339.0, + "step": 13183 + }, + { + "epoch": 1.6771403129372855, + "ewc_loss": 0.007926305755972862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.926305988803506e-05, + "grad_norm": 3.8630995750427246, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8909007906913757, + "num_tokens": 502868716.0, + "step": 13184 + }, + { + "epoch": 1.6772675232158758, + "ewc_loss": 0.00792729202657938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.927291881060228e-05, + "grad_norm": 3.93404221534729, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8749454021453857, + "num_tokens": 502907992.0, + "step": 13185 + }, + { + "epoch": 1.6773947334944663, + "ewc_loss": 0.007982473820447922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.982473471201956e-05, + "grad_norm": 4.004344463348389, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8692234754562378, + "num_tokens": 502947197.0, + "step": 13186 + }, + { + "epoch": 1.6775219437730569, + "ewc_loss": 0.007987597025930882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.987597200553864e-05, + "grad_norm": 3.9481523036956787, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8824558258056641, + "num_tokens": 502985890.0, + "step": 13187 + }, + { + "epoch": 
1.6776491540516474, + "ewc_loss": 0.007932126522064209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932126754894853e-05, + "grad_norm": 3.9217910766601562, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8686487674713135, + "num_tokens": 503033476.0, + "step": 13188 + }, + { + "epoch": 1.677776364330238, + "ewc_loss": 0.007921747863292694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.92174760135822e-05, + "grad_norm": 3.9527368545532227, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8821262121200562, + "num_tokens": 503068002.0, + "step": 13189 + }, + { + "epoch": 1.6779035746088284, + "ewc_loss": 0.007952926680445671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952926534926519e-05, + "grad_norm": 3.9180407524108887, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8664662837982178, + "num_tokens": 503110059.0, + "step": 13190 + }, + { + "epoch": 1.6780307848874187, + "ewc_loss": 0.007922589778900146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.922589429654181e-05, + "grad_norm": 3.989194393157959, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.867713451385498, + "num_tokens": 503146617.0, + "step": 13191 + }, + { + "epoch": 1.6781579951660093, + "ewc_loss": 0.007968450896441936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96845051809214e-05, + "grad_norm": 3.93847918510437, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8712955713272095, + "num_tokens": 503186436.0, + "step": 13192 + }, + { + "epoch": 1.6782852054445998, + "ewc_loss": 0.007923850789666176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.92385108070448e-05, + "grad_norm": 3.900947093963623, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8805274963378906, + "num_tokens": 503227983.0, + "step": 13193 + }, + { + "epoch": 1.6784124157231903, + "ewc_loss": 0.007918482646346092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.918482879176736e-05, + "grad_norm": 3.920851945877075, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8622062802314758, + "num_tokens": 503269301.0, + "step": 13194 + }, + { + "epoch": 1.6785396260017809, + "ewc_loss": 0.007965578697621822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.965578697621822e-05, + "grad_norm": 3.9640655517578125, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.857769250869751, + "num_tokens": 503308919.0, + "step": 13195 + }, + { + "epoch": 1.6786668362803714, + "ewc_loss": 0.00796697847545147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.966978591866791e-05, + "grad_norm": 3.9414854049682617, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8830986022949219, + "num_tokens": 503345030.0, + "step": 13196 + }, + { + "epoch": 1.678794046558962, + "ewc_loss": 0.00794768426567316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9476842074655e-05, + "grad_norm": 4.020358085632324, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8653363585472107, + "num_tokens": 503379142.0, + "step": 13197 + }, + { + "epoch": 1.6789212568375524, + "ewc_loss": 0.008004910312592983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.004910341696814e-05, + "grad_norm": 3.9626951217651367, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8739677667617798, + "num_tokens": 503411810.0, + "step": 13198 + }, + { + "epoch": 1.679048467116143, + "ewc_loss": 0.007948198355734348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.948198617668822e-05, + "grad_norm": 3.8791959285736084, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8726907968521118, + "num_tokens": 503450266.0, + "step": 13199 + }, + { + "epoch": 1.6791756773947335, + "ewc_loss": 0.007942913100123405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.942912634462118e-05, + "grad_norm": 3.8866941928863525, + "learning_rate": 1e-06, + "loss": 0.3408, + 
"mean_token_accuracy": 0.8824270367622375, + "num_tokens": 503491015.0, + "step": 13200 + }, + { + "epoch": 1.679302887673324, + "ewc_loss": 0.007975073531270027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.97507309471257e-05, + "grad_norm": 3.915247917175293, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8816322088241577, + "num_tokens": 503531964.0, + "step": 13201 + }, + { + "epoch": 1.6794300979519146, + "ewc_loss": 0.00796964205801487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.969641592353582e-05, + "grad_norm": 3.8914034366607666, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8779410123825073, + "num_tokens": 503576069.0, + "step": 13202 + }, + { + "epoch": 1.679557308230505, + "ewc_loss": 0.007960758171975613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.960758375702426e-05, + "grad_norm": 3.9865803718566895, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8749566078186035, + "num_tokens": 503607895.0, + "step": 13203 + }, + { + "epoch": 1.6796845185090956, + "ewc_loss": 0.008023803122341633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023803093237802e-05, + "grad_norm": 4.0347185134887695, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8693526983261108, + "num_tokens": 503636891.0, + "step": 13204 + }, + { + "epoch": 1.6798117287876861, + "ewc_loss": 0.00802579801529646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025798160815611e-05, + "grad_norm": 3.85129714012146, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8888087272644043, + "num_tokens": 503682037.0, + "step": 13205 + }, + { + "epoch": 1.6799389390662767, + "ewc_loss": 0.007921465672552586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.92146529420279e-05, + "grad_norm": 3.8977911472320557, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8770151734352112, + "num_tokens": 503724008.0, + "step": 13206 + }, + { + "epoch": 
1.6800661493448672, + "ewc_loss": 0.008013276383280754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.013276237761602e-05, + "grad_norm": 3.939038038253784, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8819152116775513, + "num_tokens": 503759008.0, + "step": 13207 + }, + { + "epoch": 1.6801933596234577, + "ewc_loss": 0.00801403820514679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.014038030523807e-05, + "grad_norm": 3.9636080265045166, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8719143867492676, + "num_tokens": 503796599.0, + "step": 13208 + }, + { + "epoch": 1.680320569902048, + "ewc_loss": 0.007997691631317139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997691864147782e-05, + "grad_norm": 3.953620672225952, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8712809681892395, + "num_tokens": 503835539.0, + "step": 13209 + }, + { + "epoch": 1.6804477801806386, + "ewc_loss": 0.007988741621375084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.988741708686575e-05, + "grad_norm": 3.9150867462158203, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8723262548446655, + "num_tokens": 503875933.0, + "step": 13210 + }, + { + "epoch": 1.680574990459229, + "ewc_loss": 0.007960595190525055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.960595394251868e-05, + "grad_norm": 3.944361448287964, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8732205629348755, + "num_tokens": 503913740.0, + "step": 13211 + }, + { + "epoch": 1.6807022007378196, + "ewc_loss": 0.007992181926965714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992181781446561e-05, + "grad_norm": 3.899949550628662, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8782111406326294, + "num_tokens": 503955105.0, + "step": 13212 + }, + { + "epoch": 1.6808294110164101, + "ewc_loss": 0.007953220047056675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.95321975601837e-05, + "grad_norm": 4.005619525909424, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.874238908290863, + "num_tokens": 503987641.0, + "step": 13213 + }, + { + "epoch": 1.6809566212950007, + "ewc_loss": 0.008042491041123867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.042490662774071e-05, + "grad_norm": 3.953979969024658, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8649005889892578, + "num_tokens": 504028699.0, + "step": 13214 + }, + { + "epoch": 1.681083831573591, + "ewc_loss": 0.007963502779603004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.963502866914496e-05, + "grad_norm": 3.927351474761963, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8681707382202148, + "num_tokens": 504065597.0, + "step": 13215 + }, + { + "epoch": 1.6812110418521815, + "ewc_loss": 0.007969547063112259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.969547004904598e-05, + "grad_norm": 3.9610841274261475, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8603062629699707, + "num_tokens": 504105968.0, + "step": 13216 + }, + { + "epoch": 1.681338252130772, + "ewc_loss": 0.007980790920555592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.980790542205796e-05, + "grad_norm": 3.929845094680786, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8771049976348877, + "num_tokens": 504143743.0, + "step": 13217 + }, + { + "epoch": 1.6814654624093626, + "ewc_loss": 0.007977486588060856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977486529853195e-05, + "grad_norm": 3.994906187057495, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8773977756500244, + "num_tokens": 504177023.0, + "step": 13218 + }, + { + "epoch": 1.681592672687953, + "ewc_loss": 0.008032129146158695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.032128971535712e-05, + "grad_norm": 3.976658344268799, + "learning_rate": 1e-06, + "loss": 0.3807, + 
"mean_token_accuracy": 0.8738114237785339, + "num_tokens": 504210492.0, + "step": 13219 + }, + { + "epoch": 1.6817198829665436, + "ewc_loss": 0.008009824901819229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.009825251065195e-05, + "grad_norm": 3.9096100330352783, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8746229410171509, + "num_tokens": 504248322.0, + "step": 13220 + }, + { + "epoch": 1.6818470932451342, + "ewc_loss": 0.007982516661286354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.982517126947641e-05, + "grad_norm": 3.9785149097442627, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.859756350517273, + "num_tokens": 504285915.0, + "step": 13221 + }, + { + "epoch": 1.6819743035237247, + "ewc_loss": 0.00805987324565649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059872925514355e-05, + "grad_norm": 3.949733018875122, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8725812435150146, + "num_tokens": 504325578.0, + "step": 13222 + }, + { + "epoch": 1.6821015138023152, + "ewc_loss": 0.008007979020476341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007978613022715e-05, + "grad_norm": 3.930957078933716, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8927810192108154, + "num_tokens": 504362876.0, + "step": 13223 + }, + { + "epoch": 1.6822287240809057, + "ewc_loss": 0.008005008101463318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.005008567124605e-05, + "grad_norm": 3.9554636478424072, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8729954957962036, + "num_tokens": 504402303.0, + "step": 13224 + }, + { + "epoch": 1.6823559343594963, + "ewc_loss": 0.008036560378968716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.036560757318512e-05, + "grad_norm": 3.9639008045196533, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8751810789108276, + "num_tokens": 504440039.0, + "step": 13225 + }, + { + "epoch": 
1.6824831446380868, + "ewc_loss": 0.008010960184037685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.010960300453007e-05, + "grad_norm": 3.900418519973755, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8657495975494385, + "num_tokens": 504485102.0, + "step": 13226 + }, + { + "epoch": 1.6826103549166773, + "ewc_loss": 0.007976317778229713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976318011060357e-05, + "grad_norm": 3.92043399810791, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8730734586715698, + "num_tokens": 504522762.0, + "step": 13227 + }, + { + "epoch": 1.6827375651952678, + "ewc_loss": 0.00800130981951952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.001309470273554e-05, + "grad_norm": 3.919191598892212, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8776938915252686, + "num_tokens": 504562818.0, + "step": 13228 + }, + { + "epoch": 1.6828647754738584, + "ewc_loss": 0.007981115020811558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.981115049915388e-05, + "grad_norm": 3.923529624938965, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8658852577209473, + "num_tokens": 504602914.0, + "step": 13229 + }, + { + "epoch": 1.682991985752449, + "ewc_loss": 0.007985473610460758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.985473348526284e-05, + "grad_norm": 3.9351863861083984, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8789800405502319, + "num_tokens": 504641376.0, + "step": 13230 + }, + { + "epoch": 1.6831191960310394, + "ewc_loss": 0.007975076325237751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.975076732691377e-05, + "grad_norm": 3.9915785789489746, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8810485601425171, + "num_tokens": 504675031.0, + "step": 13231 + }, + { + "epoch": 1.68324640630963, + "ewc_loss": 0.00801169965416193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.011699537746608e-05, + "grad_norm": 3.9928765296936035, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8803181648254395, + "num_tokens": 504709624.0, + "step": 13232 + }, + { + "epoch": 1.6833736165882205, + "ewc_loss": 0.00796892587095499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.968925638124347e-05, + "grad_norm": 3.918963670730591, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8781156539916992, + "num_tokens": 504749310.0, + "step": 13233 + }, + { + "epoch": 1.6835008268668108, + "ewc_loss": 0.007907772436738014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.907772669568658e-05, + "grad_norm": 3.907320976257324, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8748961687088013, + "num_tokens": 504791700.0, + "step": 13234 + }, + { + "epoch": 1.6836280371454013, + "ewc_loss": 0.007957251742482185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.95725136413239e-05, + "grad_norm": 3.9293129444122314, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8638324737548828, + "num_tokens": 504833246.0, + "step": 13235 + }, + { + "epoch": 1.6837552474239919, + "ewc_loss": 0.007961923256516457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.961923256516457e-05, + "grad_norm": 3.9320335388183594, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8808615207672119, + "num_tokens": 504872901.0, + "step": 13236 + }, + { + "epoch": 1.6838824577025824, + "ewc_loss": 0.007936868816614151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.936869224067777e-05, + "grad_norm": 3.9938735961914062, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8686345815658569, + "num_tokens": 504906138.0, + "step": 13237 + }, + { + "epoch": 1.684009667981173, + "ewc_loss": 0.007992432452738285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992432074388489e-05, + "grad_norm": 3.961555004119873, + "learning_rate": 1e-06, + "loss": 0.437, + 
"mean_token_accuracy": 0.8540512323379517, + "num_tokens": 504947147.0, + "step": 13238 + }, + { + "epoch": 1.6841368782597634, + "ewc_loss": 0.007931675761938095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.931675645522773e-05, + "grad_norm": 3.9271111488342285, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.87548828125, + "num_tokens": 504985332.0, + "step": 13239 + }, + { + "epoch": 1.6842640885383537, + "ewc_loss": 0.007942114025354385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.942113734316081e-05, + "grad_norm": 3.952338695526123, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8646186590194702, + "num_tokens": 505027995.0, + "step": 13240 + }, + { + "epoch": 1.6843912988169443, + "ewc_loss": 0.007959422655403614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.959422509884462e-05, + "grad_norm": 3.9339964389801025, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8781362175941467, + "num_tokens": 505065311.0, + "step": 13241 + }, + { + "epoch": 1.6845185090955348, + "ewc_loss": 0.007936300709843636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.936300244182348e-05, + "grad_norm": 3.9319264888763428, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8692569732666016, + "num_tokens": 505105810.0, + "step": 13242 + }, + { + "epoch": 1.6846457193741253, + "ewc_loss": 0.007962440140545368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.962440577102825e-05, + "grad_norm": 3.953204870223999, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8809615969657898, + "num_tokens": 505141003.0, + "step": 13243 + }, + { + "epoch": 1.6847729296527159, + "ewc_loss": 0.007959950715303421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.959950744407251e-05, + "grad_norm": 3.860424518585205, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8636371493339539, + "num_tokens": 505188025.0, + "step": 13244 + }, + { + "epoch": 
1.6849001399313064, + "ewc_loss": 0.007907379418611526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90737904026173e-05, + "grad_norm": 3.9192605018615723, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8872389197349548, + "num_tokens": 505226101.0, + "step": 13245 + }, + { + "epoch": 1.685027350209897, + "ewc_loss": 0.007973338477313519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.973338506417349e-05, + "grad_norm": 3.942470073699951, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.865527331829071, + "num_tokens": 505264880.0, + "step": 13246 + }, + { + "epoch": 1.6851545604884874, + "ewc_loss": 0.00794464536011219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.944645039970055e-05, + "grad_norm": 3.921976327896118, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.866872251033783, + "num_tokens": 505306208.0, + "step": 13247 + }, + { + "epoch": 1.685281770767078, + "ewc_loss": 0.007942449301481247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.942449155962095e-05, + "grad_norm": 3.90444278717041, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8937057256698608, + "num_tokens": 505346024.0, + "step": 13248 + }, + { + "epoch": 1.6854089810456685, + "ewc_loss": 0.007935487665235996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935487519716844e-05, + "grad_norm": 3.9242286682128906, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8959502577781677, + "num_tokens": 505383644.0, + "step": 13249 + }, + { + "epoch": 1.685536191324259, + "ewc_loss": 0.007934221997857094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.934222230687737e-05, + "grad_norm": 3.9486935138702393, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8801165223121643, + "num_tokens": 505420365.0, + "step": 13250 + }, + { + "epoch": 1.6856634016028496, + "ewc_loss": 0.007941102609038353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.941102376207709e-05, + "grad_norm": 3.9489219188690186, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8781183958053589, + "num_tokens": 505458211.0, + "step": 13251 + }, + { + "epoch": 1.68579061188144, + "ewc_loss": 0.007932541891932487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932542212074623e-05, + "grad_norm": 3.9204561710357666, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8705589771270752, + "num_tokens": 505501649.0, + "step": 13252 + }, + { + "epoch": 1.6859178221600306, + "ewc_loss": 0.007896088063716888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.896088209236041e-05, + "grad_norm": 3.9430460929870605, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8778350353240967, + "num_tokens": 505538763.0, + "step": 13253 + }, + { + "epoch": 1.6860450324386211, + "ewc_loss": 0.007912283763289452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.912283763289452e-05, + "grad_norm": 3.9479777812957764, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8959977626800537, + "num_tokens": 505574562.0, + "step": 13254 + }, + { + "epoch": 1.6861722427172117, + "ewc_loss": 0.007920142263174057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.920142525108531e-05, + "grad_norm": 4.001238822937012, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8881110548973083, + "num_tokens": 505609718.0, + "step": 13255 + }, + { + "epoch": 1.6862994529958022, + "ewc_loss": 0.007920525036752224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.920525240479037e-05, + "grad_norm": 3.9380202293395996, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8863961100578308, + "num_tokens": 505644337.0, + "step": 13256 + }, + { + "epoch": 1.6864266632743927, + "ewc_loss": 0.007864322513341904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.864322105888277e-05, + "grad_norm": 3.905953884124756, + "learning_rate": 1e-06, + "loss": 0.3357, + 
"mean_token_accuracy": 0.8845648765563965, + "num_tokens": 505681929.0, + "step": 13257 + }, + { + "epoch": 1.686553873552983, + "ewc_loss": 0.007871425710618496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.871425623307005e-05, + "grad_norm": 3.9695682525634766, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8536499738693237, + "num_tokens": 505719373.0, + "step": 13258 + }, + { + "epoch": 1.6866810838315736, + "ewc_loss": 0.00792465079575777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924650708446279e-05, + "grad_norm": 3.932906150817871, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8621547818183899, + "num_tokens": 505757767.0, + "step": 13259 + }, + { + "epoch": 1.686808294110164, + "ewc_loss": 0.00788574293255806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.885743252700195e-05, + "grad_norm": 4.005284786224365, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8451130390167236, + "num_tokens": 505794777.0, + "step": 13260 + }, + { + "epoch": 1.6869355043887546, + "ewc_loss": 0.007968681864440441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.968681893544272e-05, + "grad_norm": 3.891249895095825, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.882838785648346, + "num_tokens": 505832672.0, + "step": 13261 + }, + { + "epoch": 1.6870627146673451, + "ewc_loss": 0.007880840450525284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.880840712459758e-05, + "grad_norm": 3.9297235012054443, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8639287948608398, + "num_tokens": 505873142.0, + "step": 13262 + }, + { + "epoch": 1.6871899249459357, + "ewc_loss": 0.007945743389427662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.945743709569797e-05, + "grad_norm": 3.918300151824951, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8785231113433838, + "num_tokens": 505911994.0, + "step": 13263 + }, + { + "epoch": 
1.687317135224526, + "ewc_loss": 0.007926685735583305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.926685793790966e-05, + "grad_norm": 3.940589189529419, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8787148594856262, + "num_tokens": 505948874.0, + "step": 13264 + }, + { + "epoch": 1.6874443455031165, + "ewc_loss": 0.007960733026266098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.960732909850776e-05, + "grad_norm": 3.903057336807251, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8794021010398865, + "num_tokens": 505989679.0, + "step": 13265 + }, + { + "epoch": 1.687571555781707, + "ewc_loss": 0.0079324496909976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932449807412922e-05, + "grad_norm": 3.9177865982055664, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8833770751953125, + "num_tokens": 506033209.0, + "step": 13266 + }, + { + "epoch": 1.6876987660602976, + "ewc_loss": 0.0079403817653656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.940381328808144e-05, + "grad_norm": 3.951643943786621, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8844850659370422, + "num_tokens": 506065379.0, + "step": 13267 + }, + { + "epoch": 1.687825976338888, + "ewc_loss": 0.007955086417496204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.955086039146408e-05, + "grad_norm": 4.032534599304199, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8669296503067017, + "num_tokens": 506099484.0, + "step": 13268 + }, + { + "epoch": 1.6879531866174786, + "ewc_loss": 0.008007034659385681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007034193724394e-05, + "grad_norm": 3.9433746337890625, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8594959378242493, + "num_tokens": 506136559.0, + "step": 13269 + }, + { + "epoch": 1.6880803968960691, + "ewc_loss": 0.007945152930915356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.945152901811525e-05, + "grad_norm": 3.9603567123413086, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8743732571601868, + "num_tokens": 506172997.0, + "step": 13270 + }, + { + "epoch": 1.6882076071746597, + "ewc_loss": 0.008007162250578403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007162250578403e-05, + "grad_norm": 3.8795671463012695, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8891646862030029, + "num_tokens": 506209858.0, + "step": 13271 + }, + { + "epoch": 1.6883348174532502, + "ewc_loss": 0.007964951917529106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96495151007548e-05, + "grad_norm": 3.964876651763916, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8865029215812683, + "num_tokens": 506242260.0, + "step": 13272 + }, + { + "epoch": 1.6884620277318407, + "ewc_loss": 0.008048501797020435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.04850205895491e-05, + "grad_norm": 3.9824976921081543, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8624918460845947, + "num_tokens": 506276856.0, + "step": 13273 + }, + { + "epoch": 1.6885892380104313, + "ewc_loss": 0.008013768121600151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01376809249632e-05, + "grad_norm": 3.9251933097839355, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8924412131309509, + "num_tokens": 506311469.0, + "step": 13274 + }, + { + "epoch": 1.6887164482890218, + "ewc_loss": 0.007983718998730183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.983719115145504e-05, + "grad_norm": 3.911024570465088, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8836864233016968, + "num_tokens": 506348240.0, + "step": 13275 + }, + { + "epoch": 1.6888436585676123, + "ewc_loss": 0.008014013059437275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.014013292267919e-05, + "grad_norm": 3.934741497039795, + "learning_rate": 1e-06, + "loss": 0.3608, + 
"mean_token_accuracy": 0.8759900331497192, + "num_tokens": 506387203.0, + "step": 13276 + }, + { + "epoch": 1.6889708688462028, + "ewc_loss": 0.008028804324567318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.028804586501792e-05, + "grad_norm": 4.01696252822876, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8530299663543701, + "num_tokens": 506421869.0, + "step": 13277 + }, + { + "epoch": 1.6890980791247934, + "ewc_loss": 0.008063218556344509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.063218410825357e-05, + "grad_norm": 3.975841999053955, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8762760162353516, + "num_tokens": 506458873.0, + "step": 13278 + }, + { + "epoch": 1.689225289403384, + "ewc_loss": 0.00800258293747902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.002582762856036e-05, + "grad_norm": 3.9603002071380615, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8624866604804993, + "num_tokens": 506495947.0, + "step": 13279 + }, + { + "epoch": 1.6893524996819744, + "ewc_loss": 0.008015924133360386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.015923958737403e-05, + "grad_norm": 4.013460159301758, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8565762639045715, + "num_tokens": 506533098.0, + "step": 13280 + }, + { + "epoch": 1.689479709960565, + "ewc_loss": 0.00806035939604044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060359687078744e-05, + "grad_norm": 3.9011940956115723, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8860280513763428, + "num_tokens": 506571346.0, + "step": 13281 + }, + { + "epoch": 1.6896069202391555, + "ewc_loss": 0.007960428483784199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.960428774822503e-05, + "grad_norm": 3.9850053787231445, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8767192363739014, + "num_tokens": 506607912.0, + "step": 13282 + }, + { + "epoch": 
1.6897341305177458, + "ewc_loss": 0.008067124523222446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067124872468412e-05, + "grad_norm": 3.916527509689331, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8634580373764038, + "num_tokens": 506654258.0, + "step": 13283 + }, + { + "epoch": 1.6898613407963363, + "ewc_loss": 0.007976376451551914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.97637621872127e-05, + "grad_norm": 3.949801206588745, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8826532959938049, + "num_tokens": 506693436.0, + "step": 13284 + }, + { + "epoch": 1.6899885510749268, + "ewc_loss": 0.008013088256120682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.013088518055156e-05, + "grad_norm": 3.9115562438964844, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8768101930618286, + "num_tokens": 506737383.0, + "step": 13285 + }, + { + "epoch": 1.6901157613535174, + "ewc_loss": 0.007986103184521198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.986103446455672e-05, + "grad_norm": 4.001752853393555, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8838666677474976, + "num_tokens": 506772282.0, + "step": 13286 + }, + { + "epoch": 1.690242971632108, + "ewc_loss": 0.008035036735236645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035037171794102e-05, + "grad_norm": 3.9533233642578125, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8854019641876221, + "num_tokens": 506807208.0, + "step": 13287 + }, + { + "epoch": 1.6903701819106984, + "ewc_loss": 0.007967832498252392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.967832061694935e-05, + "grad_norm": 3.893155813217163, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.881067156791687, + "num_tokens": 506847952.0, + "step": 13288 + }, + { + "epoch": 1.6904973921892887, + "ewc_loss": 0.007950250059366226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.950249710120261e-05, + "grad_norm": 3.942312240600586, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8786618113517761, + "num_tokens": 506888265.0, + "step": 13289 + }, + { + "epoch": 1.6906246024678793, + "ewc_loss": 0.007995452731847763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.995452324394137e-05, + "grad_norm": 3.952972412109375, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8777683973312378, + "num_tokens": 506926096.0, + "step": 13290 + }, + { + "epoch": 1.6907518127464698, + "ewc_loss": 0.00795814674347639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.958146306918934e-05, + "grad_norm": 3.962644338607788, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8711795806884766, + "num_tokens": 506962252.0, + "step": 13291 + }, + { + "epoch": 1.6908790230250603, + "ewc_loss": 0.007975085638463497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.975085463840514e-05, + "grad_norm": 3.8924002647399902, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8807482123374939, + "num_tokens": 507004709.0, + "step": 13292 + }, + { + "epoch": 1.6910062333036509, + "ewc_loss": 0.007918312214314938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.918311894172803e-05, + "grad_norm": 3.9049508571624756, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8904315233230591, + "num_tokens": 507041041.0, + "step": 13293 + }, + { + "epoch": 1.6911334435822414, + "ewc_loss": 0.007953304797410965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953304884722456e-05, + "grad_norm": 3.9236390590667725, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8688454627990723, + "num_tokens": 507080251.0, + "step": 13294 + }, + { + "epoch": 1.691260653860832, + "ewc_loss": 0.007955399341881275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.95539963291958e-05, + "grad_norm": 3.9541428089141846, + "learning_rate": 1e-06, + "loss": 0.3742, + 
"mean_token_accuracy": 0.8744078874588013, + "num_tokens": 507120813.0, + "step": 13295 + }, + { + "epoch": 1.6913878641394224, + "ewc_loss": 0.007959325797855854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.959325739648193e-05, + "grad_norm": 3.943777084350586, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8615318536758423, + "num_tokens": 507160284.0, + "step": 13296 + }, + { + "epoch": 1.691515074418013, + "ewc_loss": 0.007934313267469406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.934313180157915e-05, + "grad_norm": 3.8806347846984863, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.86933833360672, + "num_tokens": 507205069.0, + "step": 13297 + }, + { + "epoch": 1.6916422846966035, + "ewc_loss": 0.007904275320470333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90427511674352e-05, + "grad_norm": 3.9209794998168945, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8825580477714539, + "num_tokens": 507244689.0, + "step": 13298 + }, + { + "epoch": 1.691769494975194, + "ewc_loss": 0.007957680150866508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.957679918035865e-05, + "grad_norm": 3.9177639484405518, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.876774251461029, + "num_tokens": 507283917.0, + "step": 13299 + }, + { + "epoch": 1.6918967052537845, + "ewc_loss": 0.0079202214255929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.920221105450764e-05, + "grad_norm": 3.8949878215789795, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8782683610916138, + "num_tokens": 507327937.0, + "step": 13300 + }, + { + "epoch": 1.692023915532375, + "ewc_loss": 0.007914284244179726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91428392403759e-05, + "grad_norm": 3.8850250244140625, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8896018266677856, + "num_tokens": 507369521.0, + "step": 13301 + }, + { + "epoch": 
1.6921511258109656, + "ewc_loss": 0.007910070940852165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.910071144578978e-05, + "grad_norm": 3.9580888748168945, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8850604295730591, + "num_tokens": 507408157.0, + "step": 13302 + }, + { + "epoch": 1.6922783360895561, + "ewc_loss": 0.007924462668597698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924462988739833e-05, + "grad_norm": 3.9031426906585693, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8812874555587769, + "num_tokens": 507448003.0, + "step": 13303 + }, + { + "epoch": 1.6924055463681467, + "ewc_loss": 0.007878328673541546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.878329051891342e-05, + "grad_norm": 3.8703625202178955, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8830360174179077, + "num_tokens": 507489970.0, + "step": 13304 + }, + { + "epoch": 1.6925327566467372, + "ewc_loss": 0.007881845347583294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.881845522206277e-05, + "grad_norm": 3.988112688064575, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8765906095504761, + "num_tokens": 507524536.0, + "step": 13305 + }, + { + "epoch": 1.6926599669253277, + "ewc_loss": 0.007948527112603188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.948527490952983e-05, + "grad_norm": 3.8912453651428223, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8951660990715027, + "num_tokens": 507561575.0, + "step": 13306 + }, + { + "epoch": 1.692787177203918, + "ewc_loss": 0.007834367454051971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.834367715986446e-05, + "grad_norm": 3.9638233184814453, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.878480076789856, + "num_tokens": 507597212.0, + "step": 13307 + }, + { + "epoch": 1.6929143874825086, + "ewc_loss": 0.007928391918540001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.928392005851492e-05, + "grad_norm": 3.951395034790039, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8777711987495422, + "num_tokens": 507634251.0, + "step": 13308 + }, + { + "epoch": 1.693041597761099, + "ewc_loss": 0.007876487448811531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.876487507019192e-05, + "grad_norm": 3.9258949756622314, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8810022473335266, + "num_tokens": 507671345.0, + "step": 13309 + }, + { + "epoch": 1.6931688080396896, + "ewc_loss": 0.007862262427806854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.862262282287702e-05, + "grad_norm": 3.954265594482422, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8647649884223938, + "num_tokens": 507709797.0, + "step": 13310 + }, + { + "epoch": 1.6932960183182801, + "ewc_loss": 0.007913143374025822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.913143781479448e-05, + "grad_norm": 3.937633514404297, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8751024603843689, + "num_tokens": 507747047.0, + "step": 13311 + }, + { + "epoch": 1.6934232285968707, + "ewc_loss": 0.007873617112636566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.873617141740397e-05, + "grad_norm": 3.942270278930664, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8767957091331482, + "num_tokens": 507782187.0, + "step": 13312 + }, + { + "epoch": 1.693550438875461, + "ewc_loss": 0.00789891928434372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89891928434372e-05, + "grad_norm": 3.9067556858062744, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8635331392288208, + "num_tokens": 507826714.0, + "step": 13313 + }, + { + "epoch": 1.6936776491540515, + "ewc_loss": 0.007899612188339233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89961195550859e-05, + "grad_norm": 3.9018566608428955, + "learning_rate": 1e-06, + "loss": 0.3857, + 
"mean_token_accuracy": 0.8676165342330933, + "num_tokens": 507869423.0, + "step": 13314 + }, + { + "epoch": 1.693804859432642, + "ewc_loss": 0.00789489783346653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89489786257036e-05, + "grad_norm": 3.9333102703094482, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8844786882400513, + "num_tokens": 507904711.0, + "step": 13315 + }, + { + "epoch": 1.6939320697112326, + "ewc_loss": 0.007931102998554707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.931103027658537e-05, + "grad_norm": 3.9035565853118896, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8712136745452881, + "num_tokens": 507947760.0, + "step": 13316 + }, + { + "epoch": 1.694059279989823, + "ewc_loss": 0.007911503314971924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911503780633211e-05, + "grad_norm": 3.94366717338562, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8737998604774475, + "num_tokens": 507984249.0, + "step": 13317 + }, + { + "epoch": 1.6941864902684136, + "ewc_loss": 0.007946937344968319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.946936966618523e-05, + "grad_norm": 4.054513931274414, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8715097904205322, + "num_tokens": 508017206.0, + "step": 13318 + }, + { + "epoch": 1.6943137005470041, + "ewc_loss": 0.007983498275279999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.983498653629795e-05, + "grad_norm": 3.9086649417877197, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8705813884735107, + "num_tokens": 508056357.0, + "step": 13319 + }, + { + "epoch": 1.6944409108255947, + "ewc_loss": 0.007877103984355927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.877103780629113e-05, + "grad_norm": 3.934627056121826, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8673616647720337, + "num_tokens": 508094870.0, + "step": 13320 + }, + { + "epoch": 
1.6945681211041852, + "ewc_loss": 0.007955201901495457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.955201726872474e-05, + "grad_norm": 4.114448070526123, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8731879591941833, + "num_tokens": 508120323.0, + "step": 13321 + }, + { + "epoch": 1.6946953313827757, + "ewc_loss": 0.008038456551730633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.038456144277006e-05, + "grad_norm": 3.914163112640381, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8677156567573547, + "num_tokens": 508160018.0, + "step": 13322 + }, + { + "epoch": 1.6948225416613663, + "ewc_loss": 0.007868160493671894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868160173529759e-05, + "grad_norm": 3.8844101428985596, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8917120695114136, + "num_tokens": 508200947.0, + "step": 13323 + }, + { + "epoch": 1.6949497519399568, + "ewc_loss": 0.00796746276319027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.967463170643896e-05, + "grad_norm": 3.9476656913757324, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8621951341629028, + "num_tokens": 508246457.0, + "step": 13324 + }, + { + "epoch": 1.6950769622185473, + "ewc_loss": 0.007980617694556713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.980617374414578e-05, + "grad_norm": 3.9927215576171875, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8709595799446106, + "num_tokens": 508281951.0, + "step": 13325 + }, + { + "epoch": 1.6952041724971378, + "ewc_loss": 0.007991325110197067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.991325401235372e-05, + "grad_norm": 4.002735614776611, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8574041724205017, + "num_tokens": 508322571.0, + "step": 13326 + }, + { + "epoch": 1.6953313827757284, + "ewc_loss": 0.007969397120177746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.969397120177746e-05, + "grad_norm": 3.9332125186920166, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8814889788627625, + "num_tokens": 508357347.0, + "step": 13327 + }, + { + "epoch": 1.695458593054319, + "ewc_loss": 0.007938671857118607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.93867147876881e-05, + "grad_norm": 3.8861587047576904, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8756861090660095, + "num_tokens": 508401813.0, + "step": 13328 + }, + { + "epoch": 1.6955858033329094, + "ewc_loss": 0.007933980785310268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.933980668894947e-05, + "grad_norm": 3.9633350372314453, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8646031618118286, + "num_tokens": 508440457.0, + "step": 13329 + }, + { + "epoch": 1.6957130136115, + "ewc_loss": 0.008002137765288353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00213820184581e-05, + "grad_norm": 4.047041416168213, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8667963743209839, + "num_tokens": 508472272.0, + "step": 13330 + }, + { + "epoch": 1.6958402238900905, + "ewc_loss": 0.008029325865209103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.029325545066968e-05, + "grad_norm": 3.939639091491699, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8873066902160645, + "num_tokens": 508509352.0, + "step": 13331 + }, + { + "epoch": 1.6959674341686808, + "ewc_loss": 0.007940754294395447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94075385783799e-05, + "grad_norm": 3.894906759262085, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.883642315864563, + "num_tokens": 508552515.0, + "step": 13332 + }, + { + "epoch": 1.6960946444472713, + "ewc_loss": 0.007956336252391338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.956336048664525e-05, + "grad_norm": 3.9279026985168457, + "learning_rate": 1e-06, + "loss": 0.3239, + 
"mean_token_accuracy": 0.8874936699867249, + "num_tokens": 508591399.0, + "step": 13333 + }, + { + "epoch": 1.6962218547258618, + "ewc_loss": 0.007983438670635223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.983438990777358e-05, + "grad_norm": 3.9536421298980713, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8790293335914612, + "num_tokens": 508630427.0, + "step": 13334 + }, + { + "epoch": 1.6963490650044524, + "ewc_loss": 0.00797447469085455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.974475010996684e-05, + "grad_norm": 3.929964303970337, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8818331956863403, + "num_tokens": 508667469.0, + "step": 13335 + }, + { + "epoch": 1.696476275283043, + "ewc_loss": 0.00795244611799717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952446321723983e-05, + "grad_norm": 3.946877956390381, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8792884349822998, + "num_tokens": 508706308.0, + "step": 13336 + }, + { + "epoch": 1.6966034855616334, + "ewc_loss": 0.0079661188647151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.966119301272556e-05, + "grad_norm": 4.025315284729004, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8639507293701172, + "num_tokens": 508738968.0, + "step": 13337 + }, + { + "epoch": 1.6967306958402237, + "ewc_loss": 0.008003897964954376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00389752839692e-05, + "grad_norm": 3.991014003753662, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8813583254814148, + "num_tokens": 508769879.0, + "step": 13338 + }, + { + "epoch": 1.6968579061188143, + "ewc_loss": 0.00794969778507948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.949697464937344e-05, + "grad_norm": 3.9005846977233887, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8616219758987427, + "num_tokens": 508812277.0, + "step": 13339 + }, + { + "epoch": 
1.6969851163974048, + "ewc_loss": 0.007942606694996357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.942607044242322e-05, + "grad_norm": 3.9530234336853027, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8654651045799255, + "num_tokens": 508849293.0, + "step": 13340 + }, + { + "epoch": 1.6971123266759953, + "ewc_loss": 0.00799598265439272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99598274170421e-05, + "grad_norm": 3.9068820476531982, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8765292763710022, + "num_tokens": 508885798.0, + "step": 13341 + }, + { + "epoch": 1.6972395369545858, + "ewc_loss": 0.007951340638101101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.951340376166627e-05, + "grad_norm": 3.963385581970215, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8749192357063293, + "num_tokens": 508920491.0, + "step": 13342 + }, + { + "epoch": 1.6973667472331764, + "ewc_loss": 0.00802488625049591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.024886483326554e-05, + "grad_norm": 3.929550886154175, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8750179409980774, + "num_tokens": 508956161.0, + "step": 13343 + }, + { + "epoch": 1.697493957511767, + "ewc_loss": 0.007985945791006088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.985945558175445e-05, + "grad_norm": 3.9399805068969727, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8722981214523315, + "num_tokens": 508995696.0, + "step": 13344 + }, + { + "epoch": 1.6976211677903574, + "ewc_loss": 0.008024872280657291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.024871931411326e-05, + "grad_norm": 3.959664821624756, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8800904750823975, + "num_tokens": 509029505.0, + "step": 13345 + }, + { + "epoch": 1.697748378068948, + "ewc_loss": 0.008024031296372414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.024031558306888e-05, + "grad_norm": 3.9881670475006104, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8829535245895386, + "num_tokens": 509062815.0, + "step": 13346 + }, + { + "epoch": 1.6978755883475385, + "ewc_loss": 0.00805207435041666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.052074554143474e-05, + "grad_norm": 3.909198045730591, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8812143802642822, + "num_tokens": 509103376.0, + "step": 13347 + }, + { + "epoch": 1.698002798626129, + "ewc_loss": 0.007980337366461754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.980337250046432e-05, + "grad_norm": 3.914942741394043, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8847211599349976, + "num_tokens": 509142797.0, + "step": 13348 + }, + { + "epoch": 1.6981300089047195, + "ewc_loss": 0.008021464571356773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.021464600460604e-05, + "grad_norm": 3.929790496826172, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8656845688819885, + "num_tokens": 509182724.0, + "step": 13349 + }, + { + "epoch": 1.69825721918331, + "ewc_loss": 0.008011827245354652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.011827594600618e-05, + "grad_norm": 3.965421676635742, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8843437433242798, + "num_tokens": 509217856.0, + "step": 13350 + }, + { + "epoch": 1.6983844294619006, + "ewc_loss": 0.008014879189431667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.014879131224006e-05, + "grad_norm": 4.024655342102051, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8769422769546509, + "num_tokens": 509251581.0, + "step": 13351 + }, + { + "epoch": 1.6985116397404911, + "ewc_loss": 0.008048280142247677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.048280142247677e-05, + "grad_norm": 3.9145305156707764, + "learning_rate": 1e-06, + "loss": 0.3126, + 
"mean_token_accuracy": 0.8919445276260376, + "num_tokens": 509286687.0, + "step": 13352 + }, + { + "epoch": 1.6986388500190817, + "ewc_loss": 0.007949603721499443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.949603605084121e-05, + "grad_norm": 3.9537198543548584, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8703294992446899, + "num_tokens": 509323372.0, + "step": 13353 + }, + { + "epoch": 1.6987660602976722, + "ewc_loss": 0.008012662641704082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.012662874534726e-05, + "grad_norm": 3.9727492332458496, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8679125308990479, + "num_tokens": 509362472.0, + "step": 13354 + }, + { + "epoch": 1.6988932705762627, + "ewc_loss": 0.008002947084605694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.002947288332507e-05, + "grad_norm": 3.9314627647399902, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8773698806762695, + "num_tokens": 509397676.0, + "step": 13355 + }, + { + "epoch": 1.699020480854853, + "ewc_loss": 0.007963805459439754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.963805546751246e-05, + "grad_norm": 3.8665924072265625, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8792227506637573, + "num_tokens": 509441100.0, + "step": 13356 + }, + { + "epoch": 1.6991476911334435, + "ewc_loss": 0.007940842770040035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.940842624520883e-05, + "grad_norm": 3.9303197860717773, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8711825609207153, + "num_tokens": 509485430.0, + "step": 13357 + }, + { + "epoch": 1.699274901412034, + "ewc_loss": 0.00800173357129097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.001733658602461e-05, + "grad_norm": 4.0238566398620605, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8480924367904663, + "num_tokens": 509520102.0, + "step": 13358 + }, + { + "epoch": 
1.6994021116906246, + "ewc_loss": 0.008031301200389862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031300967559218e-05, + "grad_norm": 3.959798574447632, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8762822151184082, + "num_tokens": 509556525.0, + "step": 13359 + }, + { + "epoch": 1.6995293219692151, + "ewc_loss": 0.007950705476105213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.95070591266267e-05, + "grad_norm": 3.9014289379119873, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8704222440719604, + "num_tokens": 509596899.0, + "step": 13360 + }, + { + "epoch": 1.6996565322478057, + "ewc_loss": 0.007956065237522125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.956065383041278e-05, + "grad_norm": 3.9723081588745117, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.874854564666748, + "num_tokens": 509632125.0, + "step": 13361 + }, + { + "epoch": 1.699783742526396, + "ewc_loss": 0.008016059175133705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.016059291549027e-05, + "grad_norm": 3.9484877586364746, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8719698190689087, + "num_tokens": 509668131.0, + "step": 13362 + }, + { + "epoch": 1.6999109528049865, + "ewc_loss": 0.007961058989167213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.961058872751892e-05, + "grad_norm": 3.9593405723571777, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.887412965297699, + "num_tokens": 509704078.0, + "step": 13363 + }, + { + "epoch": 1.700038163083577, + "ewc_loss": 0.007985603995621204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.98560431576334e-05, + "grad_norm": 3.93902325630188, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8870980739593506, + "num_tokens": 509739464.0, + "step": 13364 + }, + { + "epoch": 1.7001653733621676, + "ewc_loss": 0.007973854430019855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.973854371812195e-05, + "grad_norm": 3.92592453956604, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8752032518386841, + "num_tokens": 509777970.0, + "step": 13365 + }, + { + "epoch": 1.700292583640758, + "ewc_loss": 0.00797181949019432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.971819286467507e-05, + "grad_norm": 3.914243221282959, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8622400760650635, + "num_tokens": 509817010.0, + "step": 13366 + }, + { + "epoch": 1.7004197939193486, + "ewc_loss": 0.007975325919687748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.975325570441782e-05, + "grad_norm": 3.9510366916656494, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8805965185165405, + "num_tokens": 509854192.0, + "step": 13367 + }, + { + "epoch": 1.7005470041979391, + "ewc_loss": 0.007989213801920414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.989213918335736e-05, + "grad_norm": 3.962700128555298, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8876771330833435, + "num_tokens": 509888633.0, + "step": 13368 + }, + { + "epoch": 1.7006742144765297, + "ewc_loss": 0.007992821745574474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992821338120848e-05, + "grad_norm": 3.930633783340454, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8758782148361206, + "num_tokens": 509928474.0, + "step": 13369 + }, + { + "epoch": 1.7008014247551202, + "ewc_loss": 0.00795924011617899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.959239883348346e-05, + "grad_norm": 3.8984835147857666, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8673157691955566, + "num_tokens": 509968875.0, + "step": 13370 + }, + { + "epoch": 1.7009286350337107, + "ewc_loss": 0.007948046550154686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.948046550154686e-05, + "grad_norm": 3.942736864089966, + "learning_rate": 1e-06, + "loss": 0.3565, + 
"mean_token_accuracy": 0.876211404800415, + "num_tokens": 510006021.0, + "step": 13371 + }, + { + "epoch": 1.7010558453123013, + "ewc_loss": 0.007995236665010452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.995236228452995e-05, + "grad_norm": 3.9172091484069824, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8812552690505981, + "num_tokens": 510042111.0, + "step": 13372 + }, + { + "epoch": 1.7011830555908918, + "ewc_loss": 0.007962299510836601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96229942352511e-05, + "grad_norm": 3.9747416973114014, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8802513480186462, + "num_tokens": 510074894.0, + "step": 13373 + }, + { + "epoch": 1.7013102658694823, + "ewc_loss": 0.007988003082573414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.988003198988736e-05, + "grad_norm": 3.950486898422241, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8671941757202148, + "num_tokens": 510111516.0, + "step": 13374 + }, + { + "epoch": 1.7014374761480728, + "ewc_loss": 0.007976004853844643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976005144882947e-05, + "grad_norm": 3.9445230960845947, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8792970776557922, + "num_tokens": 510151549.0, + "step": 13375 + }, + { + "epoch": 1.7015646864266634, + "ewc_loss": 0.007960578426718712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.960578659549356e-05, + "grad_norm": 3.924644708633423, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8757802248001099, + "num_tokens": 510189215.0, + "step": 13376 + }, + { + "epoch": 1.701691896705254, + "ewc_loss": 0.007962467148900032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.962466770550236e-05, + "grad_norm": 3.8748767375946045, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8942215442657471, + "num_tokens": 510231189.0, + "step": 13377 + }, + { + "epoch": 
1.7018191069838444, + "ewc_loss": 0.007925671525299549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.925671525299549e-05, + "grad_norm": 3.919005870819092, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8796310424804688, + "num_tokens": 510273956.0, + "step": 13378 + }, + { + "epoch": 1.701946317262435, + "ewc_loss": 0.007967659272253513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.967658893903717e-05, + "grad_norm": 3.9257476329803467, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8898785710334778, + "num_tokens": 510315095.0, + "step": 13379 + }, + { + "epoch": 1.7020735275410255, + "ewc_loss": 0.007922140881419182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.922141230665147e-05, + "grad_norm": 3.9987196922302246, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8554134368896484, + "num_tokens": 510350753.0, + "step": 13380 + }, + { + "epoch": 1.7022007378196158, + "ewc_loss": 0.007960979826748371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.960979564813897e-05, + "grad_norm": 3.925739049911499, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8812316656112671, + "num_tokens": 510389257.0, + "step": 13381 + }, + { + "epoch": 1.7023279480982063, + "ewc_loss": 0.007905114442110062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.905114034656435e-05, + "grad_norm": 3.952970266342163, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8758950233459473, + "num_tokens": 510428879.0, + "step": 13382 + }, + { + "epoch": 1.7024551583767968, + "ewc_loss": 0.007946690544486046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.946690311655402e-05, + "grad_norm": 3.924003839492798, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8784698843955994, + "num_tokens": 510468409.0, + "step": 13383 + }, + { + "epoch": 1.7025823686553874, + "ewc_loss": 0.007908221334218979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.908220868557692e-05, + "grad_norm": 3.908010721206665, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8904616832733154, + "num_tokens": 510503988.0, + "step": 13384 + }, + { + "epoch": 1.702709578933978, + "ewc_loss": 0.00791068747639656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9106874181889e-05, + "grad_norm": 3.999297857284546, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8733813762664795, + "num_tokens": 510539009.0, + "step": 13385 + }, + { + "epoch": 1.7028367892125684, + "ewc_loss": 0.00797012448310852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.970124715939164e-05, + "grad_norm": 3.91953182220459, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8907862305641174, + "num_tokens": 510576177.0, + "step": 13386 + }, + { + "epoch": 1.7029639994911587, + "ewc_loss": 0.007891091518104076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89109108154662e-05, + "grad_norm": 3.9088077545166016, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.876293957233429, + "num_tokens": 510622194.0, + "step": 13387 + }, + { + "epoch": 1.7030912097697493, + "ewc_loss": 0.00791478343307972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.914783054729924e-05, + "grad_norm": 3.905027151107788, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8916156888008118, + "num_tokens": 510663564.0, + "step": 13388 + }, + { + "epoch": 1.7032184200483398, + "ewc_loss": 0.007886398583650589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.886398816481233e-05, + "grad_norm": 3.9188120365142822, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8868645429611206, + "num_tokens": 510702153.0, + "step": 13389 + }, + { + "epoch": 1.7033456303269303, + "ewc_loss": 0.007898214273154736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.898214244050905e-05, + "grad_norm": 3.956679582595825, + "learning_rate": 1e-06, + "loss": 0.375, + 
"mean_token_accuracy": 0.8718352913856506, + "num_tokens": 510742812.0, + "step": 13390 + }, + { + "epoch": 1.7034728406055208, + "ewc_loss": 0.00792516116052866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.925161480670795e-05, + "grad_norm": 3.95963454246521, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8711273074150085, + "num_tokens": 510783363.0, + "step": 13391 + }, + { + "epoch": 1.7036000508841114, + "ewc_loss": 0.00788737554103136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.887375977588817e-05, + "grad_norm": 3.9651217460632324, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8620384335517883, + "num_tokens": 510818362.0, + "step": 13392 + }, + { + "epoch": 1.703727261162702, + "ewc_loss": 0.007894541136920452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.894541340647265e-05, + "grad_norm": 3.9175424575805664, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8881598711013794, + "num_tokens": 510854732.0, + "step": 13393 + }, + { + "epoch": 1.7038544714412924, + "ewc_loss": 0.007873747497797012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.87374738138169e-05, + "grad_norm": 3.920775890350342, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8705875873565674, + "num_tokens": 510895107.0, + "step": 13394 + }, + { + "epoch": 1.703981681719883, + "ewc_loss": 0.007895774208009243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89577461546287e-05, + "grad_norm": 3.9922900199890137, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8734108209609985, + "num_tokens": 510930737.0, + "step": 13395 + }, + { + "epoch": 1.7041088919984735, + "ewc_loss": 0.007925786077976227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.925786485429853e-05, + "grad_norm": 3.969727039337158, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8816761374473572, + "num_tokens": 510964646.0, + "step": 13396 + }, + { + "epoch": 
1.704236102277064, + "ewc_loss": 0.00791301392018795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.913014269433916e-05, + "grad_norm": 3.9551374912261963, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8793039917945862, + "num_tokens": 511000583.0, + "step": 13397 + }, + { + "epoch": 1.7043633125556545, + "ewc_loss": 0.007924525998532772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924525561975315e-05, + "grad_norm": 3.9644038677215576, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8786650896072388, + "num_tokens": 511033729.0, + "step": 13398 + }, + { + "epoch": 1.704490522834245, + "ewc_loss": 0.007924320176243782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924320379970595e-05, + "grad_norm": 3.941283941268921, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8820974826812744, + "num_tokens": 511074099.0, + "step": 13399 + }, + { + "epoch": 1.7046177331128356, + "ewc_loss": 0.00789667945355177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.896679744590074e-05, + "grad_norm": 3.9920873641967773, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8649552464485168, + "num_tokens": 511112473.0, + "step": 13400 + }, + { + "epoch": 1.7047449433914261, + "ewc_loss": 0.0079539455473423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953945168992504e-05, + "grad_norm": 3.9428341388702393, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8764427304267883, + "num_tokens": 511149839.0, + "step": 13401 + }, + { + "epoch": 1.7048721536700167, + "ewc_loss": 0.00789386685937643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.89386685937643e-05, + "grad_norm": 3.9948320388793945, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8672475814819336, + "num_tokens": 511183054.0, + "step": 13402 + }, + { + "epoch": 1.7049993639486072, + "ewc_loss": 0.007963491603732109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.963491225382313e-05, + "grad_norm": 4.114780902862549, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8799105882644653, + "num_tokens": 511218498.0, + "step": 13403 + }, + { + "epoch": 1.7051265742271977, + "ewc_loss": 0.008003081195056438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003081165952608e-05, + "grad_norm": 3.8965835571289062, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8723248243331909, + "num_tokens": 511259776.0, + "step": 13404 + }, + { + "epoch": 1.705253784505788, + "ewc_loss": 0.007867848500609398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86784803494811e-05, + "grad_norm": 3.9339187145233154, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8833519220352173, + "num_tokens": 511297331.0, + "step": 13405 + }, + { + "epoch": 1.7053809947843785, + "ewc_loss": 0.007968051359057426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.968051795614883e-05, + "grad_norm": 3.9291632175445557, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8883687853813171, + "num_tokens": 511331055.0, + "step": 13406 + }, + { + "epoch": 1.705508205062969, + "ewc_loss": 0.00794792827218771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.947927952045575e-05, + "grad_norm": 3.92301869392395, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8939184546470642, + "num_tokens": 511368651.0, + "step": 13407 + }, + { + "epoch": 1.7056354153415596, + "ewc_loss": 0.007966231554746628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.966231351019815e-05, + "grad_norm": 3.9716596603393555, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8817068338394165, + "num_tokens": 511402163.0, + "step": 13408 + }, + { + "epoch": 1.7057626256201501, + "ewc_loss": 0.007993543520569801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.993543840711936e-05, + "grad_norm": 3.904670476913452, + "learning_rate": 1e-06, + "loss": 0.3816, + 
"mean_token_accuracy": 0.8692648410797119, + "num_tokens": 511443769.0, + "step": 13409 + }, + { + "epoch": 1.7058898358987407, + "ewc_loss": 0.007953958585858345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953958993311971e-05, + "grad_norm": 3.929387331008911, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8734482526779175, + "num_tokens": 511481038.0, + "step": 13410 + }, + { + "epoch": 1.706017046177331, + "ewc_loss": 0.007972102612257004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.97210304881446e-05, + "grad_norm": 3.902846336364746, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8781942129135132, + "num_tokens": 511522940.0, + "step": 13411 + }, + { + "epoch": 1.7061442564559215, + "ewc_loss": 0.007962701842188835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.962701783981174e-05, + "grad_norm": 3.942899227142334, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8729189038276672, + "num_tokens": 511562630.0, + "step": 13412 + }, + { + "epoch": 1.706271466734512, + "ewc_loss": 0.008008859120309353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.008859003894031e-05, + "grad_norm": 3.9877638816833496, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8629617691040039, + "num_tokens": 511598986.0, + "step": 13413 + }, + { + "epoch": 1.7063986770131025, + "ewc_loss": 0.008006681688129902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.006681309780106e-05, + "grad_norm": 3.9866819381713867, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8783407211303711, + "num_tokens": 511633582.0, + "step": 13414 + }, + { + "epoch": 1.706525887291693, + "ewc_loss": 0.00798949133604765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.989491132320836e-05, + "grad_norm": 3.9270379543304443, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.876558780670166, + "num_tokens": 511673296.0, + "step": 13415 + }, + { + "epoch": 
1.7066530975702836, + "ewc_loss": 0.007971041835844517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.971041486598551e-05, + "grad_norm": 4.032413482666016, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8682619333267212, + "num_tokens": 511707890.0, + "step": 13416 + }, + { + "epoch": 1.7067803078488741, + "ewc_loss": 0.008046424947679043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.04642477305606e-05, + "grad_norm": 3.9587252140045166, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8822212815284729, + "num_tokens": 511741597.0, + "step": 13417 + }, + { + "epoch": 1.7069075181274647, + "ewc_loss": 0.007982631213963032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.982631359482184e-05, + "grad_norm": 3.985318183898926, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8794856071472168, + "num_tokens": 511776816.0, + "step": 13418 + }, + { + "epoch": 1.7070347284060552, + "ewc_loss": 0.008020181208848953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.020181121537462e-05, + "grad_norm": 3.957245349884033, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.869438886642456, + "num_tokens": 511814866.0, + "step": 13419 + }, + { + "epoch": 1.7071619386846457, + "ewc_loss": 0.008000598289072514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.000597881618887e-05, + "grad_norm": 3.9197590351104736, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.878474771976471, + "num_tokens": 511852458.0, + "step": 13420 + }, + { + "epoch": 1.7072891489632362, + "ewc_loss": 0.007980551570653915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.98055189079605e-05, + "grad_norm": 3.9451935291290283, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8704769015312195, + "num_tokens": 511891201.0, + "step": 13421 + }, + { + "epoch": 1.7074163592418268, + "ewc_loss": 0.008027716539800167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.02771610324271e-05, + "grad_norm": 3.934563159942627, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8754369020462036, + "num_tokens": 511930862.0, + "step": 13422 + }, + { + "epoch": 1.7075435695204173, + "ewc_loss": 0.007994468323886395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.994467887328938e-05, + "grad_norm": 3.913654327392578, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8836554884910583, + "num_tokens": 511967695.0, + "step": 13423 + }, + { + "epoch": 1.7076707797990078, + "ewc_loss": 0.007998750545084476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.998750515980646e-05, + "grad_norm": 3.8940789699554443, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8894438743591309, + "num_tokens": 512005976.0, + "step": 13424 + }, + { + "epoch": 1.7077979900775984, + "ewc_loss": 0.007990196347236633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99019617261365e-05, + "grad_norm": 3.9609031677246094, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8779294490814209, + "num_tokens": 512041575.0, + "step": 13425 + }, + { + "epoch": 1.7079252003561889, + "ewc_loss": 0.008043719455599785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.043719572015107e-05, + "grad_norm": 3.939216136932373, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8703164458274841, + "num_tokens": 512076171.0, + "step": 13426 + }, + { + "epoch": 1.7080524106347794, + "ewc_loss": 0.008009477518498898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.009477460291237e-05, + "grad_norm": 3.9254696369171143, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8730349540710449, + "num_tokens": 512112082.0, + "step": 13427 + }, + { + "epoch": 1.70817962091337, + "ewc_loss": 0.007994221523404121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.994221959961578e-05, + "grad_norm": 3.9532370567321777, + "learning_rate": 1e-06, + "loss": 0.4038, + 
"mean_token_accuracy": 0.8618900775909424, + "num_tokens": 512149126.0, + "step": 13428 + }, + { + "epoch": 1.7083068311919605, + "ewc_loss": 0.008020924404263496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.020924724405631e-05, + "grad_norm": 3.9660258293151855, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8774186372756958, + "num_tokens": 512182368.0, + "step": 13429 + }, + { + "epoch": 1.7084340414705508, + "ewc_loss": 0.00800437293946743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.004373376024887e-05, + "grad_norm": 3.9772725105285645, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8579978346824646, + "num_tokens": 512218224.0, + "step": 13430 + }, + { + "epoch": 1.7085612517491413, + "ewc_loss": 0.008020974695682526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.020974928513169e-05, + "grad_norm": 3.934030294418335, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.875392496585846, + "num_tokens": 512255523.0, + "step": 13431 + }, + { + "epoch": 1.7086884620277318, + "ewc_loss": 0.007987475022673607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.987474964465946e-05, + "grad_norm": 3.9261982440948486, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8849598169326782, + "num_tokens": 512290514.0, + "step": 13432 + }, + { + "epoch": 1.7088156723063224, + "ewc_loss": 0.00801484752446413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.014847844606265e-05, + "grad_norm": 3.9225423336029053, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8853142857551575, + "num_tokens": 512328468.0, + "step": 13433 + }, + { + "epoch": 1.708942882584913, + "ewc_loss": 0.008002876304090023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00287671154365e-05, + "grad_norm": 3.994328022003174, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8760882616043091, + "num_tokens": 512359786.0, + "step": 13434 + }, + { + "epoch": 
1.7090700928635034, + "ewc_loss": 0.008039684034883976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039683598326519e-05, + "grad_norm": 4.026595592498779, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8549530506134033, + "num_tokens": 512395679.0, + "step": 13435 + }, + { + "epoch": 1.7091973031420937, + "ewc_loss": 0.008033888414502144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.033888298086822e-05, + "grad_norm": 3.9164795875549316, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8744034767150879, + "num_tokens": 512433491.0, + "step": 13436 + }, + { + "epoch": 1.7093245134206843, + "ewc_loss": 0.007954063825309277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.954063767101616e-05, + "grad_norm": 3.942774534225464, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8720740079879761, + "num_tokens": 512470215.0, + "step": 13437 + }, + { + "epoch": 1.7094517236992748, + "ewc_loss": 0.008033616468310356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.033616904867813e-05, + "grad_norm": 3.9577267169952393, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.871338427066803, + "num_tokens": 512507055.0, + "step": 13438 + }, + { + "epoch": 1.7095789339778653, + "ewc_loss": 0.008019485510885715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.019485539989546e-05, + "grad_norm": 3.9585812091827393, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8588643670082092, + "num_tokens": 512547877.0, + "step": 13439 + }, + { + "epoch": 1.7097061442564558, + "ewc_loss": 0.008030545897781849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.030545723158866e-05, + "grad_norm": 3.9483730792999268, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8620390892028809, + "num_tokens": 512586558.0, + "step": 13440 + }, + { + "epoch": 1.7098333545350464, + "ewc_loss": 0.008029822260141373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.029822492972016e-05, + "grad_norm": 3.9404685497283936, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.873435914516449, + "num_tokens": 512631181.0, + "step": 13441 + }, + { + "epoch": 1.709960564813637, + "ewc_loss": 0.008055372163653374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055372018134221e-05, + "grad_norm": 3.935281991958618, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8864065408706665, + "num_tokens": 512667925.0, + "step": 13442 + }, + { + "epoch": 1.7100877750922274, + "ewc_loss": 0.008028976619243622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.028976299101487e-05, + "grad_norm": 4.009232521057129, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8568148016929626, + "num_tokens": 512703136.0, + "step": 13443 + }, + { + "epoch": 1.710214985370818, + "ewc_loss": 0.008090011775493622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.090011397143826e-05, + "grad_norm": 3.986844301223755, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8792471885681152, + "num_tokens": 512739001.0, + "step": 13444 + }, + { + "epoch": 1.7103421956494085, + "ewc_loss": 0.008016406558454037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.016406354727224e-05, + "grad_norm": 4.8682684898376465, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8800662159919739, + "num_tokens": 512780517.0, + "step": 13445 + }, + { + "epoch": 1.710469405927999, + "ewc_loss": 0.008600123226642609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600122964708135e-05, + "grad_norm": 3.984602212905884, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8813091516494751, + "num_tokens": 512815964.0, + "step": 13446 + }, + { + "epoch": 1.7105966162065895, + "ewc_loss": 0.007813999429345131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8139994002413e-05, + "grad_norm": 3.824366807937622, + "learning_rate": 1e-06, + "loss": 0.3477, + 
"mean_token_accuracy": 0.8812401294708252, + "num_tokens": 512855451.0, + "step": 13447 + }, + { + "epoch": 1.71072382648518, + "ewc_loss": 0.008019656874239445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.019656524993479e-05, + "grad_norm": 3.9274790287017822, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8781155347824097, + "num_tokens": 512895809.0, + "step": 13448 + }, + { + "epoch": 1.7108510367637706, + "ewc_loss": 0.008092223666608334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.092224015854299e-05, + "grad_norm": 3.905035972595215, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8811358213424683, + "num_tokens": 512936424.0, + "step": 13449 + }, + { + "epoch": 1.7109782470423611, + "ewc_loss": 0.008015105500817299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.015105413505808e-05, + "grad_norm": 3.887263536453247, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8888999819755554, + "num_tokens": 512976486.0, + "step": 13450 + }, + { + "epoch": 1.7111054573209517, + "ewc_loss": 0.00802615750581026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026157593121752e-05, + "grad_norm": 3.9510958194732666, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8668591976165771, + "num_tokens": 513014131.0, + "step": 13451 + }, + { + "epoch": 1.7112326675995422, + "ewc_loss": 0.008074271492660046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074271318037063e-05, + "grad_norm": 3.951655864715576, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8487650752067566, + "num_tokens": 513054706.0, + "step": 13452 + }, + { + "epoch": 1.7113598778781327, + "ewc_loss": 0.008049516007304192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.049515599850565e-05, + "grad_norm": 3.9233670234680176, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8802878856658936, + "num_tokens": 513094089.0, + "step": 13453 + }, + { + "epoch": 
1.711487088156723, + "ewc_loss": 0.008035268634557724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035268547246233e-05, + "grad_norm": 3.9220755100250244, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8738025426864624, + "num_tokens": 513136306.0, + "step": 13454 + }, + { + "epoch": 1.7116142984353135, + "ewc_loss": 0.008039163425564766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039163367357105e-05, + "grad_norm": 3.9730758666992188, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.872897744178772, + "num_tokens": 513174376.0, + "step": 13455 + }, + { + "epoch": 1.711741508713904, + "ewc_loss": 0.008055790327489376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055790385697037e-05, + "grad_norm": 3.947387933731079, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8783440589904785, + "num_tokens": 513211834.0, + "step": 13456 + }, + { + "epoch": 1.7118687189924946, + "ewc_loss": 0.00802144967019558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.021450048545375e-05, + "grad_norm": 4.054102897644043, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8676525354385376, + "num_tokens": 513241765.0, + "step": 13457 + }, + { + "epoch": 1.7119959292710851, + "ewc_loss": 0.008102315478026867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.10231504146941e-05, + "grad_norm": 3.9456841945648193, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8636764883995056, + "num_tokens": 513283510.0, + "step": 13458 + }, + { + "epoch": 1.7121231395496757, + "ewc_loss": 0.007978186011314392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.978185749379918e-05, + "grad_norm": 4.021805286407471, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8726488351821899, + "num_tokens": 513320746.0, + "step": 13459 + }, + { + "epoch": 1.712250349828266, + "ewc_loss": 0.008068643510341644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.068643364822492e-05, + "grad_norm": 3.976867198944092, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8727271556854248, + "num_tokens": 513358036.0, + "step": 13460 + }, + { + "epoch": 1.7123775601068565, + "ewc_loss": 0.007998364977538586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.998364890227094e-05, + "grad_norm": 3.9317257404327393, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8800574541091919, + "num_tokens": 513395156.0, + "step": 13461 + }, + { + "epoch": 1.712504770385447, + "ewc_loss": 0.00799858570098877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.998586079338565e-05, + "grad_norm": 3.9309608936309814, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8689361810684204, + "num_tokens": 513438631.0, + "step": 13462 + }, + { + "epoch": 1.7126319806640375, + "ewc_loss": 0.008031632751226425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031632751226425e-05, + "grad_norm": 4.1053786277771, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8669655323028564, + "num_tokens": 513471225.0, + "step": 13463 + }, + { + "epoch": 1.712759190942628, + "ewc_loss": 0.008140534162521362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.140534191625193e-05, + "grad_norm": 3.922084093093872, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8671420216560364, + "num_tokens": 513513824.0, + "step": 13464 + }, + { + "epoch": 1.7128864012212186, + "ewc_loss": 0.007974005304276943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.974004984134808e-05, + "grad_norm": 3.99003267288208, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8697006702423096, + "num_tokens": 513556263.0, + "step": 13465 + }, + { + "epoch": 1.7130136114998091, + "ewc_loss": 0.008077929727733135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.077929669525474e-05, + "grad_norm": 3.9415230751037598, + "learning_rate": 1e-06, + "loss": 0.3436, + 
"mean_token_accuracy": 0.8786760568618774, + "num_tokens": 513595093.0, + "step": 13466 + }, + { + "epoch": 1.7131408217783997, + "ewc_loss": 0.008014858700335026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.014858758542687e-05, + "grad_norm": 3.953216314315796, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8676024675369263, + "num_tokens": 513633761.0, + "step": 13467 + }, + { + "epoch": 1.7132680320569902, + "ewc_loss": 0.008035318925976753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03531875135377e-05, + "grad_norm": 3.9554450511932373, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8711748123168945, + "num_tokens": 513671404.0, + "step": 13468 + }, + { + "epoch": 1.7133952423355807, + "ewc_loss": 0.008024406619369984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02440699771978e-05, + "grad_norm": 3.901519298553467, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8762465715408325, + "num_tokens": 513713752.0, + "step": 13469 + }, + { + "epoch": 1.7135224526141712, + "ewc_loss": 0.00799950398504734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.999503577593714e-05, + "grad_norm": 3.9437949657440186, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8710534572601318, + "num_tokens": 513755261.0, + "step": 13470 + }, + { + "epoch": 1.7136496628927618, + "ewc_loss": 0.008052334189414978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.0523343058303e-05, + "grad_norm": 3.9001448154449463, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8782508373260498, + "num_tokens": 513799042.0, + "step": 13471 + }, + { + "epoch": 1.7137768731713523, + "ewc_loss": 0.007997368462383747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997368084033951e-05, + "grad_norm": 3.9664928913116455, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8651396036148071, + "num_tokens": 513838148.0, + "step": 13472 + }, + { + "epoch": 
1.7139040834499428, + "ewc_loss": 0.008044221438467503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.044221613090485e-05, + "grad_norm": 3.910513162612915, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.881078839302063, + "num_tokens": 513878960.0, + "step": 13473 + }, + { + "epoch": 1.7140312937285334, + "ewc_loss": 0.007995828986167908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.995829218998551e-05, + "grad_norm": 3.992832660675049, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8760700225830078, + "num_tokens": 513914083.0, + "step": 13474 + }, + { + "epoch": 1.7141585040071239, + "ewc_loss": 0.00803354475647211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.033544872887433e-05, + "grad_norm": 3.9784512519836426, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8627961874008179, + "num_tokens": 513951591.0, + "step": 13475 + }, + { + "epoch": 1.7142857142857144, + "ewc_loss": 0.007997997105121613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997996726771817e-05, + "grad_norm": 3.9848787784576416, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8657781481742859, + "num_tokens": 513986127.0, + "step": 13476 + }, + { + "epoch": 1.714412924564305, + "ewc_loss": 0.00802684761583805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026848081499338e-05, + "grad_norm": 3.955920934677124, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8700954914093018, + "num_tokens": 514028248.0, + "step": 13477 + }, + { + "epoch": 1.7145401348428955, + "ewc_loss": 0.007978193461894989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.978193752933294e-05, + "grad_norm": 3.9888737201690674, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8772008419036865, + "num_tokens": 514062757.0, + "step": 13478 + }, + { + "epoch": 1.7146673451214858, + "ewc_loss": 0.008011448197066784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.011447789613158e-05, + "grad_norm": 3.979440689086914, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8619973659515381, + "num_tokens": 514102635.0, + "step": 13479 + }, + { + "epoch": 1.7147945554000763, + "ewc_loss": 0.007982888258993626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.982888200785965e-05, + "grad_norm": 3.961411952972412, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8740427494049072, + "num_tokens": 514140883.0, + "step": 13480 + }, + { + "epoch": 1.7149217656786668, + "ewc_loss": 0.00797799602150917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977995846886188e-05, + "grad_norm": 3.9731805324554443, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.883527398109436, + "num_tokens": 514176536.0, + "step": 13481 + }, + { + "epoch": 1.7150489759572574, + "ewc_loss": 0.007993560284376144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.993559847818688e-05, + "grad_norm": 3.983818292617798, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8704984188079834, + "num_tokens": 514209969.0, + "step": 13482 + }, + { + "epoch": 1.7151761862358479, + "ewc_loss": 0.007989104837179184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.989104778971523e-05, + "grad_norm": 3.9208972454071045, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.874870777130127, + "num_tokens": 514249679.0, + "step": 13483 + }, + { + "epoch": 1.7153033965144384, + "ewc_loss": 0.007962669245898724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96266904217191e-05, + "grad_norm": 3.917590618133545, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8691516518592834, + "num_tokens": 514292719.0, + "step": 13484 + }, + { + "epoch": 1.7154306067930287, + "ewc_loss": 0.00799306109547615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.993060717126355e-05, + "grad_norm": 4.0380539894104, + "learning_rate": 1e-06, + "loss": 0.3771, + 
"mean_token_accuracy": 0.8705774545669556, + "num_tokens": 514323396.0, + "step": 13485 + }, + { + "epoch": 1.7155578170716193, + "ewc_loss": 0.008052556775510311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.052556950133294e-05, + "grad_norm": 3.9266176223754883, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8807485103607178, + "num_tokens": 514360951.0, + "step": 13486 + }, + { + "epoch": 1.7156850273502098, + "ewc_loss": 0.00796942412853241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.969424041220918e-05, + "grad_norm": 3.9590210914611816, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.859839677810669, + "num_tokens": 514403879.0, + "step": 13487 + }, + { + "epoch": 1.7158122376288003, + "ewc_loss": 0.008032999001443386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.032999176066369e-05, + "grad_norm": 3.916086435317993, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8822500705718994, + "num_tokens": 514446628.0, + "step": 13488 + }, + { + "epoch": 1.7159394479073908, + "ewc_loss": 0.007990703918039799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99070403445512e-05, + "grad_norm": 3.9504313468933105, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8757314682006836, + "num_tokens": 514486132.0, + "step": 13489 + }, + { + "epoch": 1.7160666581859814, + "ewc_loss": 0.008029242977499962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.029243326745927e-05, + "grad_norm": 3.9444692134857178, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8645080924034119, + "num_tokens": 514526384.0, + "step": 13490 + }, + { + "epoch": 1.716193868464572, + "ewc_loss": 0.008000783622264862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.000783418538049e-05, + "grad_norm": 3.957523822784424, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8737680315971375, + "num_tokens": 514565828.0, + "step": 13491 + }, + { + "epoch": 
1.7163210787431624, + "ewc_loss": 0.008007267490029335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007267751963809e-05, + "grad_norm": 3.9619836807250977, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8699822425842285, + "num_tokens": 514605260.0, + "step": 13492 + }, + { + "epoch": 1.716448289021753, + "ewc_loss": 0.008002743124961853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.002742833923548e-05, + "grad_norm": 3.957091808319092, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8745970129966736, + "num_tokens": 514643433.0, + "step": 13493 + }, + { + "epoch": 1.7165754993003435, + "ewc_loss": 0.007998964749276638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.998964429134503e-05, + "grad_norm": 3.9231340885162354, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8786294460296631, + "num_tokens": 514683264.0, + "step": 13494 + }, + { + "epoch": 1.716702709578934, + "ewc_loss": 0.007977240718901157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977240602485836e-05, + "grad_norm": 3.958202362060547, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8816535472869873, + "num_tokens": 514719775.0, + "step": 13495 + }, + { + "epoch": 1.7168299198575245, + "ewc_loss": 0.00799435842782259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.994358020368963e-05, + "grad_norm": 3.887582302093506, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8682827949523926, + "num_tokens": 514766287.0, + "step": 13496 + }, + { + "epoch": 1.716957130136115, + "ewc_loss": 0.007945832796394825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94583247625269e-05, + "grad_norm": 3.9322829246520996, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8856576681137085, + "num_tokens": 514806725.0, + "step": 13497 + }, + { + "epoch": 1.7170843404147056, + "ewc_loss": 0.007997612468898296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.997612556209788e-05, + "grad_norm": 3.957026958465576, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8781273365020752, + "num_tokens": 514844008.0, + "step": 13498 + }, + { + "epoch": 1.7172115506932961, + "ewc_loss": 0.007973698899149895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.973698666319251e-05, + "grad_norm": 4.002906799316406, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8634032607078552, + "num_tokens": 514880356.0, + "step": 13499 + }, + { + "epoch": 1.7173387609718866, + "ewc_loss": 0.007984143681824207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.984143303474411e-05, + "grad_norm": 3.933955669403076, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8874524831771851, + "num_tokens": 514915309.0, + "step": 13500 + }, + { + "epoch": 1.7174659712504772, + "ewc_loss": 0.007923469878733158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.923469820525497e-05, + "grad_norm": 3.9186718463897705, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8664618134498596, + "num_tokens": 514956496.0, + "step": 13501 + }, + { + "epoch": 1.7175931815290677, + "ewc_loss": 0.007963044568896294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.963044481584802e-05, + "grad_norm": 3.9834179878234863, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8751335740089417, + "num_tokens": 514992247.0, + "step": 13502 + }, + { + "epoch": 1.717720391807658, + "ewc_loss": 0.008004743605852127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.004743722267449e-05, + "grad_norm": 3.906801223754883, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8657655715942383, + "num_tokens": 515035443.0, + "step": 13503 + }, + { + "epoch": 1.7178476020862485, + "ewc_loss": 0.007917197421193123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91719721746631e-05, + "grad_norm": 3.945100784301758, + "learning_rate": 1e-06, + "loss": 0.3455, + 
"mean_token_accuracy": 0.877642810344696, + "num_tokens": 515072279.0, + "step": 13504 + }, + { + "epoch": 1.717974812364839, + "ewc_loss": 0.007987549528479576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.98754917923361e-05, + "grad_norm": 3.9254941940307617, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8734660744667053, + "num_tokens": 515112547.0, + "step": 13505 + }, + { + "epoch": 1.7181020226434296, + "ewc_loss": 0.00794123113155365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94123116065748e-05, + "grad_norm": 4.028872489929199, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8637878894805908, + "num_tokens": 515146814.0, + "step": 13506 + }, + { + "epoch": 1.7182292329220201, + "ewc_loss": 0.007992701604962349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992701284820214e-05, + "grad_norm": 3.920029401779175, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8777329921722412, + "num_tokens": 515184393.0, + "step": 13507 + }, + { + "epoch": 1.7183564432006107, + "ewc_loss": 0.007897921837866306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.897921750554815e-05, + "grad_norm": 3.9321024417877197, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8676360845565796, + "num_tokens": 515225312.0, + "step": 13508 + }, + { + "epoch": 1.718483653479201, + "ewc_loss": 0.007961689494550228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.961689698277041e-05, + "grad_norm": 4.036144733428955, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.852401077747345, + "num_tokens": 515261330.0, + "step": 13509 + }, + { + "epoch": 1.7186108637577915, + "ewc_loss": 0.007996905595064163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99690606072545e-05, + "grad_norm": 3.92297625541687, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8803075551986694, + "num_tokens": 515299325.0, + "step": 13510 + }, + { + "epoch": 
1.718738074036382, + "ewc_loss": 0.0079107116907835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.910711428849027e-05, + "grad_norm": 3.96475887298584, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8756866455078125, + "num_tokens": 515335213.0, + "step": 13511 + }, + { + "epoch": 1.7188652843149725, + "ewc_loss": 0.007968174293637276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.968174759298563e-05, + "grad_norm": 3.99635910987854, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8863519430160522, + "num_tokens": 515365699.0, + "step": 13512 + }, + { + "epoch": 1.718992494593563, + "ewc_loss": 0.00798716675490141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.987166463863105e-05, + "grad_norm": 4.033605575561523, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8826617002487183, + "num_tokens": 515396025.0, + "step": 13513 + }, + { + "epoch": 1.7191197048721536, + "ewc_loss": 0.007999474182724953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.999474473763257e-05, + "grad_norm": 4.01757287979126, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8752405047416687, + "num_tokens": 515429724.0, + "step": 13514 + }, + { + "epoch": 1.7192469151507441, + "ewc_loss": 0.007992355152964592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992354949237779e-05, + "grad_norm": 3.94210147857666, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8698550462722778, + "num_tokens": 515465894.0, + "step": 13515 + }, + { + "epoch": 1.7193741254293347, + "ewc_loss": 0.00796877034008503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.968770660227165e-05, + "grad_norm": 3.981199026107788, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8924835920333862, + "num_tokens": 515498894.0, + "step": 13516 + }, + { + "epoch": 1.7195013357079252, + "ewc_loss": 0.008023756556212902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023756527109072e-05, + 
"grad_norm": 3.9613144397735596, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.874297022819519, + "num_tokens": 515538783.0, + "step": 13517 + }, + { + "epoch": 1.7196285459865157, + "ewc_loss": 0.008007027208805084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007027645362541e-05, + "grad_norm": 3.995387077331543, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8732940554618835, + "num_tokens": 515573107.0, + "step": 13518 + }, + { + "epoch": 1.7197557562651062, + "ewc_loss": 0.008028379641473293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.028379670577124e-05, + "grad_norm": 3.9194326400756836, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8757564425468445, + "num_tokens": 515614901.0, + "step": 13519 + }, + { + "epoch": 1.7198829665436968, + "ewc_loss": 0.007985033094882965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.985033153090626e-05, + "grad_norm": 3.9444496631622314, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.877423882484436, + "num_tokens": 515654197.0, + "step": 13520 + }, + { + "epoch": 1.7200101768222873, + "ewc_loss": 0.008028191514313221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.028191223274916e-05, + "grad_norm": 3.896803140640259, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8922501802444458, + "num_tokens": 515697003.0, + "step": 13521 + }, + { + "epoch": 1.7201373871008778, + "ewc_loss": 0.007990404032170773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.990404265001416e-05, + "grad_norm": 3.9867448806762695, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8705525994300842, + "num_tokens": 515737801.0, + "step": 13522 + }, + { + "epoch": 1.7202645973794684, + "ewc_loss": 0.00806418713182211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.064187568379566e-05, + "grad_norm": 3.999633550643921, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 
0.8671033382415771, + "num_tokens": 515774711.0, + "step": 13523 + }, + { + "epoch": 1.7203918076580589, + "ewc_loss": 0.00802131462842226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.021314715733752e-05, + "grad_norm": 3.971707820892334, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8668596148490906, + "num_tokens": 515811728.0, + "step": 13524 + }, + { + "epoch": 1.7205190179366494, + "ewc_loss": 0.008000598289072514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.000597881618887e-05, + "grad_norm": 3.9553637504577637, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.87962406873703, + "num_tokens": 515844425.0, + "step": 13525 + }, + { + "epoch": 1.72064622821524, + "ewc_loss": 0.008029733784496784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.029733726289123e-05, + "grad_norm": 3.968493938446045, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8769501447677612, + "num_tokens": 515882439.0, + "step": 13526 + }, + { + "epoch": 1.7207734384938305, + "ewc_loss": 0.008010750636458397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.010750752873719e-05, + "grad_norm": 3.9949052333831787, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8530519604682922, + "num_tokens": 515922168.0, + "step": 13527 + }, + { + "epoch": 1.7209006487724208, + "ewc_loss": 0.008034531958401203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.034532220335677e-05, + "grad_norm": 3.901921272277832, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8816708326339722, + "num_tokens": 515961840.0, + "step": 13528 + }, + { + "epoch": 1.7210278590510113, + "ewc_loss": 0.007957463152706623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.957463094498962e-05, + "grad_norm": 3.9482600688934326, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8759930729866028, + "num_tokens": 515997586.0, + "step": 13529 + }, + { + "epoch": 1.7211550693296018, + 
"ewc_loss": 0.008026561699807644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026562136365101e-05, + "grad_norm": 3.9353229999542236, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8808936476707458, + "num_tokens": 516036660.0, + "step": 13530 + }, + { + "epoch": 1.7212822796081924, + "ewc_loss": 0.007991279475390911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.991279562702402e-05, + "grad_norm": 4.002745628356934, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8693374395370483, + "num_tokens": 516070846.0, + "step": 13531 + }, + { + "epoch": 1.7214094898867829, + "ewc_loss": 0.008046972565352917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.046972652664408e-05, + "grad_norm": 3.9300951957702637, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8953882455825806, + "num_tokens": 516107027.0, + "step": 13532 + }, + { + "epoch": 1.7215367001653734, + "ewc_loss": 0.007980946451425552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.980946247698739e-05, + "grad_norm": 3.966477870941162, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8756961822509766, + "num_tokens": 516143971.0, + "step": 13533 + }, + { + "epoch": 1.7216639104439637, + "ewc_loss": 0.008026034571230412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026034629438072e-05, + "grad_norm": 3.924199104309082, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8779119253158569, + "num_tokens": 516180617.0, + "step": 13534 + }, + { + "epoch": 1.7217911207225542, + "ewc_loss": 0.008003626950085163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003626862773672e-05, + "grad_norm": 3.93579363822937, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8756212592124939, + "num_tokens": 516220371.0, + "step": 13535 + }, + { + "epoch": 1.7219183310011448, + "ewc_loss": 0.008024310693144798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.024310955079272e-05, + 
"grad_norm": 3.953274726867676, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.877121090888977, + "num_tokens": 516258642.0, + "step": 13536 + }, + { + "epoch": 1.7220455412797353, + "ewc_loss": 0.008027123287320137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.027123112697154e-05, + "grad_norm": 3.9680392742156982, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8682422041893005, + "num_tokens": 516298971.0, + "step": 13537 + }, + { + "epoch": 1.7221727515583258, + "ewc_loss": 0.008015044964849949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01504502305761e-05, + "grad_norm": 3.980653762817383, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8869781494140625, + "num_tokens": 516334457.0, + "step": 13538 + }, + { + "epoch": 1.7222999618369164, + "ewc_loss": 0.008025863207876682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025862916838378e-05, + "grad_norm": 3.9252216815948486, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8988993763923645, + "num_tokens": 516374585.0, + "step": 13539 + }, + { + "epoch": 1.7224271721155069, + "ewc_loss": 0.007976546883583069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976547203725204e-05, + "grad_norm": 3.9994640350341797, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8616086840629578, + "num_tokens": 516414180.0, + "step": 13540 + }, + { + "epoch": 1.7225543823940974, + "ewc_loss": 0.008038810454308987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.038810483412817e-05, + "grad_norm": 3.9699342250823975, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.885026216506958, + "num_tokens": 516450189.0, + "step": 13541 + }, + { + "epoch": 1.722681592672688, + "ewc_loss": 0.0079862205311656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.986220589373261e-05, + "grad_norm": 3.9707300662994385, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 
0.8739645481109619, + "num_tokens": 516487549.0, + "step": 13542 + }, + { + "epoch": 1.7228088029512785, + "ewc_loss": 0.007989785633981228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.98978580860421e-05, + "grad_norm": 3.9797677993774414, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8627594113349915, + "num_tokens": 516526649.0, + "step": 13543 + }, + { + "epoch": 1.722936013229869, + "ewc_loss": 0.00799030251801014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.990302401594818e-05, + "grad_norm": 3.9615349769592285, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8792421221733093, + "num_tokens": 516563829.0, + "step": 13544 + }, + { + "epoch": 1.7230632235084595, + "ewc_loss": 0.007976336404681206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976336200954393e-05, + "grad_norm": 3.9510438442230225, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8588036298751831, + "num_tokens": 516605179.0, + "step": 13545 + }, + { + "epoch": 1.72319043378705, + "ewc_loss": 0.007975732907652855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.975733024068177e-05, + "grad_norm": 4.050899505615234, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8733757138252258, + "num_tokens": 516641576.0, + "step": 13546 + }, + { + "epoch": 1.7233176440656406, + "ewc_loss": 0.008053372614085674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.053372584981844e-05, + "grad_norm": 3.9504027366638184, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8733510971069336, + "num_tokens": 516678687.0, + "step": 13547 + }, + { + "epoch": 1.7234448543442311, + "ewc_loss": 0.00795029941946268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.950299186632037e-05, + "grad_norm": 4.012246608734131, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8697786331176758, + "num_tokens": 516711484.0, + "step": 13548 + }, + { + "epoch": 1.7235720646228216, + 
"ewc_loss": 0.008036773651838303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.036773942876607e-05, + "grad_norm": 3.9276580810546875, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8693084716796875, + "num_tokens": 516750598.0, + "step": 13549 + }, + { + "epoch": 1.7236992749014122, + "ewc_loss": 0.007975990884006023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.97599132056348e-05, + "grad_norm": 3.8987672328948975, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8814708590507507, + "num_tokens": 516793438.0, + "step": 13550 + }, + { + "epoch": 1.7238264851800027, + "ewc_loss": 0.007992387749254704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992387691047043e-05, + "grad_norm": 3.9670538902282715, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8897135257720947, + "num_tokens": 516828692.0, + "step": 13551 + }, + { + "epoch": 1.723953695458593, + "ewc_loss": 0.00804591178894043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.04591181804426e-05, + "grad_norm": 3.9660122394561768, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8771528005599976, + "num_tokens": 516867592.0, + "step": 13552 + }, + { + "epoch": 1.7240809057371835, + "ewc_loss": 0.008012047968804836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.012048056116328e-05, + "grad_norm": 3.9528777599334717, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8615857362747192, + "num_tokens": 516907668.0, + "step": 13553 + }, + { + "epoch": 1.724208116015774, + "ewc_loss": 0.008018074557185173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018074731808156e-05, + "grad_norm": 3.9562833309173584, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8720792531967163, + "num_tokens": 516944628.0, + "step": 13554 + }, + { + "epoch": 1.7243353262943646, + "ewc_loss": 0.00803162157535553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031621837290004e-05, + 
"grad_norm": 3.9295814037323, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8701830506324768, + "num_tokens": 516989027.0, + "step": 13555 + }, + { + "epoch": 1.7244625365729551, + "ewc_loss": 0.007996272295713425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.996272324817255e-05, + "grad_norm": 3.9749889373779297, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8697850108146667, + "num_tokens": 517024464.0, + "step": 13556 + }, + { + "epoch": 1.7245897468515456, + "ewc_loss": 0.00803491473197937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.034914935706183e-05, + "grad_norm": 3.9823944568634033, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8795878887176514, + "num_tokens": 517067714.0, + "step": 13557 + }, + { + "epoch": 1.724716957130136, + "ewc_loss": 0.008020048029720783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.020047971513122e-05, + "grad_norm": 3.916301727294922, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8787157535552979, + "num_tokens": 517107564.0, + "step": 13558 + }, + { + "epoch": 1.7248441674087265, + "ewc_loss": 0.007970879785716534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.970879960339516e-05, + "grad_norm": 3.9171156883239746, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.880018949508667, + "num_tokens": 517148240.0, + "step": 13559 + }, + { + "epoch": 1.724971377687317, + "ewc_loss": 0.007994070649147034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.994070620043203e-05, + "grad_norm": 3.9649012088775635, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8666571378707886, + "num_tokens": 517187841.0, + "step": 13560 + }, + { + "epoch": 1.7250985879659075, + "ewc_loss": 0.008019531145691872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.019531378522515e-05, + "grad_norm": 3.9788990020751953, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 
0.8618501424789429, + "num_tokens": 517225732.0, + "step": 13561 + }, + { + "epoch": 1.725225798244498, + "ewc_loss": 0.00800681859254837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.006818825379014e-05, + "grad_norm": 4.00993537902832, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.859808087348938, + "num_tokens": 517260423.0, + "step": 13562 + }, + { + "epoch": 1.7253530085230886, + "ewc_loss": 0.008023099973797798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023099508136511e-05, + "grad_norm": 3.939241409301758, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8614634275436401, + "num_tokens": 517300862.0, + "step": 13563 + }, + { + "epoch": 1.7254802188016791, + "ewc_loss": 0.00794824119657278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.948240818222985e-05, + "grad_norm": 3.9265990257263184, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8792767524719238, + "num_tokens": 517341010.0, + "step": 13564 + }, + { + "epoch": 1.7256074290802697, + "ewc_loss": 0.007971211336553097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.971211744006723e-05, + "grad_norm": 3.9940402507781982, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8585067987442017, + "num_tokens": 517376853.0, + "step": 13565 + }, + { + "epoch": 1.7257346393588602, + "ewc_loss": 0.008038300089538097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.038300438784063e-05, + "grad_norm": 3.9526989459991455, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8688200116157532, + "num_tokens": 517421854.0, + "step": 13566 + }, + { + "epoch": 1.7258618496374507, + "ewc_loss": 0.00796972494572401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.969724538270384e-05, + "grad_norm": 3.9360733032226562, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8685082197189331, + "num_tokens": 517461711.0, + "step": 13567 + }, + { + "epoch": 1.7259890599160412, + "ewc_loss": 
0.007966192439198494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96619278844446e-05, + "grad_norm": 3.997495412826538, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8815418481826782, + "num_tokens": 517494755.0, + "step": 13568 + }, + { + "epoch": 1.7261162701946318, + "ewc_loss": 0.008000841364264488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.000841626198962e-05, + "grad_norm": 3.9399826526641846, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8659126162528992, + "num_tokens": 517539061.0, + "step": 13569 + }, + { + "epoch": 1.7262434804732223, + "ewc_loss": 0.007964316755533218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.964316318975762e-05, + "grad_norm": 3.9394044876098633, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8692820072174072, + "num_tokens": 517579768.0, + "step": 13570 + }, + { + "epoch": 1.7263706907518128, + "ewc_loss": 0.007977969944477081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977970381034538e-05, + "grad_norm": 3.985684633255005, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8868443965911865, + "num_tokens": 517612006.0, + "step": 13571 + }, + { + "epoch": 1.7264979010304033, + "ewc_loss": 0.007994521409273148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.994521001819521e-05, + "grad_norm": 4.005674362182617, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8732604384422302, + "num_tokens": 517647547.0, + "step": 13572 + }, + { + "epoch": 1.7266251113089939, + "ewc_loss": 0.007997918874025345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997918874025345e-05, + "grad_norm": 3.9381096363067627, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8649105429649353, + "num_tokens": 517689067.0, + "step": 13573 + }, + { + "epoch": 1.7267523215875844, + "ewc_loss": 0.007955364882946014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.955364708323032e-05, + "grad_norm": 
3.926659107208252, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.87958163022995, + "num_tokens": 517729967.0, + "step": 13574 + }, + { + "epoch": 1.726879531866175, + "ewc_loss": 0.007967532612383366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.967533019836992e-05, + "grad_norm": 3.9321625232696533, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8746033906936646, + "num_tokens": 517770427.0, + "step": 13575 + }, + { + "epoch": 1.7270067421447655, + "ewc_loss": 0.007968214340507984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.968214777065441e-05, + "grad_norm": 4.010663032531738, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8738210797309875, + "num_tokens": 517804202.0, + "step": 13576 + }, + { + "epoch": 1.7271339524233558, + "ewc_loss": 0.008018752560019493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018752851057798e-05, + "grad_norm": 4.020705223083496, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8670055270195007, + "num_tokens": 517834825.0, + "step": 13577 + }, + { + "epoch": 1.7272611627019463, + "ewc_loss": 0.00800895132124424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.008951408555731e-05, + "grad_norm": 3.9604780673980713, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.868377149105072, + "num_tokens": 517874600.0, + "step": 13578 + }, + { + "epoch": 1.7273883729805368, + "ewc_loss": 0.00796506181359291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.965062104631215e-05, + "grad_norm": 3.9644429683685303, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8637981414794922, + "num_tokens": 517913488.0, + "step": 13579 + }, + { + "epoch": 1.7275155832591274, + "ewc_loss": 0.008008245378732681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.008245640667155e-05, + "grad_norm": 3.924684762954712, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8877846002578735, + 
"num_tokens": 517951094.0, + "step": 13580 + }, + { + "epoch": 1.7276427935377179, + "ewc_loss": 0.007989748381078243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.989748701220378e-05, + "grad_norm": 3.927676200866699, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8635807037353516, + "num_tokens": 517993811.0, + "step": 13581 + }, + { + "epoch": 1.7277700038163084, + "ewc_loss": 0.008010802790522575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.010803139768541e-05, + "grad_norm": 3.9445929527282715, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8772809505462646, + "num_tokens": 518032860.0, + "step": 13582 + }, + { + "epoch": 1.7278972140948987, + "ewc_loss": 0.008020040579140186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.020040695555508e-05, + "grad_norm": 3.95061993598938, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8710366487503052, + "num_tokens": 518070256.0, + "step": 13583 + }, + { + "epoch": 1.7280244243734892, + "ewc_loss": 0.008028383366763592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.028383308555931e-05, + "grad_norm": 3.9688827991485596, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.880466103553772, + "num_tokens": 518102828.0, + "step": 13584 + }, + { + "epoch": 1.7281516346520798, + "ewc_loss": 0.008029956370592117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.029956370592117e-05, + "grad_norm": 3.9190969467163086, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8873093128204346, + "num_tokens": 518146859.0, + "step": 13585 + }, + { + "epoch": 1.7282788449306703, + "ewc_loss": 0.007992078550159931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99207846284844e-05, + "grad_norm": 3.9903993606567383, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8799089789390564, + "num_tokens": 518186173.0, + "step": 13586 + }, + { + "epoch": 1.7284060552092608, + "ewc_loss": 
0.008047197945415974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.047198207350448e-05, + "grad_norm": 4.000616550445557, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8593721389770508, + "num_tokens": 518221903.0, + "step": 13587 + }, + { + "epoch": 1.7285332654878514, + "ewc_loss": 0.008007174357771873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007174619706348e-05, + "grad_norm": 3.969949960708618, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8640889525413513, + "num_tokens": 518258048.0, + "step": 13588 + }, + { + "epoch": 1.7286604757664419, + "ewc_loss": 0.007988386787474155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.988386641955003e-05, + "grad_norm": 3.9599556922912598, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.866086483001709, + "num_tokens": 518299127.0, + "step": 13589 + }, + { + "epoch": 1.7287876860450324, + "ewc_loss": 0.007996651344001293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.996651402208954e-05, + "grad_norm": 3.9649112224578857, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8844818472862244, + "num_tokens": 518336520.0, + "step": 13590 + }, + { + "epoch": 1.728914896323623, + "ewc_loss": 0.008006277494132519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.006277494132519e-05, + "grad_norm": 4.106862545013428, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8704935908317566, + "num_tokens": 518368484.0, + "step": 13591 + }, + { + "epoch": 1.7290421066022135, + "ewc_loss": 0.008085990324616432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.085990702966228e-05, + "grad_norm": 3.9512147903442383, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8731953501701355, + "num_tokens": 518408439.0, + "step": 13592 + }, + { + "epoch": 1.729169316880804, + "ewc_loss": 0.007921216078102589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.921215728856623e-05, + "grad_norm": 
3.949392318725586, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8544420599937439, + "num_tokens": 518447414.0, + "step": 13593 + }, + { + "epoch": 1.7292965271593945, + "ewc_loss": 0.008007396012544632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007395808817819e-05, + "grad_norm": 3.9718780517578125, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8711148500442505, + "num_tokens": 518486455.0, + "step": 13594 + }, + { + "epoch": 1.729423737437985, + "ewc_loss": 0.00800917949527502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.009179146029055e-05, + "grad_norm": 3.9191761016845703, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8900017738342285, + "num_tokens": 518527023.0, + "step": 13595 + }, + { + "epoch": 1.7295509477165756, + "ewc_loss": 0.007963066920638084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.963067037053406e-05, + "grad_norm": 4.063628196716309, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8687354922294617, + "num_tokens": 518563285.0, + "step": 13596 + }, + { + "epoch": 1.729678157995166, + "ewc_loss": 0.008074351586401463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074351353570819e-05, + "grad_norm": 3.929835557937622, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8743500709533691, + "num_tokens": 518607610.0, + "step": 13597 + }, + { + "epoch": 1.7298053682737566, + "ewc_loss": 0.007938998751342297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.938998896861449e-05, + "grad_norm": 3.9159011840820312, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8821280002593994, + "num_tokens": 518649434.0, + "step": 13598 + }, + { + "epoch": 1.7299325785523472, + "ewc_loss": 0.007967392913997173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.967392593855038e-05, + "grad_norm": 3.947075128555298, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8806928992271423, + 
"num_tokens": 518684508.0, + "step": 13599 + }, + { + "epoch": 1.7300597888309377, + "ewc_loss": 0.007971866987645626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.971867307787761e-05, + "grad_norm": 3.9894955158233643, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.879342794418335, + "num_tokens": 518717883.0, + "step": 13600 + }, + { + "epoch": 1.730186999109528, + "ewc_loss": 0.007990479469299316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.990479934960604e-05, + "grad_norm": 3.931807518005371, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8808669447898865, + "num_tokens": 518756158.0, + "step": 13601 + }, + { + "epoch": 1.7303142093881185, + "ewc_loss": 0.007929269224405289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.929268758744001e-05, + "grad_norm": 3.9670374393463135, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8726323843002319, + "num_tokens": 518793003.0, + "step": 13602 + }, + { + "epoch": 1.730441419666709, + "ewc_loss": 0.008012302219867706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.012301987037063e-05, + "grad_norm": 3.902008056640625, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8674346804618835, + "num_tokens": 518838485.0, + "step": 13603 + }, + { + "epoch": 1.7305686299452996, + "ewc_loss": 0.007935303263366222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935303437989205e-05, + "grad_norm": 3.9096879959106445, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8676300048828125, + "num_tokens": 518885936.0, + "step": 13604 + }, + { + "epoch": 1.7306958402238901, + "ewc_loss": 0.007969626225531101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.969626312842593e-05, + "grad_norm": 3.958771228790283, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8631219863891602, + "num_tokens": 518925684.0, + "step": 13605 + }, + { + "epoch": 1.7308230505024806, + "ewc_loss": 
0.007975955493748188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.975955668371171e-05, + "grad_norm": 3.948575496673584, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8775216341018677, + "num_tokens": 518966438.0, + "step": 13606 + }, + { + "epoch": 1.730950260781071, + "ewc_loss": 0.00794166699051857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.94166699051857e-05, + "grad_norm": 3.8762686252593994, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8764554262161255, + "num_tokens": 519013576.0, + "step": 13607 + }, + { + "epoch": 1.7310774710596615, + "ewc_loss": 0.00789097510278225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.890975393820554e-05, + "grad_norm": 4.009030342102051, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8717527389526367, + "num_tokens": 519045342.0, + "step": 13608 + }, + { + "epoch": 1.731204681338252, + "ewc_loss": 0.008003156632184982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003156835911795e-05, + "grad_norm": 3.948550224304199, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8597455024719238, + "num_tokens": 519088566.0, + "step": 13609 + }, + { + "epoch": 1.7313318916168425, + "ewc_loss": 0.007913012988865376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.913012814242393e-05, + "grad_norm": 3.9173202514648438, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8780493140220642, + "num_tokens": 519126516.0, + "step": 13610 + }, + { + "epoch": 1.731459101895433, + "ewc_loss": 0.007923594675958157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9235942394007e-05, + "grad_norm": 3.9171056747436523, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8825818300247192, + "num_tokens": 519165431.0, + "step": 13611 + }, + { + "epoch": 1.7315863121740236, + "ewc_loss": 0.007924726232886314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924726378405467e-05, + "grad_norm": 
3.983119487762451, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8882195353507996, + "num_tokens": 519200991.0, + "step": 13612 + }, + { + "epoch": 1.7317135224526141, + "ewc_loss": 0.00797706563025713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977065979503095e-05, + "grad_norm": 3.895073413848877, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8822803497314453, + "num_tokens": 519245988.0, + "step": 13613 + }, + { + "epoch": 1.7318407327312046, + "ewc_loss": 0.007905340753495693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.905340316938236e-05, + "grad_norm": 3.9531543254852295, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8799324631690979, + "num_tokens": 519282595.0, + "step": 13614 + }, + { + "epoch": 1.7319679430097952, + "ewc_loss": 0.007968495599925518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.968495629029348e-05, + "grad_norm": 3.974632501602173, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8683792948722839, + "num_tokens": 519319061.0, + "step": 13615 + }, + { + "epoch": 1.7320951532883857, + "ewc_loss": 0.007970976643264294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.970976730575785e-05, + "grad_norm": 3.940890073776245, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8743235468864441, + "num_tokens": 519358617.0, + "step": 13616 + }, + { + "epoch": 1.7322223635669762, + "ewc_loss": 0.007925157435238361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.925157842691988e-05, + "grad_norm": 4.003901958465576, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.873191237449646, + "num_tokens": 519392461.0, + "step": 13617 + }, + { + "epoch": 1.7323495738455668, + "ewc_loss": 0.007998056709766388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.998056389624253e-05, + "grad_norm": 3.914577007293701, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8816738128662109, + 
"num_tokens": 519432893.0, + "step": 13618 + }, + { + "epoch": 1.7324767841241573, + "ewc_loss": 0.007915922440588474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.915922469692305e-05, + "grad_norm": 3.950876474380493, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8742924928665161, + "num_tokens": 519473445.0, + "step": 13619 + }, + { + "epoch": 1.7326039944027478, + "ewc_loss": 0.007986386306583881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.986386481206864e-05, + "grad_norm": 3.886570692062378, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8752474784851074, + "num_tokens": 519518883.0, + "step": 13620 + }, + { + "epoch": 1.7327312046813383, + "ewc_loss": 0.007928851060569286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.928851118776947e-05, + "grad_norm": 4.011228561401367, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8790470361709595, + "num_tokens": 519550123.0, + "step": 13621 + }, + { + "epoch": 1.7328584149599289, + "ewc_loss": 0.008020438253879547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.020438690437004e-05, + "grad_norm": 3.9367165565490723, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8659557104110718, + "num_tokens": 519592538.0, + "step": 13622 + }, + { + "epoch": 1.7329856252385194, + "ewc_loss": 0.007923907600343227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.923907833173871e-05, + "grad_norm": 3.91965389251709, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8748679161071777, + "num_tokens": 519633466.0, + "step": 13623 + }, + { + "epoch": 1.73311283551711, + "ewc_loss": 0.007944006472826004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.944006938487291e-05, + "grad_norm": 3.9434287548065186, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8714436292648315, + "num_tokens": 519672569.0, + "step": 13624 + }, + { + "epoch": 1.7332400457957005, + "ewc_loss": 0.00795856211334467, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.958561764098704e-05, + "grad_norm": 3.9598426818847656, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.880659818649292, + "num_tokens": 519707383.0, + "step": 13625 + }, + { + "epoch": 1.7333672560742908, + "ewc_loss": 0.007957791909575462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.957791967783123e-05, + "grad_norm": 3.9927608966827393, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.855860710144043, + "num_tokens": 519748187.0, + "step": 13626 + }, + { + "epoch": 1.7334944663528813, + "ewc_loss": 0.007986089214682579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.986089622136205e-05, + "grad_norm": 3.945744276046753, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8707883358001709, + "num_tokens": 519790215.0, + "step": 13627 + }, + { + "epoch": 1.7336216766314718, + "ewc_loss": 0.007947595790028572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.947596168378368e-05, + "grad_norm": 4.0048956871032715, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8653349876403809, + "num_tokens": 519827278.0, + "step": 13628 + }, + { + "epoch": 1.7337488869100623, + "ewc_loss": 0.007986428216099739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.986428681761026e-05, + "grad_norm": 3.9748966693878174, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8814520835876465, + "num_tokens": 519860216.0, + "step": 13629 + }, + { + "epoch": 1.7338760971886529, + "ewc_loss": 0.00795495044440031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.954950706334785e-05, + "grad_norm": 3.9856505393981934, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8751140832901001, + "num_tokens": 519894442.0, + "step": 13630 + }, + { + "epoch": 1.7340033074672434, + "ewc_loss": 0.00797481182962656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.974811887834221e-05, + "grad_norm": 3.9638333320617676, + 
"learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8728830814361572, + "num_tokens": 519932524.0, + "step": 13631 + }, + { + "epoch": 1.7341305177458337, + "ewc_loss": 0.007963653653860092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96365347923711e-05, + "grad_norm": 3.9598004817962646, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8741104602813721, + "num_tokens": 519974366.0, + "step": 13632 + }, + { + "epoch": 1.7342577280244242, + "ewc_loss": 0.007951050996780396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.951050793053582e-05, + "grad_norm": 3.960557460784912, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8827078342437744, + "num_tokens": 520013983.0, + "step": 13633 + }, + { + "epoch": 1.7343849383030148, + "ewc_loss": 0.007969798520207405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.969798753038049e-05, + "grad_norm": 4.055574893951416, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8751102685928345, + "num_tokens": 520047671.0, + "step": 13634 + }, + { + "epoch": 1.7345121485816053, + "ewc_loss": 0.008025352843105793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025352872209623e-05, + "grad_norm": 3.967790365219116, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8709425926208496, + "num_tokens": 520086574.0, + "step": 13635 + }, + { + "epoch": 1.7346393588601958, + "ewc_loss": 0.007951068691909313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.951068982947618e-05, + "grad_norm": 3.9579789638519287, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8606709241867065, + "num_tokens": 520130613.0, + "step": 13636 + }, + { + "epoch": 1.7347665691387864, + "ewc_loss": 0.00797446072101593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.974460459081456e-05, + "grad_norm": 3.9328384399414062, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8932629823684692, + "num_tokens": 
520163763.0, + "step": 13637 + }, + { + "epoch": 1.7348937794173769, + "ewc_loss": 0.007971003651618958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.971003651618958e-05, + "grad_norm": 3.993544101715088, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8709689378738403, + "num_tokens": 520198663.0, + "step": 13638 + }, + { + "epoch": 1.7350209896959674, + "ewc_loss": 0.008009586483240128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00958659965545e-05, + "grad_norm": 3.982649803161621, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8778116703033447, + "num_tokens": 520231737.0, + "step": 13639 + }, + { + "epoch": 1.735148199974558, + "ewc_loss": 0.007986434735357761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.986434502527118e-05, + "grad_norm": 3.934171676635742, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.878809928894043, + "num_tokens": 520275664.0, + "step": 13640 + }, + { + "epoch": 1.7352754102531485, + "ewc_loss": 0.007963085547089577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.963085954543203e-05, + "grad_norm": 3.9722650051116943, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8624721765518188, + "num_tokens": 520316100.0, + "step": 13641 + }, + { + "epoch": 1.735402620531739, + "ewc_loss": 0.007997949607670307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997949433047324e-05, + "grad_norm": 4.024158954620361, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8712777495384216, + "num_tokens": 520348167.0, + "step": 13642 + }, + { + "epoch": 1.7355298308103295, + "ewc_loss": 0.008032253943383694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.032254118006676e-05, + "grad_norm": 3.9669222831726074, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8599773645401001, + "num_tokens": 520385234.0, + "step": 13643 + }, + { + "epoch": 1.73565704108892, + "ewc_loss": 0.007977886125445366, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977885979926214e-05, + "grad_norm": 3.977058172225952, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8577650785446167, + "num_tokens": 520423051.0, + "step": 13644 + }, + { + "epoch": 1.7357842513675106, + "ewc_loss": 0.008017858490347862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.017858635867015e-05, + "grad_norm": 3.9297287464141846, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8642794489860535, + "num_tokens": 520467202.0, + "step": 13645 + }, + { + "epoch": 1.735911461646101, + "ewc_loss": 0.00799597892910242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.995979103725404e-05, + "grad_norm": 3.9846503734588623, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8723632097244263, + "num_tokens": 520501434.0, + "step": 13646 + }, + { + "epoch": 1.7360386719246916, + "ewc_loss": 0.008032000623643398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.032000914681703e-05, + "grad_norm": 3.907763719558716, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8710302114486694, + "num_tokens": 520542757.0, + "step": 13647 + }, + { + "epoch": 1.7361658822032822, + "ewc_loss": 0.0079797999933362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.979800284374505e-05, + "grad_norm": 3.894369602203369, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8662394285202026, + "num_tokens": 520585216.0, + "step": 13648 + }, + { + "epoch": 1.7362930924818727, + "ewc_loss": 0.007993796840310097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99379704403691e-05, + "grad_norm": 4.01478385925293, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8746047019958496, + "num_tokens": 520620066.0, + "step": 13649 + }, + { + "epoch": 1.736420302760463, + "ewc_loss": 0.00807686522603035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.076865196926519e-05, + "grad_norm": 3.9488587379455566, + "learning_rate": 
1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8805521130561829, + "num_tokens": 520660917.0, + "step": 13650 + }, + { + "epoch": 1.7365475130390535, + "ewc_loss": 0.007982766255736351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.982765964698046e-05, + "grad_norm": 3.97570538520813, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8674095869064331, + "num_tokens": 520704255.0, + "step": 13651 + }, + { + "epoch": 1.736674723317644, + "ewc_loss": 0.008031567558646202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031567995203659e-05, + "grad_norm": 4.002751350402832, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8606199026107788, + "num_tokens": 520743036.0, + "step": 13652 + }, + { + "epoch": 1.7368019335962346, + "ewc_loss": 0.008032985962927341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.032986079342663e-05, + "grad_norm": 3.9846105575561523, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8674303889274597, + "num_tokens": 520778325.0, + "step": 13653 + }, + { + "epoch": 1.736929143874825, + "ewc_loss": 0.00800645537674427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.006455027498305e-05, + "grad_norm": 3.940950632095337, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8864710927009583, + "num_tokens": 520815226.0, + "step": 13654 + }, + { + "epoch": 1.7370563541534156, + "ewc_loss": 0.007988547906279564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.988547440618277e-05, + "grad_norm": 3.9680557250976562, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.856574296951294, + "num_tokens": 520856606.0, + "step": 13655 + }, + { + "epoch": 1.737183564432006, + "ewc_loss": 0.008016320876777172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.016321226023138e-05, + "grad_norm": 3.9987611770629883, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8649475574493408, + "num_tokens": 520893493.0, + "step": 13656 + }, 
+ { + "epoch": 1.7373107747105965, + "ewc_loss": 0.008029050193727016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02904978627339e-05, + "grad_norm": 3.960268497467041, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8817338943481445, + "num_tokens": 520930391.0, + "step": 13657 + }, + { + "epoch": 1.737437984989187, + "ewc_loss": 0.007985487580299377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.985487900441512e-05, + "grad_norm": 3.9193668365478516, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8744900822639465, + "num_tokens": 520970512.0, + "step": 13658 + }, + { + "epoch": 1.7375651952677775, + "ewc_loss": 0.00798475556075573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.984755939105526e-05, + "grad_norm": 3.902005672454834, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.866234540939331, + "num_tokens": 521011787.0, + "step": 13659 + }, + { + "epoch": 1.737692405546368, + "ewc_loss": 0.008001900278031826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.001900278031826e-05, + "grad_norm": 3.945517063140869, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8754516839981079, + "num_tokens": 521053494.0, + "step": 13660 + }, + { + "epoch": 1.7378196158249586, + "ewc_loss": 0.007999390363693237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.999390072654933e-05, + "grad_norm": 4.006776809692383, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8639752268791199, + "num_tokens": 521089291.0, + "step": 13661 + }, + { + "epoch": 1.7379468261035491, + "ewc_loss": 0.008029831573367119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.029831224121153e-05, + "grad_norm": 3.9236204624176025, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8864930868148804, + "num_tokens": 521128910.0, + "step": 13662 + }, + { + "epoch": 1.7380740363821396, + "ewc_loss": 0.007973365485668182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.973365427460521e-05, + "grad_norm": 3.95277738571167, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8714702129364014, + "num_tokens": 521168175.0, + "step": 13663 + }, + { + "epoch": 1.7382012466607302, + "ewc_loss": 0.008024361915886402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.024361886782572e-05, + "grad_norm": 3.9947147369384766, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8682956695556641, + "num_tokens": 521208020.0, + "step": 13664 + }, + { + "epoch": 1.7383284569393207, + "ewc_loss": 0.008018831722438335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018831431400031e-05, + "grad_norm": 3.9408016204833984, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8866319060325623, + "num_tokens": 521249828.0, + "step": 13665 + }, + { + "epoch": 1.7384556672179112, + "ewc_loss": 0.007960734888911247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96073509263806e-05, + "grad_norm": 3.930483102798462, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8776872158050537, + "num_tokens": 521289940.0, + "step": 13666 + }, + { + "epoch": 1.7385828774965018, + "ewc_loss": 0.007984645664691925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.984646072145551e-05, + "grad_norm": 3.9816765785217285, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8904550671577454, + "num_tokens": 521327426.0, + "step": 13667 + }, + { + "epoch": 1.7387100877750923, + "ewc_loss": 0.00799302663654089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.993026520125568e-05, + "grad_norm": 3.920083999633789, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8643941879272461, + "num_tokens": 521372112.0, + "step": 13668 + }, + { + "epoch": 1.7388372980536828, + "ewc_loss": 0.007946139201521873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.946138794068247e-05, + "grad_norm": 3.963573932647705, + "learning_rate": 1e-06, + "loss": 0.3729, + 
"mean_token_accuracy": 0.8722356557846069, + "num_tokens": 521410842.0, + "step": 13669 + }, + { + "epoch": 1.7389645083322733, + "ewc_loss": 0.007996028289198875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99602858023718e-05, + "grad_norm": 3.94834303855896, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8849503397941589, + "num_tokens": 521446854.0, + "step": 13670 + }, + { + "epoch": 1.7390917186108639, + "ewc_loss": 0.007949233055114746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.949233258841559e-05, + "grad_norm": 4.021717071533203, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8640735745429993, + "num_tokens": 521486284.0, + "step": 13671 + }, + { + "epoch": 1.7392189288894544, + "ewc_loss": 0.007996469736099243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9964695032686e-05, + "grad_norm": 4.025424480438232, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8618878126144409, + "num_tokens": 521522716.0, + "step": 13672 + }, + { + "epoch": 1.739346139168045, + "ewc_loss": 0.00797093566507101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.970935985213146e-05, + "grad_norm": 4.0659379959106445, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8689993619918823, + "num_tokens": 521551549.0, + "step": 13673 + }, + { + "epoch": 1.7394733494466355, + "ewc_loss": 0.008004502393305302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.004502160474658e-05, + "grad_norm": 3.9010539054870605, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8772677779197693, + "num_tokens": 521589552.0, + "step": 13674 + }, + { + "epoch": 1.7396005597252258, + "ewc_loss": 0.007914399728178978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.914399611763656e-05, + "grad_norm": 3.9231879711151123, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.884310245513916, + "num_tokens": 521627906.0, + "step": 13675 + }, + { + "epoch": 
1.7397277700038163, + "ewc_loss": 0.007992703467607498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992703467607498e-05, + "grad_norm": 3.969266891479492, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8580135107040405, + "num_tokens": 521666615.0, + "step": 13676 + }, + { + "epoch": 1.7398549802824068, + "ewc_loss": 0.008008522912859917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.008522854652256e-05, + "grad_norm": 3.933556318283081, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8693256378173828, + "num_tokens": 521707742.0, + "step": 13677 + }, + { + "epoch": 1.7399821905609973, + "ewc_loss": 0.00799089577049017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.990896119736135e-05, + "grad_norm": 3.962768793106079, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.864580512046814, + "num_tokens": 521746848.0, + "step": 13678 + }, + { + "epoch": 1.7401094008395879, + "ewc_loss": 0.007992284372448921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992284372448921e-05, + "grad_norm": 3.9076616764068604, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8762789964675903, + "num_tokens": 521782917.0, + "step": 13679 + }, + { + "epoch": 1.7402366111181784, + "ewc_loss": 0.007977199740707874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977199857123196e-05, + "grad_norm": 3.9356963634490967, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8708145618438721, + "num_tokens": 521825609.0, + "step": 13680 + }, + { + "epoch": 1.7403638213967687, + "ewc_loss": 0.00801638513803482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.016385254450142e-05, + "grad_norm": 3.9235353469848633, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8878543376922607, + "num_tokens": 521862277.0, + "step": 13681 + }, + { + "epoch": 1.7404910316753592, + "ewc_loss": 0.008002957329154015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.002957474673167e-05, + "grad_norm": 3.9779481887817383, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8914897441864014, + "num_tokens": 521894951.0, + "step": 13682 + }, + { + "epoch": 1.7406182419539498, + "ewc_loss": 0.008008540607988834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00854031695053e-05, + "grad_norm": 3.952589273452759, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8727380037307739, + "num_tokens": 521932158.0, + "step": 13683 + }, + { + "epoch": 1.7407454522325403, + "ewc_loss": 0.007988575845956802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.988575816852972e-05, + "grad_norm": 3.897280216217041, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8801969289779663, + "num_tokens": 521971050.0, + "step": 13684 + }, + { + "epoch": 1.7408726625111308, + "ewc_loss": 0.007955718785524368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.955718319863081e-05, + "grad_norm": 3.964934825897217, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8840813636779785, + "num_tokens": 522007617.0, + "step": 13685 + }, + { + "epoch": 1.7409998727897213, + "ewc_loss": 0.008004751056432724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.004750998225063e-05, + "grad_norm": 3.893556833267212, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8619008660316467, + "num_tokens": 522053484.0, + "step": 13686 + }, + { + "epoch": 1.7411270830683119, + "ewc_loss": 0.007930017076432705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.93001672718674e-05, + "grad_norm": 3.9898016452789307, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.87106853723526, + "num_tokens": 522088047.0, + "step": 13687 + }, + { + "epoch": 1.7412542933469024, + "ewc_loss": 0.008024065755307674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.024065755307674e-05, + "grad_norm": 3.935310125350952, + "learning_rate": 1e-06, + "loss": 0.3758, + 
"mean_token_accuracy": 0.8716753721237183, + "num_tokens": 522129865.0, + "step": 13688 + }, + { + "epoch": 1.741381503625493, + "ewc_loss": 0.007932459935545921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932459993753582e-05, + "grad_norm": 4.0015459060668945, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8720357418060303, + "num_tokens": 522170154.0, + "step": 13689 + }, + { + "epoch": 1.7415087139040835, + "ewc_loss": 0.007996431551873684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.996431668289006e-05, + "grad_norm": 3.9352502822875977, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8964837789535522, + "num_tokens": 522204586.0, + "step": 13690 + }, + { + "epoch": 1.741635924182674, + "ewc_loss": 0.007927433587610722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.927433762233704e-05, + "grad_norm": 3.95560359954834, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8780794143676758, + "num_tokens": 522241942.0, + "step": 13691 + }, + { + "epoch": 1.7417631344612645, + "ewc_loss": 0.0079730786383152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.973078754730523e-05, + "grad_norm": 4.006133556365967, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.871246337890625, + "num_tokens": 522275800.0, + "step": 13692 + }, + { + "epoch": 1.741890344739855, + "ewc_loss": 0.0079888217151165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.988821744220331e-05, + "grad_norm": 3.9259138107299805, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8664727807044983, + "num_tokens": 522317024.0, + "step": 13693 + }, + { + "epoch": 1.7420175550184456, + "ewc_loss": 0.007920250296592712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.920250209281221e-05, + "grad_norm": 3.9296658039093018, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8825229406356812, + "num_tokens": 522358510.0, + "step": 13694 + }, + { + "epoch": 
1.742144765297036, + "ewc_loss": 0.007943602278828621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.943602395243943e-05, + "grad_norm": 4.029150485992432, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8685954213142395, + "num_tokens": 522395960.0, + "step": 13695 + }, + { + "epoch": 1.7422719755756266, + "ewc_loss": 0.007992547936737537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992547762114555e-05, + "grad_norm": 4.016762733459473, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8845351934432983, + "num_tokens": 522428807.0, + "step": 13696 + }, + { + "epoch": 1.7423991858542172, + "ewc_loss": 0.007957537658512592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.957537309266627e-05, + "grad_norm": 3.930356740951538, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8749404549598694, + "num_tokens": 522464472.0, + "step": 13697 + }, + { + "epoch": 1.7425263961328077, + "ewc_loss": 0.007936112582683563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.936112524475902e-05, + "grad_norm": 3.8744046688079834, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8860766887664795, + "num_tokens": 522509522.0, + "step": 13698 + }, + { + "epoch": 1.742653606411398, + "ewc_loss": 0.007937101647257805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.93710132711567e-05, + "grad_norm": 4.0117011070251465, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.858757734298706, + "num_tokens": 522545581.0, + "step": 13699 + }, + { + "epoch": 1.7427808166899885, + "ewc_loss": 0.008024387061595917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02438662503846e-05, + "grad_norm": 3.9543135166168213, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8781545162200928, + "num_tokens": 522582072.0, + "step": 13700 + }, + { + "epoch": 1.742908026968579, + "ewc_loss": 0.007950379513204098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.950379949761555e-05, + "grad_norm": 4.018634796142578, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8777409195899963, + "num_tokens": 522618183.0, + "step": 13701 + }, + { + "epoch": 1.7430352372471696, + "ewc_loss": 0.008002997376024723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.002997492440045e-05, + "grad_norm": 3.9669084548950195, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8624926805496216, + "num_tokens": 522660578.0, + "step": 13702 + }, + { + "epoch": 1.74316244752576, + "ewc_loss": 0.007953447289764881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953447493491694e-05, + "grad_norm": 3.899186134338379, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8836268186569214, + "num_tokens": 522701934.0, + "step": 13703 + }, + { + "epoch": 1.7432896578043506, + "ewc_loss": 0.007929951883852482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.929951971163973e-05, + "grad_norm": 3.926725387573242, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8906667828559875, + "num_tokens": 522737404.0, + "step": 13704 + }, + { + "epoch": 1.743416868082941, + "ewc_loss": 0.007992962375283241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992962491698563e-05, + "grad_norm": 3.962473154067993, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.871110200881958, + "num_tokens": 522774188.0, + "step": 13705 + }, + { + "epoch": 1.7435440783615315, + "ewc_loss": 0.007978488691151142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.978488429216668e-05, + "grad_norm": 3.9534220695495605, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8823016881942749, + "num_tokens": 522810467.0, + "step": 13706 + }, + { + "epoch": 1.743671288640122, + "ewc_loss": 0.007977036759257317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977036875672638e-05, + "grad_norm": 3.97385311126709, + "learning_rate": 1e-06, + "loss": 0.4087, + 
"mean_token_accuracy": 0.8626002669334412, + "num_tokens": 522845246.0, + "step": 13707 + }, + { + "epoch": 1.7437984989187125, + "ewc_loss": 0.007995663210749626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.995663327164948e-05, + "grad_norm": 4.024632453918457, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8821233510971069, + "num_tokens": 522877150.0, + "step": 13708 + }, + { + "epoch": 1.743925709197303, + "ewc_loss": 0.008015373721718788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.015373896341771e-05, + "grad_norm": 3.916530132293701, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8881036043167114, + "num_tokens": 522913161.0, + "step": 13709 + }, + { + "epoch": 1.7440529194758936, + "ewc_loss": 0.007945835590362549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.945835386635736e-05, + "grad_norm": 3.954436779022217, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8593637943267822, + "num_tokens": 522953464.0, + "step": 13710 + }, + { + "epoch": 1.744180129754484, + "ewc_loss": 0.008024870418012142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.024870476219803e-05, + "grad_norm": 3.9289958477020264, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8781733512878418, + "num_tokens": 522996874.0, + "step": 13711 + }, + { + "epoch": 1.7443073400330746, + "ewc_loss": 0.007988987490534782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.988987636053935e-05, + "grad_norm": 3.975029230117798, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8786073923110962, + "num_tokens": 523031349.0, + "step": 13712 + }, + { + "epoch": 1.7444345503116652, + "ewc_loss": 0.00802183523774147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.021835674298927e-05, + "grad_norm": 4.053832530975342, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8673640489578247, + "num_tokens": 523068910.0, + "step": 13713 + }, + { + "epoch": 
1.7445617605902557, + "ewc_loss": 0.008037546649575233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.037546649575233e-05, + "grad_norm": 3.9137210845947266, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8875052332878113, + "num_tokens": 523106179.0, + "step": 13714 + }, + { + "epoch": 1.7446889708688462, + "ewc_loss": 0.00794681441038847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.946814730530605e-05, + "grad_norm": 4.0121355056762695, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8759198188781738, + "num_tokens": 523140827.0, + "step": 13715 + }, + { + "epoch": 1.7448161811474368, + "ewc_loss": 0.008064135909080505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.064135909080505e-05, + "grad_norm": 3.886453151702881, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.888690173625946, + "num_tokens": 523182880.0, + "step": 13716 + }, + { + "epoch": 1.7449433914260273, + "ewc_loss": 0.00794623326510191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.946233381517231e-05, + "grad_norm": 4.034054279327393, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8788831233978271, + "num_tokens": 523213059.0, + "step": 13717 + }, + { + "epoch": 1.7450706017046178, + "ewc_loss": 0.008093108423054218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093108044704422e-05, + "grad_norm": 3.9239468574523926, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8734135627746582, + "num_tokens": 523254398.0, + "step": 13718 + }, + { + "epoch": 1.7451978119832083, + "ewc_loss": 0.007977664470672607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977664790814742e-05, + "grad_norm": 3.9213013648986816, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8882769346237183, + "num_tokens": 523293325.0, + "step": 13719 + }, + { + "epoch": 1.7453250222617989, + "ewc_loss": 0.008007909171283245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.007908763829619e-05, + "grad_norm": 3.922579050064087, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.864668071269989, + "num_tokens": 523334402.0, + "step": 13720 + }, + { + "epoch": 1.7454522325403894, + "ewc_loss": 0.008022497408092022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.022497786441818e-05, + "grad_norm": 3.93918514251709, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8786789178848267, + "num_tokens": 523376238.0, + "step": 13721 + }, + { + "epoch": 1.74557944281898, + "ewc_loss": 0.008039398118853569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039398380788043e-05, + "grad_norm": 3.907667398452759, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.8970850110054016, + "num_tokens": 523417074.0, + "step": 13722 + }, + { + "epoch": 1.7457066530975704, + "ewc_loss": 0.007991022430360317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99102199380286e-05, + "grad_norm": 3.9012792110443115, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8814749121665955, + "num_tokens": 523457984.0, + "step": 13723 + }, + { + "epoch": 1.7458338633761608, + "ewc_loss": 0.008006568998098373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.006568532437086e-05, + "grad_norm": 4.074849605560303, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8691478371620178, + "num_tokens": 523490071.0, + "step": 13724 + }, + { + "epoch": 1.7459610736547513, + "ewc_loss": 0.008106664754450321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.106664608931169e-05, + "grad_norm": 3.920407295227051, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8697770237922668, + "num_tokens": 523530105.0, + "step": 13725 + }, + { + "epoch": 1.7460882839333418, + "ewc_loss": 0.00795749481767416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.957495108712465e-05, + "grad_norm": 3.882261037826538, + "learning_rate": 1e-06, + "loss": 0.3229, + 
"mean_token_accuracy": 0.8880448937416077, + "num_tokens": 523576698.0, + "step": 13726 + }, + { + "epoch": 1.7462154942119323, + "ewc_loss": 0.00798606313765049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.986063428688794e-05, + "grad_norm": 3.9723668098449707, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8740811347961426, + "num_tokens": 523614387.0, + "step": 13727 + }, + { + "epoch": 1.7463427044905229, + "ewc_loss": 0.008037421852350235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03742150310427e-05, + "grad_norm": 3.979111433029175, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8727303147315979, + "num_tokens": 523651149.0, + "step": 13728 + }, + { + "epoch": 1.7464699147691134, + "ewc_loss": 0.008006532676517963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.006532880244777e-05, + "grad_norm": 3.9361937046051025, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8890554308891296, + "num_tokens": 523688448.0, + "step": 13729 + }, + { + "epoch": 1.7465971250477037, + "ewc_loss": 0.007984514348208904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.984514377312735e-05, + "grad_norm": 3.952303409576416, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8801217079162598, + "num_tokens": 523725805.0, + "step": 13730 + }, + { + "epoch": 1.7467243353262942, + "ewc_loss": 0.008004311472177505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.004311530385166e-05, + "grad_norm": 3.9699292182922363, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8793328404426575, + "num_tokens": 523763564.0, + "step": 13731 + }, + { + "epoch": 1.7468515456048848, + "ewc_loss": 0.007991987280547619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.991987513378263e-05, + "grad_norm": 3.9473695755004883, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8704283237457275, + "num_tokens": 523804422.0, + "step": 13732 + }, + { + "epoch": 
1.7469787558834753, + "ewc_loss": 0.007957217283546925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.957217167131603e-05, + "grad_norm": 3.930612087249756, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8838222026824951, + "num_tokens": 523845889.0, + "step": 13733 + }, + { + "epoch": 1.7471059661620658, + "ewc_loss": 0.00796368159353733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.963681855471805e-05, + "grad_norm": 3.9793694019317627, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8852601051330566, + "num_tokens": 523883623.0, + "step": 13734 + }, + { + "epoch": 1.7472331764406563, + "ewc_loss": 0.008001496084034443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.001496462384239e-05, + "grad_norm": 3.985395908355713, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8833391070365906, + "num_tokens": 523917273.0, + "step": 13735 + }, + { + "epoch": 1.7473603867192469, + "ewc_loss": 0.007971970364451408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.971970626385882e-05, + "grad_norm": 3.948665142059326, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.871774435043335, + "num_tokens": 523956149.0, + "step": 13736 + }, + { + "epoch": 1.7474875969978374, + "ewc_loss": 0.007946643978357315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.946643745526671e-05, + "grad_norm": 3.964622974395752, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8724910020828247, + "num_tokens": 523997229.0, + "step": 13737 + }, + { + "epoch": 1.747614807276428, + "ewc_loss": 0.007962186820805073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96218664618209e-05, + "grad_norm": 3.9813730716705322, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8791377544403076, + "num_tokens": 524032921.0, + "step": 13738 + }, + { + "epoch": 1.7477420175550185, + "ewc_loss": 0.007952924817800522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.952925079734996e-05, + "grad_norm": 4.011483669281006, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8769626021385193, + "num_tokens": 524065016.0, + "step": 13739 + }, + { + "epoch": 1.747869227833609, + "ewc_loss": 0.007970643229782581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.970643491717055e-05, + "grad_norm": 3.9288129806518555, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8643903732299805, + "num_tokens": 524107001.0, + "step": 13740 + }, + { + "epoch": 1.7479964381121995, + "ewc_loss": 0.007911328226327896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.911328430054709e-05, + "grad_norm": 3.928925037384033, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8697526454925537, + "num_tokens": 524149283.0, + "step": 13741 + }, + { + "epoch": 1.74812364839079, + "ewc_loss": 0.00793435052037239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.934350287541747e-05, + "grad_norm": 3.9681293964385986, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8752491474151611, + "num_tokens": 524187240.0, + "step": 13742 + }, + { + "epoch": 1.7482508586693806, + "ewc_loss": 0.00796117540448904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.961175288073719e-05, + "grad_norm": 4.02052640914917, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8833403587341309, + "num_tokens": 524216793.0, + "step": 13743 + }, + { + "epoch": 1.748378068947971, + "ewc_loss": 0.00797492079436779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.974921027198434e-05, + "grad_norm": 3.9270968437194824, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8827863931655884, + "num_tokens": 524257629.0, + "step": 13744 + }, + { + "epoch": 1.7485052792265616, + "ewc_loss": 0.007908642292022705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.908642146503553e-05, + "grad_norm": 3.977224588394165, + "learning_rate": 1e-06, + "loss": 0.3274, + 
"mean_token_accuracy": 0.8864374160766602, + "num_tokens": 524291090.0, + "step": 13745 + }, + { + "epoch": 1.7486324895051522, + "ewc_loss": 0.007976516149938107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976515917107463e-05, + "grad_norm": 3.935222625732422, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8880378007888794, + "num_tokens": 524324882.0, + "step": 13746 + }, + { + "epoch": 1.7487596997837427, + "ewc_loss": 0.007931690663099289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.931690925033763e-05, + "grad_norm": 4.0240631103515625, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8613994121551514, + "num_tokens": 524359422.0, + "step": 13747 + }, + { + "epoch": 1.748886910062333, + "ewc_loss": 0.008005714043974876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.005714335013181e-05, + "grad_norm": 3.9182028770446777, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8810410499572754, + "num_tokens": 524398383.0, + "step": 13748 + }, + { + "epoch": 1.7490141203409235, + "ewc_loss": 0.007912300527095795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.912300497991964e-05, + "grad_norm": 3.9529223442077637, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8706636428833008, + "num_tokens": 524437984.0, + "step": 13749 + }, + { + "epoch": 1.749141330619514, + "ewc_loss": 0.0079647246748209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.964724500197917e-05, + "grad_norm": 4.010103702545166, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8803268671035767, + "num_tokens": 524471324.0, + "step": 13750 + }, + { + "epoch": 1.7492685408981046, + "ewc_loss": 0.008007221855223179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00722191343084e-05, + "grad_norm": 3.949705123901367, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8640934824943542, + "num_tokens": 524511471.0, + "step": 13751 + }, + { + "epoch": 
1.749395751176695, + "ewc_loss": 0.007943258620798588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.943258970044553e-05, + "grad_norm": 3.9938013553619385, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8787692785263062, + "num_tokens": 524541743.0, + "step": 13752 + }, + { + "epoch": 1.7495229614552856, + "ewc_loss": 0.008018938824534416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01893838797696e-05, + "grad_norm": 3.960120439529419, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8748419284820557, + "num_tokens": 524581470.0, + "step": 13753 + }, + { + "epoch": 1.749650171733876, + "ewc_loss": 0.00797292497009039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.972925232024863e-05, + "grad_norm": 4.033853530883789, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8772780299186707, + "num_tokens": 524612722.0, + "step": 13754 + }, + { + "epoch": 1.7497773820124665, + "ewc_loss": 0.008031155914068222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031156176002696e-05, + "grad_norm": 3.9312751293182373, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8822060227394104, + "num_tokens": 524647640.0, + "step": 13755 + }, + { + "epoch": 1.749904592291057, + "ewc_loss": 0.007967446930706501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.967447163537145e-05, + "grad_norm": 4.063043117523193, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8705649375915527, + "num_tokens": 524682366.0, + "step": 13756 + }, + { + "epoch": 1.7500318025696475, + "ewc_loss": 0.008072886615991592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.072886703303084e-05, + "grad_norm": 3.9350087642669678, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8751979470252991, + "num_tokens": 524724242.0, + "step": 13757 + }, + { + "epoch": 1.750159012848238, + "ewc_loss": 0.007961212657392025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.961212395457551e-05, + "grad_norm": 4.022150993347168, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8620961904525757, + "num_tokens": 524757373.0, + "step": 13758 + }, + { + "epoch": 1.7502862231268286, + "ewc_loss": 0.00805027037858963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050270844250917e-05, + "grad_norm": 3.9489569664001465, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8645143508911133, + "num_tokens": 524797704.0, + "step": 13759 + }, + { + "epoch": 1.750413433405419, + "ewc_loss": 0.00796952098608017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.969520811457187e-05, + "grad_norm": 3.9593942165374756, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8770478963851929, + "num_tokens": 524836710.0, + "step": 13760 + }, + { + "epoch": 1.7505406436840096, + "ewc_loss": 0.008033309131860733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.033309131860733e-05, + "grad_norm": 4.004094123840332, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8742642402648926, + "num_tokens": 524870224.0, + "step": 13761 + }, + { + "epoch": 1.7506678539626002, + "ewc_loss": 0.008051720447838306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.051720215007663e-05, + "grad_norm": 3.97357439994812, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8774459362030029, + "num_tokens": 524905345.0, + "step": 13762 + }, + { + "epoch": 1.7507950642411907, + "ewc_loss": 0.008023076690733433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023076952667907e-05, + "grad_norm": 3.9852561950683594, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8593790531158447, + "num_tokens": 524943893.0, + "step": 13763 + }, + { + "epoch": 1.7509222745197812, + "ewc_loss": 0.008035284467041492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035284554352984e-05, + "grad_norm": 3.9314792156219482, + "learning_rate": 1e-06, + "loss": 0.3583, + 
"mean_token_accuracy": 0.8744785785675049, + "num_tokens": 524982924.0, + "step": 13764 + }, + { + "epoch": 1.7510494847983717, + "ewc_loss": 0.008004328235983849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.004328265087679e-05, + "grad_norm": 3.992525100708008, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8747961521148682, + "num_tokens": 525016749.0, + "step": 13765 + }, + { + "epoch": 1.7511766950769623, + "ewc_loss": 0.00805356353521347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.053563942667097e-05, + "grad_norm": 3.9736270904541016, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8773189783096313, + "num_tokens": 525049178.0, + "step": 13766 + }, + { + "epoch": 1.7513039053555528, + "ewc_loss": 0.008050746284425259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050746691878885e-05, + "grad_norm": 4.001016616821289, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8634179830551147, + "num_tokens": 525083456.0, + "step": 13767 + }, + { + "epoch": 1.7514311156341433, + "ewc_loss": 0.008059990592300892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059990796027705e-05, + "grad_norm": 3.9455716609954834, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8776068687438965, + "num_tokens": 525120159.0, + "step": 13768 + }, + { + "epoch": 1.7515583259127339, + "ewc_loss": 0.008045275695621967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.04527589934878e-05, + "grad_norm": 3.9548909664154053, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8822757005691528, + "num_tokens": 525159174.0, + "step": 13769 + }, + { + "epoch": 1.7516855361913244, + "ewc_loss": 0.00807065051048994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070650073932484e-05, + "grad_norm": 3.956289052963257, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8719158172607422, + "num_tokens": 525198373.0, + "step": 13770 + }, + { + "epoch": 
1.751812746469915, + "ewc_loss": 0.008077967911958694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.077967504505068e-05, + "grad_norm": 3.924704074859619, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8803542852401733, + "num_tokens": 525240618.0, + "step": 13771 + }, + { + "epoch": 1.7519399567485054, + "ewc_loss": 0.008037963882088661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.037963561946526e-05, + "grad_norm": 3.93919038772583, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.88812655210495, + "num_tokens": 525274899.0, + "step": 13772 + }, + { + "epoch": 1.7520671670270958, + "ewc_loss": 0.008050624281167984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050624455790967e-05, + "grad_norm": 4.009746551513672, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8560649156570435, + "num_tokens": 525310144.0, + "step": 13773 + }, + { + "epoch": 1.7521943773056863, + "ewc_loss": 0.008097239769995213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.097240061033517e-05, + "grad_norm": 3.937378168106079, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8850029706954956, + "num_tokens": 525348565.0, + "step": 13774 + }, + { + "epoch": 1.7523215875842768, + "ewc_loss": 0.00802621804177761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02621798356995e-05, + "grad_norm": 3.970970869064331, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8819653987884521, + "num_tokens": 525383709.0, + "step": 13775 + }, + { + "epoch": 1.7524487978628673, + "ewc_loss": 0.00809739250689745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.097392128547654e-05, + "grad_norm": 3.9652514457702637, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8688721656799316, + "num_tokens": 525423983.0, + "step": 13776 + }, + { + "epoch": 1.7525760081414579, + "ewc_loss": 0.008073807694017887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.07380783953704e-05, + "grad_norm": 3.9679219722747803, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8814917802810669, + "num_tokens": 525457357.0, + "step": 13777 + }, + { + "epoch": 1.7527032184200484, + "ewc_loss": 0.00807473435997963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074734068941325e-05, + "grad_norm": 3.9555585384368896, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8742839694023132, + "num_tokens": 525497021.0, + "step": 13778 + }, + { + "epoch": 1.7528304286986387, + "ewc_loss": 0.00806498248130083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.064982102951035e-05, + "grad_norm": 3.907198905944824, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8903508186340332, + "num_tokens": 525533592.0, + "step": 13779 + }, + { + "epoch": 1.7529576389772292, + "ewc_loss": 0.008033799938857555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.033799531403929e-05, + "grad_norm": 3.9973621368408203, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8677910566329956, + "num_tokens": 525572747.0, + "step": 13780 + }, + { + "epoch": 1.7530848492558198, + "ewc_loss": 0.008123976178467274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.123976294882596e-05, + "grad_norm": 3.9028749465942383, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8811448812484741, + "num_tokens": 525613578.0, + "step": 13781 + }, + { + "epoch": 1.7532120595344103, + "ewc_loss": 0.008022149093449116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.022149268072098e-05, + "grad_norm": 3.9064953327178955, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8818479776382446, + "num_tokens": 525655047.0, + "step": 13782 + }, + { + "epoch": 1.7533392698130008, + "ewc_loss": 0.008051273413002491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.051273471210152e-05, + "grad_norm": 4.0123162269592285, + "learning_rate": 1e-06, + "loss": 0.3637, + 
"mean_token_accuracy": 0.8726789355278015, + "num_tokens": 525689561.0, + "step": 13783 + }, + { + "epoch": 1.7534664800915913, + "ewc_loss": 0.008107041008770466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.107040775939822e-05, + "grad_norm": 3.9480745792388916, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8721899390220642, + "num_tokens": 525726946.0, + "step": 13784 + }, + { + "epoch": 1.7535936903701819, + "ewc_loss": 0.008018315769731998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018315566005185e-05, + "grad_norm": 3.9930639266967773, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8702125549316406, + "num_tokens": 525762089.0, + "step": 13785 + }, + { + "epoch": 1.7537209006487724, + "ewc_loss": 0.008080093190073967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080092811724171e-05, + "grad_norm": 3.950507879257202, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.881630539894104, + "num_tokens": 525796081.0, + "step": 13786 + }, + { + "epoch": 1.753848110927363, + "ewc_loss": 0.008044983260333538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.04498340585269e-05, + "grad_norm": 4.028900623321533, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8638629913330078, + "num_tokens": 525829945.0, + "step": 13787 + }, + { + "epoch": 1.7539753212059535, + "ewc_loss": 0.00810206588357687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.102065476123244e-05, + "grad_norm": 3.8923866748809814, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.882814347743988, + "num_tokens": 525871289.0, + "step": 13788 + }, + { + "epoch": 1.754102531484544, + "ewc_loss": 0.008017200045287609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.017200161702931e-05, + "grad_norm": 3.9719667434692383, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8757476210594177, + "num_tokens": 525908455.0, + "step": 13789 + }, + { + "epoch": 
1.7542297417631345, + "ewc_loss": 0.008094551973044872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.094551594695076e-05, + "grad_norm": 3.932065725326538, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8956598043441772, + "num_tokens": 525943734.0, + "step": 13790 + }, + { + "epoch": 1.754356952041725, + "ewc_loss": 0.008041960187256336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.041960245463997e-05, + "grad_norm": 3.8878824710845947, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.8961538672447205, + "num_tokens": 525988070.0, + "step": 13791 + }, + { + "epoch": 1.7544841623203156, + "ewc_loss": 0.008025610819458961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025611168704927e-05, + "grad_norm": 3.9467809200286865, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8891398906707764, + "num_tokens": 526027940.0, + "step": 13792 + }, + { + "epoch": 1.754611372598906, + "ewc_loss": 0.008061633445322514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061633707256988e-05, + "grad_norm": 3.979792833328247, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8855694532394409, + "num_tokens": 526061357.0, + "step": 13793 + }, + { + "epoch": 1.7547385828774966, + "ewc_loss": 0.008043437264859676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.043437264859676e-05, + "grad_norm": 3.9442121982574463, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8783853054046631, + "num_tokens": 526102157.0, + "step": 13794 + }, + { + "epoch": 1.7548657931560872, + "ewc_loss": 0.008013107813894749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.013107435544953e-05, + "grad_norm": 3.984755516052246, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8674614429473877, + "num_tokens": 526137241.0, + "step": 13795 + }, + { + "epoch": 1.7549930034346777, + "ewc_loss": 0.008036598563194275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.036598592298105e-05, + "grad_norm": 4.007718563079834, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.868017852306366, + "num_tokens": 526169079.0, + "step": 13796 + }, + { + "epoch": 1.755120213713268, + "ewc_loss": 0.008032552897930145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03255315986462e-05, + "grad_norm": 3.8931262493133545, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8795789480209351, + "num_tokens": 526212212.0, + "step": 13797 + }, + { + "epoch": 1.7552474239918585, + "ewc_loss": 0.00797587912529707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.975879270816222e-05, + "grad_norm": 3.9710536003112793, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8831710815429688, + "num_tokens": 526250380.0, + "step": 13798 + }, + { + "epoch": 1.755374634270449, + "ewc_loss": 0.008039142936468124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039142994675785e-05, + "grad_norm": 3.923729658126831, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8824503421783447, + "num_tokens": 526292525.0, + "step": 13799 + }, + { + "epoch": 1.7555018445490396, + "ewc_loss": 0.00797515269368887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.975153130246326e-05, + "grad_norm": 3.9890856742858887, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8756512403488159, + "num_tokens": 526330427.0, + "step": 13800 + }, + { + "epoch": 1.75562905482763, + "ewc_loss": 0.008026801981031895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026802242966369e-05, + "grad_norm": 4.038302421569824, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8782620429992676, + "num_tokens": 526362954.0, + "step": 13801 + }, + { + "epoch": 1.7557562651062206, + "ewc_loss": 0.008023364469408989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023364352993667e-05, + "grad_norm": 4.068522930145264, + "learning_rate": 1e-06, + "loss": 0.3865, + 
"mean_token_accuracy": 0.8689351081848145, + "num_tokens": 526394310.0, + "step": 13802 + }, + { + "epoch": 1.755883475384811, + "ewc_loss": 0.008006643503904343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.006643474800512e-05, + "grad_norm": 3.929558515548706, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8562403321266174, + "num_tokens": 526438507.0, + "step": 13803 + }, + { + "epoch": 1.7560106856634015, + "ewc_loss": 0.007939817383885384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.939817442093045e-05, + "grad_norm": 3.995954751968384, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8739144802093506, + "num_tokens": 526475124.0, + "step": 13804 + }, + { + "epoch": 1.756137895941992, + "ewc_loss": 0.00803552195429802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035521750571206e-05, + "grad_norm": 3.944382429122925, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8842920064926147, + "num_tokens": 526514936.0, + "step": 13805 + }, + { + "epoch": 1.7562651062205825, + "ewc_loss": 0.007975631393492222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.975631160661578e-05, + "grad_norm": 4.0361785888671875, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8709657192230225, + "num_tokens": 526548649.0, + "step": 13806 + }, + { + "epoch": 1.756392316499173, + "ewc_loss": 0.008034208789467812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.034209167817608e-05, + "grad_norm": 3.963167905807495, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8609253168106079, + "num_tokens": 526589028.0, + "step": 13807 + }, + { + "epoch": 1.7565195267777636, + "ewc_loss": 0.00796408299356699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.964082760736346e-05, + "grad_norm": 3.9689180850982666, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8760597705841064, + "num_tokens": 526626327.0, + "step": 13808 + }, + { + "epoch": 
1.756646737056354, + "ewc_loss": 0.008025676012039185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025675924727693e-05, + "grad_norm": 3.911332130432129, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8737142086029053, + "num_tokens": 526672264.0, + "step": 13809 + }, + { + "epoch": 1.7567739473349446, + "ewc_loss": 0.007974574342370033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.974573964020237e-05, + "grad_norm": 3.960167646408081, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8701019287109375, + "num_tokens": 526709979.0, + "step": 13810 + }, + { + "epoch": 1.7569011576135352, + "ewc_loss": 0.008049708791077137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.049709140323102e-05, + "grad_norm": 3.9685654640197754, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8882393836975098, + "num_tokens": 526745002.0, + "step": 13811 + }, + { + "epoch": 1.7570283678921257, + "ewc_loss": 0.008015514351427555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.015514322323725e-05, + "grad_norm": 3.920783281326294, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8901587724685669, + "num_tokens": 526785684.0, + "step": 13812 + }, + { + "epoch": 1.7571555781707162, + "ewc_loss": 0.007977216504514217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977216591825709e-05, + "grad_norm": 3.9324584007263184, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8939014077186584, + "num_tokens": 526820246.0, + "step": 13813 + }, + { + "epoch": 1.7572827884493067, + "ewc_loss": 0.008016470819711685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.016470383154228e-05, + "grad_norm": 3.994286060333252, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8808764219284058, + "num_tokens": 526854815.0, + "step": 13814 + }, + { + "epoch": 1.7574099987278973, + "ewc_loss": 0.008032775484025478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.032775804167613e-05, + "grad_norm": 3.9098355770111084, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.883394718170166, + "num_tokens": 526896336.0, + "step": 13815 + }, + { + "epoch": 1.7575372090064878, + "ewc_loss": 0.007959036156535149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.959036156535149e-05, + "grad_norm": 3.9186112880706787, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8865188956260681, + "num_tokens": 526939586.0, + "step": 13816 + }, + { + "epoch": 1.7576644192850783, + "ewc_loss": 0.008003155700862408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003156108316034e-05, + "grad_norm": 3.9831254482269287, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8810540437698364, + "num_tokens": 526975027.0, + "step": 13817 + }, + { + "epoch": 1.7577916295636689, + "ewc_loss": 0.008018692024052143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018691733013839e-05, + "grad_norm": 3.9411685466766357, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.877914547920227, + "num_tokens": 527017820.0, + "step": 13818 + }, + { + "epoch": 1.7579188398422594, + "ewc_loss": 0.007981043308973312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.981043017935008e-05, + "grad_norm": 4.027346134185791, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8867676854133606, + "num_tokens": 527051647.0, + "step": 13819 + }, + { + "epoch": 1.75804605012085, + "ewc_loss": 0.008036208339035511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.036208600969985e-05, + "grad_norm": 3.947355270385742, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8802443742752075, + "num_tokens": 527089590.0, + "step": 13820 + }, + { + "epoch": 1.7581732603994404, + "ewc_loss": 0.007975139655172825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.97513930592686e-05, + "grad_norm": 3.984795331954956, + "learning_rate": 1e-06, + "loss": 0.3706, + 
"mean_token_accuracy": 0.8735364675521851, + "num_tokens": 527126696.0, + "step": 13821 + }, + { + "epoch": 1.7583004706780307, + "ewc_loss": 0.00800965167582035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.009652083273977e-05, + "grad_norm": 3.961919069290161, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8707780838012695, + "num_tokens": 527164823.0, + "step": 13822 + }, + { + "epoch": 1.7584276809566213, + "ewc_loss": 0.0079908836632967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.990883750608191e-05, + "grad_norm": 3.919994592666626, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8593720197677612, + "num_tokens": 527208032.0, + "step": 13823 + }, + { + "epoch": 1.7585548912352118, + "ewc_loss": 0.007972868159413338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.972868479555473e-05, + "grad_norm": 4.01002836227417, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8771841526031494, + "num_tokens": 527242773.0, + "step": 13824 + }, + { + "epoch": 1.7586821015138023, + "ewc_loss": 0.008033898659050465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.033898484427482e-05, + "grad_norm": 3.9894442558288574, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.880529522895813, + "num_tokens": 527277968.0, + "step": 13825 + }, + { + "epoch": 1.7588093117923929, + "ewc_loss": 0.007979999296367168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.979998918017372e-05, + "grad_norm": 3.999699354171753, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8908942341804504, + "num_tokens": 527311077.0, + "step": 13826 + }, + { + "epoch": 1.7589365220709834, + "ewc_loss": 0.007999285124242306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.999285298865288e-05, + "grad_norm": 4.014381408691406, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8823297023773193, + "num_tokens": 527340543.0, + "step": 13827 + }, + { + "epoch": 
1.7590637323495737, + "ewc_loss": 0.008013356477022171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.013356273295358e-05, + "grad_norm": 3.9755702018737793, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.873516321182251, + "num_tokens": 527379596.0, + "step": 13828 + }, + { + "epoch": 1.7591909426281642, + "ewc_loss": 0.007978087291121483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.978087523952127e-05, + "grad_norm": 3.962419033050537, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8766965866088867, + "num_tokens": 527417873.0, + "step": 13829 + }, + { + "epoch": 1.7593181529067548, + "ewc_loss": 0.007982105016708374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.982104580150917e-05, + "grad_norm": 3.92647123336792, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8787790536880493, + "num_tokens": 527458607.0, + "step": 13830 + }, + { + "epoch": 1.7594453631853453, + "ewc_loss": 0.007965920493006706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.965920667629689e-05, + "grad_norm": 4.025475025177002, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8680912256240845, + "num_tokens": 527493299.0, + "step": 13831 + }, + { + "epoch": 1.7595725734639358, + "ewc_loss": 0.008030649274587631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.030649041756988e-05, + "grad_norm": 3.9350461959838867, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.864629864692688, + "num_tokens": 527536065.0, + "step": 13832 + }, + { + "epoch": 1.7596997837425263, + "ewc_loss": 0.007955566979944706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.955566979944706e-05, + "grad_norm": 3.971205949783325, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8625020384788513, + "num_tokens": 527576477.0, + "step": 13833 + }, + { + "epoch": 1.7598269940211169, + "ewc_loss": 0.00800028070807457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.000280649866909e-05, + "grad_norm": 3.9927926063537598, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.874640703201294, + "num_tokens": 527611777.0, + "step": 13834 + }, + { + "epoch": 1.7599542042997074, + "ewc_loss": 0.007995491847395897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.995491614565253e-05, + "grad_norm": 3.9411072731018066, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8939427137374878, + "num_tokens": 527650236.0, + "step": 13835 + }, + { + "epoch": 1.760081414578298, + "ewc_loss": 0.007971001788973808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.971002196427435e-05, + "grad_norm": 3.977534294128418, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8820158839225769, + "num_tokens": 527685486.0, + "step": 13836 + }, + { + "epoch": 1.7602086248568884, + "ewc_loss": 0.008011425845324993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.011425961740315e-05, + "grad_norm": 3.9587416648864746, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.865437388420105, + "num_tokens": 527725386.0, + "step": 13837 + }, + { + "epoch": 1.760335835135479, + "ewc_loss": 0.007998242974281311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.998242654139176e-05, + "grad_norm": 4.0642781257629395, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8643330335617065, + "num_tokens": 527756213.0, + "step": 13838 + }, + { + "epoch": 1.7604630454140695, + "ewc_loss": 0.008058540523052216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.058540697675198e-05, + "grad_norm": 3.930142879486084, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8813730478286743, + "num_tokens": 527793884.0, + "step": 13839 + }, + { + "epoch": 1.76059025569266, + "ewc_loss": 0.007956095039844513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.956095214467496e-05, + "grad_norm": 4.012078762054443, + "learning_rate": 1e-06, + "loss": 0.3752, + 
"mean_token_accuracy": 0.8735495209693909, + "num_tokens": 527826428.0, + "step": 13840 + }, + { + "epoch": 1.7607174659712506, + "ewc_loss": 0.00805202778428793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.052027988014743e-05, + "grad_norm": 3.910999059677124, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8682271242141724, + "num_tokens": 527866691.0, + "step": 13841 + }, + { + "epoch": 1.760844676249841, + "ewc_loss": 0.007984553463757038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.984553667483851e-05, + "grad_norm": 3.920894145965576, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8844574689865112, + "num_tokens": 527909482.0, + "step": 13842 + }, + { + "epoch": 1.7609718865284316, + "ewc_loss": 0.008031073957681656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031073957681656e-05, + "grad_norm": 3.9670825004577637, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8642480969429016, + "num_tokens": 527947818.0, + "step": 13843 + }, + { + "epoch": 1.7610990968070221, + "ewc_loss": 0.008038047701120377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.038047963054851e-05, + "grad_norm": 3.9640121459960938, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8783418536186218, + "num_tokens": 527986760.0, + "step": 13844 + }, + { + "epoch": 1.7612263070856127, + "ewc_loss": 0.008014306426048279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01430651335977e-05, + "grad_norm": 3.9446640014648438, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8704174757003784, + "num_tokens": 528024742.0, + "step": 13845 + }, + { + "epoch": 1.761353517364203, + "ewc_loss": 0.008018185384571552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018185326363891e-05, + "grad_norm": 4.009424209594727, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8634958267211914, + "num_tokens": 528060781.0, + "step": 13846 + }, + { + "epoch": 
1.7614807276427935, + "ewc_loss": 0.008046481758356094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.046481525525451e-05, + "grad_norm": 3.9482710361480713, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8787208199501038, + "num_tokens": 528099051.0, + "step": 13847 + }, + { + "epoch": 1.761607937921384, + "ewc_loss": 0.008006426505744457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00642665126361e-05, + "grad_norm": 3.9088244438171387, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8708242177963257, + "num_tokens": 528138325.0, + "step": 13848 + }, + { + "epoch": 1.7617351481999746, + "ewc_loss": 0.0080177653580904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.017765503609553e-05, + "grad_norm": 3.9181253910064697, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8682328462600708, + "num_tokens": 528177997.0, + "step": 13849 + }, + { + "epoch": 1.761862358478565, + "ewc_loss": 0.008031127974390984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031127799768001e-05, + "grad_norm": 3.9650449752807617, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8720297813415527, + "num_tokens": 528216360.0, + "step": 13850 + }, + { + "epoch": 1.7619895687571556, + "ewc_loss": 0.00806804932653904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068049646681175e-05, + "grad_norm": 3.9692587852478027, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8907934427261353, + "num_tokens": 528251100.0, + "step": 13851 + }, + { + "epoch": 1.762116779035746, + "ewc_loss": 0.008040842600166798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.040842658374459e-05, + "grad_norm": 3.9915926456451416, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8810334205627441, + "num_tokens": 528286298.0, + "step": 13852 + }, + { + "epoch": 1.7622439893143365, + "ewc_loss": 0.008065595291554928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.06559546617791e-05, + "grad_norm": 3.9507250785827637, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8908687829971313, + "num_tokens": 528322970.0, + "step": 13853 + }, + { + "epoch": 1.762371199592927, + "ewc_loss": 0.008012448437511921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.012448233785108e-05, + "grad_norm": 3.9688360691070557, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8924671411514282, + "num_tokens": 528355191.0, + "step": 13854 + }, + { + "epoch": 1.7624984098715175, + "ewc_loss": 0.008038374595344067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.038374653551728e-05, + "grad_norm": 4.0011138916015625, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8748490810394287, + "num_tokens": 528388975.0, + "step": 13855 + }, + { + "epoch": 1.762625620150108, + "ewc_loss": 0.008063814602792263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.063814311753958e-05, + "grad_norm": 3.928382635116577, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8796215057373047, + "num_tokens": 528432139.0, + "step": 13856 + }, + { + "epoch": 1.7627528304286986, + "ewc_loss": 0.007995829917490482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.995829946594313e-05, + "grad_norm": 3.9874746799468994, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8867542743682861, + "num_tokens": 528468315.0, + "step": 13857 + }, + { + "epoch": 1.762880040707289, + "ewc_loss": 0.008023448288440704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023448754101992e-05, + "grad_norm": 3.9199869632720947, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8659871220588684, + "num_tokens": 528510368.0, + "step": 13858 + }, + { + "epoch": 1.7630072509858796, + "ewc_loss": 0.00799611210823059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.996112253749743e-05, + "grad_norm": 3.979043960571289, + "learning_rate": 1e-06, + "loss": 0.347, + 
"mean_token_accuracy": 0.8794271945953369, + "num_tokens": 528545637.0, + "step": 13859 + }, + { + "epoch": 1.7631344612644702, + "ewc_loss": 0.008028378710150719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.028378942981362e-05, + "grad_norm": 3.9475221633911133, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8814497590065002, + "num_tokens": 528586267.0, + "step": 13860 + }, + { + "epoch": 1.7632616715430607, + "ewc_loss": 0.00798447523266077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.984475087141618e-05, + "grad_norm": 3.936023235321045, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8789335489273071, + "num_tokens": 528629640.0, + "step": 13861 + }, + { + "epoch": 1.7633888818216512, + "ewc_loss": 0.00797311868518591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9731187724974e-05, + "grad_norm": 3.9780795574188232, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8624428510665894, + "num_tokens": 528665373.0, + "step": 13862 + }, + { + "epoch": 1.7635160921002417, + "ewc_loss": 0.008010230958461761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.010230521904305e-05, + "grad_norm": 3.96298885345459, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8898228406906128, + "num_tokens": 528703426.0, + "step": 13863 + }, + { + "epoch": 1.7636433023788323, + "ewc_loss": 0.007973707281053066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.973707397468388e-05, + "grad_norm": 3.990535020828247, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8585901260375977, + "num_tokens": 528744886.0, + "step": 13864 + }, + { + "epoch": 1.7637705126574228, + "ewc_loss": 0.008003472350537777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003472612472251e-05, + "grad_norm": 3.9583916664123535, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8877822160720825, + "num_tokens": 528779773.0, + "step": 13865 + }, + { + "epoch": 
1.7638977229360133, + "ewc_loss": 0.007968148216605186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.968148565851152e-05, + "grad_norm": 3.9456787109375, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8783231973648071, + "num_tokens": 528821878.0, + "step": 13866 + }, + { + "epoch": 1.7640249332146039, + "ewc_loss": 0.007981033995747566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.98103355919011e-05, + "grad_norm": 3.9928412437438965, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8789161443710327, + "num_tokens": 528858056.0, + "step": 13867 + }, + { + "epoch": 1.7641521434931944, + "ewc_loss": 0.007998205721378326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.998205546755344e-05, + "grad_norm": 3.9168243408203125, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8732587099075317, + "num_tokens": 528903116.0, + "step": 13868 + }, + { + "epoch": 1.764279353771785, + "ewc_loss": 0.00794264767318964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.942647789604962e-05, + "grad_norm": 3.987196207046509, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8709814548492432, + "num_tokens": 528941561.0, + "step": 13869 + }, + { + "epoch": 1.7644065640503754, + "ewc_loss": 0.007998785935342312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.998786168172956e-05, + "grad_norm": 3.9412827491760254, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8929973840713501, + "num_tokens": 528976215.0, + "step": 13870 + }, + { + "epoch": 1.7645337743289657, + "ewc_loss": 0.007952438667416573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952439045766369e-05, + "grad_norm": 3.9012885093688965, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8835709095001221, + "num_tokens": 529019280.0, + "step": 13871 + }, + { + "epoch": 1.7646609846075563, + "ewc_loss": 0.007941442541778088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.941442163428292e-05, + "grad_norm": 3.939984083175659, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.880124032497406, + "num_tokens": 529060733.0, + "step": 13872 + }, + { + "epoch": 1.7647881948861468, + "ewc_loss": 0.00797769520431757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977695349836722e-05, + "grad_norm": 4.000321388244629, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8794351816177368, + "num_tokens": 529094948.0, + "step": 13873 + }, + { + "epoch": 1.7649154051647373, + "ewc_loss": 0.007997320033609867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997320062713698e-05, + "grad_norm": 3.960329055786133, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8878026008605957, + "num_tokens": 529131374.0, + "step": 13874 + }, + { + "epoch": 1.7650426154433279, + "ewc_loss": 0.007952824234962463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952823943924159e-05, + "grad_norm": 3.96622371673584, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8742226362228394, + "num_tokens": 529169133.0, + "step": 13875 + }, + { + "epoch": 1.7651698257219184, + "ewc_loss": 0.00798759888857603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.987598655745387e-05, + "grad_norm": 3.9749934673309326, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8740686178207397, + "num_tokens": 529206467.0, + "step": 13876 + }, + { + "epoch": 1.7652970360005087, + "ewc_loss": 0.007972207851707935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.972207822604105e-05, + "grad_norm": 3.957166910171509, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8666870594024658, + "num_tokens": 529244891.0, + "step": 13877 + }, + { + "epoch": 1.7654242462790992, + "ewc_loss": 0.007965970784425735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.965970871737227e-05, + "grad_norm": 4.04681396484375, + "learning_rate": 1e-06, + "loss": 0.3543, + 
"mean_token_accuracy": 0.8784353733062744, + "num_tokens": 529277384.0, + "step": 13878 + }, + { + "epoch": 1.7655514565576897, + "ewc_loss": 0.0080275759100914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.027575677260756e-05, + "grad_norm": 3.8881967067718506, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8792611360549927, + "num_tokens": 529322530.0, + "step": 13879 + }, + { + "epoch": 1.7656786668362803, + "ewc_loss": 0.007913075387477875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.913075387477875e-05, + "grad_norm": 4.012059211730957, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8726999759674072, + "num_tokens": 529356345.0, + "step": 13880 + }, + { + "epoch": 1.7658058771148708, + "ewc_loss": 0.00805753841996193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.057538070715964e-05, + "grad_norm": 3.9939520359039307, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8784732818603516, + "num_tokens": 529391877.0, + "step": 13881 + }, + { + "epoch": 1.7659330873934613, + "ewc_loss": 0.007963982410728931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96398235252127e-05, + "grad_norm": 3.9753592014312744, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8838765621185303, + "num_tokens": 529425460.0, + "step": 13882 + }, + { + "epoch": 1.7660602976720519, + "ewc_loss": 0.00797200296074152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.972002640599385e-05, + "grad_norm": 3.981867790222168, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8680131435394287, + "num_tokens": 529459660.0, + "step": 13883 + }, + { + "epoch": 1.7661875079506424, + "ewc_loss": 0.008014660328626633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01466012489982e-05, + "grad_norm": 3.9451520442962646, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8680110573768616, + "num_tokens": 529501839.0, + "step": 13884 + }, + { + "epoch": 
1.766314718229233, + "ewc_loss": 0.007985026575624943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.985026604728773e-05, + "grad_norm": 3.9344944953918457, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8738349676132202, + "num_tokens": 529540828.0, + "step": 13885 + }, + { + "epoch": 1.7664419285078234, + "ewc_loss": 0.007974843494594097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.974843902047724e-05, + "grad_norm": 3.9632174968719482, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8758350014686584, + "num_tokens": 529581578.0, + "step": 13886 + }, + { + "epoch": 1.766569138786414, + "ewc_loss": 0.007992874830961227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992874452611431e-05, + "grad_norm": 3.927680492401123, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8775103092193604, + "num_tokens": 529624363.0, + "step": 13887 + }, + { + "epoch": 1.7666963490650045, + "ewc_loss": 0.007964328862726688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.964328688103706e-05, + "grad_norm": 3.923903703689575, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8836896419525146, + "num_tokens": 529665286.0, + "step": 13888 + }, + { + "epoch": 1.766823559343595, + "ewc_loss": 0.007973591797053814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.973591709742323e-05, + "grad_norm": 3.9730327129364014, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8733400702476501, + "num_tokens": 529704150.0, + "step": 13889 + }, + { + "epoch": 1.7669507696221856, + "ewc_loss": 0.007996764965355396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.996764907147735e-05, + "grad_norm": 3.94085955619812, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8792665004730225, + "num_tokens": 529742110.0, + "step": 13890 + }, + { + "epoch": 1.767077979900776, + "ewc_loss": 0.007971868850290775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.971868762979284e-05, + "grad_norm": 3.981595993041992, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8756674528121948, + "num_tokens": 529776158.0, + "step": 13891 + }, + { + "epoch": 1.7672051901793666, + "ewc_loss": 0.008005156181752682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.005156269064173e-05, + "grad_norm": 4.003028869628906, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8547300696372986, + "num_tokens": 529814002.0, + "step": 13892 + }, + { + "epoch": 1.7673324004579571, + "ewc_loss": 0.008001337759196758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00133784650825e-05, + "grad_norm": 3.9824647903442383, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8800840973854065, + "num_tokens": 529846696.0, + "step": 13893 + }, + { + "epoch": 1.7674596107365477, + "ewc_loss": 0.007984844036400318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.984843978192657e-05, + "grad_norm": 3.9770069122314453, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8740667104721069, + "num_tokens": 529881557.0, + "step": 13894 + }, + { + "epoch": 1.767586821015138, + "ewc_loss": 0.00799794401973486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997943612281233e-05, + "grad_norm": 3.95111083984375, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8649377226829529, + "num_tokens": 529924337.0, + "step": 13895 + }, + { + "epoch": 1.7677140312937285, + "ewc_loss": 0.008001139387488365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.001139212865382e-05, + "grad_norm": 3.9274840354919434, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8765695691108704, + "num_tokens": 529967195.0, + "step": 13896 + }, + { + "epoch": 1.767841241572319, + "ewc_loss": 0.00799842830747366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.998428191058338e-05, + "grad_norm": 4.0127272605896, + "learning_rate": 1e-06, + "loss": 0.4141, + 
"mean_token_accuracy": 0.8569049835205078, + "num_tokens": 530003454.0, + "step": 13897 + }, + { + "epoch": 1.7679684518509096, + "ewc_loss": 0.00802794098854065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.027940930332989e-05, + "grad_norm": 3.9597725868225098, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.851271390914917, + "num_tokens": 530042577.0, + "step": 13898 + }, + { + "epoch": 1.7680956621295, + "ewc_loss": 0.00799638219177723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.996382191777229e-05, + "grad_norm": 3.96313738822937, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8604600429534912, + "num_tokens": 530084903.0, + "step": 13899 + }, + { + "epoch": 1.7682228724080906, + "ewc_loss": 0.008018335327506065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018335211090744e-05, + "grad_norm": 4.013859748840332, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8714905977249146, + "num_tokens": 530117598.0, + "step": 13900 + }, + { + "epoch": 1.768350082686681, + "ewc_loss": 0.008035522885620594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035523205762729e-05, + "grad_norm": 3.954634189605713, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8794610500335693, + "num_tokens": 530151891.0, + "step": 13901 + }, + { + "epoch": 1.7684772929652715, + "ewc_loss": 0.007992859929800034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992859900696203e-05, + "grad_norm": 3.9099972248077393, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8683112859725952, + "num_tokens": 530191263.0, + "step": 13902 + }, + { + "epoch": 1.768604503243862, + "ewc_loss": 0.008003102615475655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00310299382545e-05, + "grad_norm": 3.960977792739868, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8709003925323486, + "num_tokens": 530231156.0, + "step": 13903 + }, + { + "epoch": 
1.7687317135224525, + "ewc_loss": 0.008043181151151657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.043181151151657e-05, + "grad_norm": 3.9353208541870117, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8890185952186584, + "num_tokens": 530270089.0, + "step": 13904 + }, + { + "epoch": 1.768858923801043, + "ewc_loss": 0.00801238976418972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.012389298528433e-05, + "grad_norm": 3.9521548748016357, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8796164989471436, + "num_tokens": 530309808.0, + "step": 13905 + }, + { + "epoch": 1.7689861340796336, + "ewc_loss": 0.008024858310818672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.024858107091859e-05, + "grad_norm": 3.9258694648742676, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8657804727554321, + "num_tokens": 530357723.0, + "step": 13906 + }, + { + "epoch": 1.769113344358224, + "ewc_loss": 0.008001030422747135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.001030801096931e-05, + "grad_norm": 4.068218231201172, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8622462749481201, + "num_tokens": 530389586.0, + "step": 13907 + }, + { + "epoch": 1.7692405546368146, + "ewc_loss": 0.008103888481855392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.103888831101358e-05, + "grad_norm": 4.003105163574219, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8664340972900391, + "num_tokens": 530430356.0, + "step": 13908 + }, + { + "epoch": 1.7693677649154052, + "ewc_loss": 0.007993748411536217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.993748295120895e-05, + "grad_norm": 3.9026336669921875, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8810497522354126, + "num_tokens": 530472859.0, + "step": 13909 + }, + { + "epoch": 1.7694949751939957, + "ewc_loss": 0.007976281456649303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.976281631272286e-05, + "grad_norm": 3.984557867050171, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8633143901824951, + "num_tokens": 530515422.0, + "step": 13910 + }, + { + "epoch": 1.7696221854725862, + "ewc_loss": 0.008038287982344627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.038288069656119e-05, + "grad_norm": 3.9261574745178223, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8729069232940674, + "num_tokens": 530553842.0, + "step": 13911 + }, + { + "epoch": 1.7697493957511767, + "ewc_loss": 0.007970236241817474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.970236038090661e-05, + "grad_norm": 3.971360445022583, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8740286827087402, + "num_tokens": 530592593.0, + "step": 13912 + }, + { + "epoch": 1.7698766060297673, + "ewc_loss": 0.007998323999345303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.998324144864455e-05, + "grad_norm": 3.9569966793060303, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8693084716796875, + "num_tokens": 530632309.0, + "step": 13913 + }, + { + "epoch": 1.7700038163083578, + "ewc_loss": 0.007996066473424435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.996066415216774e-05, + "grad_norm": 4.020626544952393, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.870234489440918, + "num_tokens": 530667415.0, + "step": 13914 + }, + { + "epoch": 1.7701310265869483, + "ewc_loss": 0.008021198213100433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.021198300411925e-05, + "grad_norm": 3.9528467655181885, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8735355138778687, + "num_tokens": 530705736.0, + "step": 13915 + }, + { + "epoch": 1.7702582368655388, + "ewc_loss": 0.007941148243844509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.941148214740679e-05, + "grad_norm": 3.945479393005371, + "learning_rate": 1e-06, + "loss": 0.3371, + 
"mean_token_accuracy": 0.8858579397201538, + "num_tokens": 530744931.0, + "step": 13916 + }, + { + "epoch": 1.7703854471441294, + "ewc_loss": 0.007972258143126965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.972258026711643e-05, + "grad_norm": 3.886445999145508, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8866679668426514, + "num_tokens": 530784700.0, + "step": 13917 + }, + { + "epoch": 1.77051265742272, + "ewc_loss": 0.007936552166938782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.936551992315799e-05, + "grad_norm": 3.9159557819366455, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8908906579017639, + "num_tokens": 530821097.0, + "step": 13918 + }, + { + "epoch": 1.7706398677013102, + "ewc_loss": 0.007969181053340435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.969181024236605e-05, + "grad_norm": 3.980170726776123, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8689343333244324, + "num_tokens": 530860444.0, + "step": 13919 + }, + { + "epoch": 1.7707670779799007, + "ewc_loss": 0.00797585491091013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.975855260156095e-05, + "grad_norm": 3.876535415649414, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8724790811538696, + "num_tokens": 530907304.0, + "step": 13920 + }, + { + "epoch": 1.7708942882584913, + "ewc_loss": 0.00789648201316595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.896481838542968e-05, + "grad_norm": 4.00330924987793, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8692931532859802, + "num_tokens": 530945692.0, + "step": 13921 + }, + { + "epoch": 1.7710214985370818, + "ewc_loss": 0.00802671443670988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026714203879237e-05, + "grad_norm": 3.9309682846069336, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8874031901359558, + "num_tokens": 530986695.0, + "step": 13922 + }, + { + "epoch": 
1.7711487088156723, + "ewc_loss": 0.007921699434518814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.921699580037966e-05, + "grad_norm": 3.943559408187866, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.887248694896698, + "num_tokens": 531022656.0, + "step": 13923 + }, + { + "epoch": 1.7712759190942629, + "ewc_loss": 0.007953601889312267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.953601743793115e-05, + "grad_norm": 3.9829695224761963, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8831447958946228, + "num_tokens": 531056896.0, + "step": 13924 + }, + { + "epoch": 1.7714031293728534, + "ewc_loss": 0.007947354577481747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.947354606585577e-05, + "grad_norm": 3.994023561477661, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8508217334747314, + "num_tokens": 531097286.0, + "step": 13925 + }, + { + "epoch": 1.7715303396514437, + "ewc_loss": 0.007947295904159546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.947295671328902e-05, + "grad_norm": 3.9621191024780273, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8821680545806885, + "num_tokens": 531136430.0, + "step": 13926 + }, + { + "epoch": 1.7716575499300342, + "ewc_loss": 0.007924928329885006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.924928650027141e-05, + "grad_norm": 3.974431276321411, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.867890477180481, + "num_tokens": 531176698.0, + "step": 13927 + }, + { + "epoch": 1.7717847602086247, + "ewc_loss": 0.007939368486404419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.93936851550825e-05, + "grad_norm": 3.9727041721343994, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.874325156211853, + "num_tokens": 531214343.0, + "step": 13928 + }, + { + "epoch": 1.7719119704872153, + "ewc_loss": 0.007932798936963081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.932799053378403e-05, + "grad_norm": 3.961790084838867, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8860695362091064, + "num_tokens": 531252325.0, + "step": 13929 + }, + { + "epoch": 1.7720391807658058, + "ewc_loss": 0.007932859472930431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.932859443826601e-05, + "grad_norm": 4.017727851867676, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8643782138824463, + "num_tokens": 531290378.0, + "step": 13930 + }, + { + "epoch": 1.7721663910443963, + "ewc_loss": 0.007956638000905514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.956638000905514e-05, + "grad_norm": 3.965014696121216, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8679272532463074, + "num_tokens": 531330406.0, + "step": 13931 + }, + { + "epoch": 1.7722936013229869, + "ewc_loss": 0.007919300347566605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.919299969216809e-05, + "grad_norm": 3.9581401348114014, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8766306042671204, + "num_tokens": 531367425.0, + "step": 13932 + }, + { + "epoch": 1.7724208116015774, + "ewc_loss": 0.007934140972793102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.934140739962459e-05, + "grad_norm": 3.9341065883636475, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8958083391189575, + "num_tokens": 531406465.0, + "step": 13933 + }, + { + "epoch": 1.772548021880168, + "ewc_loss": 0.00794055126607418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.940551586216316e-05, + "grad_norm": 4.004122257232666, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8655678033828735, + "num_tokens": 531445356.0, + "step": 13934 + }, + { + "epoch": 1.7726752321587584, + "ewc_loss": 0.007982082664966583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.982082752278075e-05, + "grad_norm": 3.9451053142547607, + "learning_rate": 1e-06, + "loss": 0.3644, + 
"mean_token_accuracy": 0.8733644485473633, + "num_tokens": 531484737.0, + "step": 13935 + }, + { + "epoch": 1.772802442437349, + "ewc_loss": 0.007919248193502426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.919248309917748e-05, + "grad_norm": 3.9510648250579834, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8717214465141296, + "num_tokens": 531523453.0, + "step": 13936 + }, + { + "epoch": 1.7729296527159395, + "ewc_loss": 0.007946624420583248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.946624100441113e-05, + "grad_norm": 3.9302780628204346, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8814985156059265, + "num_tokens": 531559627.0, + "step": 13937 + }, + { + "epoch": 1.77305686299453, + "ewc_loss": 0.00794257316738367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.942573574837297e-05, + "grad_norm": 3.96779727935791, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8799591660499573, + "num_tokens": 531597677.0, + "step": 13938 + }, + { + "epoch": 1.7731840732731206, + "ewc_loss": 0.007961578667163849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.961579103721306e-05, + "grad_norm": 3.8991644382476807, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8835362792015076, + "num_tokens": 531642753.0, + "step": 13939 + }, + { + "epoch": 1.773311283551711, + "ewc_loss": 0.007914095185697079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.914095476735383e-05, + "grad_norm": 3.9816484451293945, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8691229820251465, + "num_tokens": 531688249.0, + "step": 13940 + }, + { + "epoch": 1.7734384938303016, + "ewc_loss": 0.007986093871295452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.986093987710774e-05, + "grad_norm": 3.940225601196289, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8879096508026123, + "num_tokens": 531727994.0, + "step": 13941 + }, + { + "epoch": 
1.7735657041088921, + "ewc_loss": 0.007909947074949741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.909947453299537e-05, + "grad_norm": 3.972775459289551, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8782380819320679, + "num_tokens": 531764428.0, + "step": 13942 + }, + { + "epoch": 1.7736929143874827, + "ewc_loss": 0.007943423464894295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.943423406686634e-05, + "grad_norm": 3.9342539310455322, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8714637756347656, + "num_tokens": 531806604.0, + "step": 13943 + }, + { + "epoch": 1.773820124666073, + "ewc_loss": 0.007918724790215492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.918724440969527e-05, + "grad_norm": 3.967041492462158, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.893802285194397, + "num_tokens": 531845504.0, + "step": 13944 + }, + { + "epoch": 1.7739473349446635, + "ewc_loss": 0.007935655303299427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.93565486674197e-05, + "grad_norm": 3.98934006690979, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8845868706703186, + "num_tokens": 531880962.0, + "step": 13945 + }, + { + "epoch": 1.774074545223254, + "ewc_loss": 0.007923870347440243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.923869998194277e-05, + "grad_norm": 3.9592888355255127, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8730467557907104, + "num_tokens": 531918182.0, + "step": 13946 + }, + { + "epoch": 1.7742017555018446, + "ewc_loss": 0.007893676869571209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8936769568827e-05, + "grad_norm": 3.963768482208252, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8764908313751221, + "num_tokens": 531958340.0, + "step": 13947 + }, + { + "epoch": 1.774328965780435, + "ewc_loss": 0.007929208688437939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.929209095891565e-05, + "grad_norm": 4.006550312042236, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8636528253555298, + "num_tokens": 531996951.0, + "step": 13948 + }, + { + "epoch": 1.7744561760590256, + "ewc_loss": 0.007936907932162285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.936907786643133e-05, + "grad_norm": 3.920285224914551, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8790984749794006, + "num_tokens": 532036963.0, + "step": 13949 + }, + { + "epoch": 1.774583386337616, + "ewc_loss": 0.0078944256529212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.8944256529212e-05, + "grad_norm": 3.997671365737915, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8707067966461182, + "num_tokens": 532070937.0, + "step": 13950 + }, + { + "epoch": 1.7747105966162064, + "ewc_loss": 0.007962451316416264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.962451491039246e-05, + "grad_norm": 3.9764959812164307, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8629451990127563, + "num_tokens": 532107866.0, + "step": 13951 + }, + { + "epoch": 1.774837806894797, + "ewc_loss": 0.007935981266200542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935980829643086e-05, + "grad_norm": 3.932924747467041, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.868155837059021, + "num_tokens": 532151485.0, + "step": 13952 + }, + { + "epoch": 1.7749650171733875, + "ewc_loss": 0.007925021462142467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.925021054688841e-05, + "grad_norm": 3.9486594200134277, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8817447423934937, + "num_tokens": 532191209.0, + "step": 13953 + }, + { + "epoch": 1.775092227451978, + "ewc_loss": 0.007942735217511654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.942735101096332e-05, + "grad_norm": 3.9427859783172607, + "learning_rate": 1e-06, + "loss": 0.3649, + 
"mean_token_accuracy": 0.8719826340675354, + "num_tokens": 532232219.0, + "step": 13954 + }, + { + "epoch": 1.7752194377305686, + "ewc_loss": 0.007939476519823074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.939476927276701e-05, + "grad_norm": 4.068309783935547, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8628331422805786, + "num_tokens": 532265330.0, + "step": 13955 + }, + { + "epoch": 1.775346648009159, + "ewc_loss": 0.008012204430997372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.012204489205033e-05, + "grad_norm": 4.050467014312744, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.872942328453064, + "num_tokens": 532295964.0, + "step": 13956 + }, + { + "epoch": 1.7754738582877496, + "ewc_loss": 0.007977023720741272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977023778948933e-05, + "grad_norm": 3.9775047302246094, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8784958124160767, + "num_tokens": 532333804.0, + "step": 13957 + }, + { + "epoch": 1.7756010685663401, + "ewc_loss": 0.007963999174535275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.963999087223783e-05, + "grad_norm": 3.9924681186676025, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8796950578689575, + "num_tokens": 532373088.0, + "step": 13958 + }, + { + "epoch": 1.7757282788449307, + "ewc_loss": 0.008002596907317638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.002596587175503e-05, + "grad_norm": 3.9897305965423584, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.872570276260376, + "num_tokens": 532409473.0, + "step": 13959 + }, + { + "epoch": 1.7758554891235212, + "ewc_loss": 0.00799519568681717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.995195483090356e-05, + "grad_norm": 3.9003634452819824, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8854151964187622, + "num_tokens": 532447883.0, + "step": 13960 + }, + { + "epoch": 
1.7759826994021117, + "ewc_loss": 0.007956724613904953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.956724584801123e-05, + "grad_norm": 3.9622368812561035, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8804907202720642, + "num_tokens": 532485783.0, + "step": 13961 + }, + { + "epoch": 1.7761099096807023, + "ewc_loss": 0.008029691874980927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.029692253330722e-05, + "grad_norm": 4.108781337738037, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8821335434913635, + "num_tokens": 532511082.0, + "step": 13962 + }, + { + "epoch": 1.7762371199592928, + "ewc_loss": 0.008081231266260147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.081231499090791e-05, + "grad_norm": 3.896027088165283, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8910539746284485, + "num_tokens": 532555516.0, + "step": 13963 + }, + { + "epoch": 1.7763643302378833, + "ewc_loss": 0.007930195890367031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.930195715744048e-05, + "grad_norm": 3.994731903076172, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8601972460746765, + "num_tokens": 532599200.0, + "step": 13964 + }, + { + "epoch": 1.7764915405164738, + "ewc_loss": 0.008062383159995079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.062383130891249e-05, + "grad_norm": 3.957495927810669, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8854935169219971, + "num_tokens": 532636335.0, + "step": 13965 + }, + { + "epoch": 1.7766187507950644, + "ewc_loss": 0.00800729263573885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007292490219697e-05, + "grad_norm": 3.9437849521636963, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8875715732574463, + "num_tokens": 532678337.0, + "step": 13966 + }, + { + "epoch": 1.776745961073655, + "ewc_loss": 0.00799853540956974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.998535875231028e-05, + "grad_norm": 3.9911322593688965, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8714883923530579, + "num_tokens": 532715051.0, + "step": 13967 + }, + { + "epoch": 1.7768731713522452, + "ewc_loss": 0.008050499483942986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050499309320003e-05, + "grad_norm": 3.9808194637298584, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8859297633171082, + "num_tokens": 532750836.0, + "step": 13968 + }, + { + "epoch": 1.7770003816308357, + "ewc_loss": 0.008006996475160122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.0069963587448e-05, + "grad_norm": 3.936556100845337, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.881786048412323, + "num_tokens": 532790331.0, + "step": 13969 + }, + { + "epoch": 1.7771275919094263, + "ewc_loss": 0.007997826673090458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997826469363645e-05, + "grad_norm": 3.969299554824829, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8714148998260498, + "num_tokens": 532830169.0, + "step": 13970 + }, + { + "epoch": 1.7772548021880168, + "ewc_loss": 0.008030423894524574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.030424214666709e-05, + "grad_norm": 3.963594675064087, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8681346774101257, + "num_tokens": 532865957.0, + "step": 13971 + }, + { + "epoch": 1.7773820124666073, + "ewc_loss": 0.008003667928278446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003668335732073e-05, + "grad_norm": 3.948657512664795, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8725513219833374, + "num_tokens": 532906112.0, + "step": 13972 + }, + { + "epoch": 1.7775092227451978, + "ewc_loss": 0.007987522520124912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.987522258190438e-05, + "grad_norm": 4.031325340270996, + "learning_rate": 1e-06, + "loss": 0.3521, + 
"mean_token_accuracy": 0.8820397853851318, + "num_tokens": 532939676.0, + "step": 13973 + }, + { + "epoch": 1.7776364330237884, + "ewc_loss": 0.00803352054208517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.033520134631544e-05, + "grad_norm": 3.9930310249328613, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8584694266319275, + "num_tokens": 532977668.0, + "step": 13974 + }, + { + "epoch": 1.7777636433023787, + "ewc_loss": 0.007980598136782646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.980598456924781e-05, + "grad_norm": 3.954779624938965, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8744436502456665, + "num_tokens": 533013862.0, + "step": 13975 + }, + { + "epoch": 1.7778908535809692, + "ewc_loss": 0.00798540748655796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.985407864907756e-05, + "grad_norm": 3.9769608974456787, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8642063140869141, + "num_tokens": 533051222.0, + "step": 13976 + }, + { + "epoch": 1.7780180638595597, + "ewc_loss": 0.00800899975001812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.008999429875985e-05, + "grad_norm": 3.9622299671173096, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8736105561256409, + "num_tokens": 533090182.0, + "step": 13977 + }, + { + "epoch": 1.7781452741381503, + "ewc_loss": 0.00799126923084259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.991269376361743e-05, + "grad_norm": 3.9257590770721436, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8887660503387451, + "num_tokens": 533129600.0, + "step": 13978 + }, + { + "epoch": 1.7782724844167408, + "ewc_loss": 0.007987285032868385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.987285061972216e-05, + "grad_norm": 3.950352668762207, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8743253350257874, + "num_tokens": 533172351.0, + "step": 13979 + }, + { + "epoch": 
1.7783996946953313, + "ewc_loss": 0.00800252240151167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.002522372407839e-05, + "grad_norm": 3.962613105773926, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8727378249168396, + "num_tokens": 533211583.0, + "step": 13980 + }, + { + "epoch": 1.7785269049739219, + "ewc_loss": 0.007993066683411598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.993066537892446e-05, + "grad_norm": 3.950469970703125, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8804808855056763, + "num_tokens": 533249960.0, + "step": 13981 + }, + { + "epoch": 1.7786541152525124, + "ewc_loss": 0.007987995631992817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.987995923031121e-05, + "grad_norm": 3.9479477405548096, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8854194283485413, + "num_tokens": 533287328.0, + "step": 13982 + }, + { + "epoch": 1.778781325531103, + "ewc_loss": 0.00798251572996378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.982515671756119e-05, + "grad_norm": 3.950143575668335, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8749222755432129, + "num_tokens": 533327772.0, + "step": 13983 + }, + { + "epoch": 1.7789085358096934, + "ewc_loss": 0.00797079224139452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.970792648848146e-05, + "grad_norm": 3.9109981060028076, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8695545196533203, + "num_tokens": 533372004.0, + "step": 13984 + }, + { + "epoch": 1.779035746088284, + "ewc_loss": 0.007974970154464245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.974969776114449e-05, + "grad_norm": 4.0211181640625, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8747503161430359, + "num_tokens": 533409502.0, + "step": 13985 + }, + { + "epoch": 1.7791629563668745, + "ewc_loss": 0.008050906471908092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.050906762946397e-05, + "grad_norm": 3.9616005420684814, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8637360334396362, + "num_tokens": 533451160.0, + "step": 13986 + }, + { + "epoch": 1.779290166645465, + "ewc_loss": 0.007979676127433777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.979675865499303e-05, + "grad_norm": 4.018225193023682, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8831369876861572, + "num_tokens": 533480413.0, + "step": 13987 + }, + { + "epoch": 1.7794173769240555, + "ewc_loss": 0.00803716853260994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.037168299779296e-05, + "grad_norm": 3.972064256668091, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8639772534370422, + "num_tokens": 533518826.0, + "step": 13988 + }, + { + "epoch": 1.779544587202646, + "ewc_loss": 0.007981695234775543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.981694943737239e-05, + "grad_norm": 3.939464569091797, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8645011186599731, + "num_tokens": 533563760.0, + "step": 13989 + }, + { + "epoch": 1.7796717974812366, + "ewc_loss": 0.007989616133272648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9896162787918e-05, + "grad_norm": 3.968804121017456, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8763801455497742, + "num_tokens": 533601499.0, + "step": 13990 + }, + { + "epoch": 1.7797990077598271, + "ewc_loss": 0.008033656515181065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03365619503893e-05, + "grad_norm": 3.948209524154663, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8798006176948547, + "num_tokens": 533640757.0, + "step": 13991 + }, + { + "epoch": 1.7799262180384177, + "ewc_loss": 0.0079858573153615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.985857519088313e-05, + "grad_norm": 3.998830795288086, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 
0.8778669834136963, + "num_tokens": 533678047.0, + "step": 13992 + }, + { + "epoch": 1.780053428317008, + "ewc_loss": 0.008023894391953945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023894770303741e-05, + "grad_norm": 3.9146313667297363, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8755546808242798, + "num_tokens": 533720903.0, + "step": 13993 + }, + { + "epoch": 1.7801806385955985, + "ewc_loss": 0.007961954921483994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.961954543134198e-05, + "grad_norm": 3.967531204223633, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8772925734519958, + "num_tokens": 533759720.0, + "step": 13994 + }, + { + "epoch": 1.780307848874189, + "ewc_loss": 0.008013383485376835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01338319433853e-05, + "grad_norm": 3.9841721057891846, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8649044036865234, + "num_tokens": 533796122.0, + "step": 13995 + }, + { + "epoch": 1.7804350591527796, + "ewc_loss": 0.007998217828571796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.998217915883288e-05, + "grad_norm": 3.947953462600708, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8814665079116821, + "num_tokens": 533836326.0, + "step": 13996 + }, + { + "epoch": 1.78056226943137, + "ewc_loss": 0.007980373688042164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.980373629834503e-05, + "grad_norm": 4.01204776763916, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8874870538711548, + "num_tokens": 533867874.0, + "step": 13997 + }, + { + "epoch": 1.7806894797099606, + "ewc_loss": 0.00802184734493494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02184731583111e-05, + "grad_norm": 4.005260944366455, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8885602951049805, + "num_tokens": 533902877.0, + "step": 13998 + }, + { + "epoch": 1.780816689988551, + "ewc_loss": 
0.007993623614311218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.993623876245692e-05, + "grad_norm": 3.998206377029419, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8786094188690186, + "num_tokens": 533937440.0, + "step": 13999 + }, + { + "epoch": 1.7809439002671414, + "ewc_loss": 0.007986965589225292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.986965647432953e-05, + "grad_norm": 3.953024387359619, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.884380578994751, + "num_tokens": 533972308.0, + "step": 14000 + }, + { + "epoch": 1.781071110545732, + "ewc_loss": 0.007983855903148651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.983855903148651e-05, + "grad_norm": 3.994843006134033, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.863598644733429, + "num_tokens": 534011913.0, + "step": 14001 + }, + { + "epoch": 1.7811983208243225, + "ewc_loss": 0.008003631606698036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003631955944002e-05, + "grad_norm": 3.9179794788360596, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8860688209533691, + "num_tokens": 534057235.0, + "step": 14002 + }, + { + "epoch": 1.781325531102913, + "ewc_loss": 0.007962379604578018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.962379459058866e-05, + "grad_norm": 3.998246908187866, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8759536743164062, + "num_tokens": 534092850.0, + "step": 14003 + }, + { + "epoch": 1.7814527413815036, + "ewc_loss": 0.00802368950098753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023689588299021e-05, + "grad_norm": 3.924729824066162, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8807159662246704, + "num_tokens": 534135310.0, + "step": 14004 + }, + { + "epoch": 1.781579951660094, + "ewc_loss": 0.00794888287782669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.948882557684556e-05, + "grad_norm": 
3.9530463218688965, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8736865520477295, + "num_tokens": 534176008.0, + "step": 14005 + }, + { + "epoch": 1.7817071619386846, + "ewc_loss": 0.007995635271072388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.995634950930253e-05, + "grad_norm": 4.01132345199585, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8764423131942749, + "num_tokens": 534208217.0, + "step": 14006 + }, + { + "epoch": 1.7818343722172751, + "ewc_loss": 0.008022690191864967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.022690599318594e-05, + "grad_norm": 3.9304702281951904, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8801639080047607, + "num_tokens": 534247564.0, + "step": 14007 + }, + { + "epoch": 1.7819615824958657, + "ewc_loss": 0.00795260164886713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952602027216926e-05, + "grad_norm": 4.050118923187256, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8569883704185486, + "num_tokens": 534281144.0, + "step": 14008 + }, + { + "epoch": 1.7820887927744562, + "ewc_loss": 0.008073544129729271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.073544449871406e-05, + "grad_norm": 4.017951011657715, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8649073839187622, + "num_tokens": 534317893.0, + "step": 14009 + }, + { + "epoch": 1.7822160030530467, + "ewc_loss": 0.007994980551302433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.994980842340738e-05, + "grad_norm": 4.009211540222168, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8744735717773438, + "num_tokens": 534356116.0, + "step": 14010 + }, + { + "epoch": 1.7823432133316373, + "ewc_loss": 0.00801814440637827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018144581001252e-05, + "grad_norm": 4.02005672454834, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8517177104949951, + 
"num_tokens": 534390499.0, + "step": 14011 + }, + { + "epoch": 1.7824704236102278, + "ewc_loss": 0.008060336112976074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060335676418617e-05, + "grad_norm": 4.000370502471924, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8700236082077026, + "num_tokens": 534424242.0, + "step": 14012 + }, + { + "epoch": 1.7825976338888183, + "ewc_loss": 0.008055070415139198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055070793488994e-05, + "grad_norm": 3.935687780380249, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8670271039009094, + "num_tokens": 534467465.0, + "step": 14013 + }, + { + "epoch": 1.7827248441674088, + "ewc_loss": 0.008010891266167164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.010891178855672e-05, + "grad_norm": 3.979753017425537, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8706070780754089, + "num_tokens": 534502959.0, + "step": 14014 + }, + { + "epoch": 1.7828520544459994, + "ewc_loss": 0.008070011623203754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.07001197244972e-05, + "grad_norm": 3.901414632797241, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8760015964508057, + "num_tokens": 534549888.0, + "step": 14015 + }, + { + "epoch": 1.78297926472459, + "ewc_loss": 0.008000529371201992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.000529487617314e-05, + "grad_norm": 3.9497063159942627, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.878839910030365, + "num_tokens": 534591686.0, + "step": 14016 + }, + { + "epoch": 1.7831064750031802, + "ewc_loss": 0.00807395949959755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.073959907051176e-05, + "grad_norm": 3.928574562072754, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.891373872756958, + "num_tokens": 534628584.0, + "step": 14017 + }, + { + "epoch": 1.7832336852817707, + "ewc_loss": 0.008035203441977501, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035203791223466e-05, + "grad_norm": 3.9883811473846436, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.867251455783844, + "num_tokens": 534665684.0, + "step": 14018 + }, + { + "epoch": 1.7833608955603613, + "ewc_loss": 0.008076249621808529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.07624965091236e-05, + "grad_norm": 3.977456569671631, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8773898482322693, + "num_tokens": 534701604.0, + "step": 14019 + }, + { + "epoch": 1.7834881058389518, + "ewc_loss": 0.008058958686888218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.058958337642252e-05, + "grad_norm": 3.9512288570404053, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8601410388946533, + "num_tokens": 534741765.0, + "step": 14020 + }, + { + "epoch": 1.7836153161175423, + "ewc_loss": 0.008034410886466503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03441071184352e-05, + "grad_norm": 3.9659154415130615, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8839058876037598, + "num_tokens": 534780331.0, + "step": 14021 + }, + { + "epoch": 1.7837425263961328, + "ewc_loss": 0.008067252114415169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.06725220172666e-05, + "grad_norm": 4.071836471557617, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.864378809928894, + "num_tokens": 534816254.0, + "step": 14022 + }, + { + "epoch": 1.7838697366747234, + "ewc_loss": 0.008105569519102573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.105569577310234e-05, + "grad_norm": 3.984384536743164, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8755756616592407, + "num_tokens": 534851367.0, + "step": 14023 + }, + { + "epoch": 1.7839969469533137, + "ewc_loss": 0.008037819527089596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.037819497985765e-05, + "grad_norm": 3.9264626502990723, + 
"learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8741233348846436, + "num_tokens": 534889077.0, + "step": 14024 + }, + { + "epoch": 1.7841241572319042, + "ewc_loss": 0.008045497350394726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.045497088460252e-05, + "grad_norm": 4.018840789794922, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8651753664016724, + "num_tokens": 534924249.0, + "step": 14025 + }, + { + "epoch": 1.7842513675104947, + "ewc_loss": 0.008118836209177971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.118836558423936e-05, + "grad_norm": 3.9606845378875732, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8607181310653687, + "num_tokens": 534966370.0, + "step": 14026 + }, + { + "epoch": 1.7843785777890853, + "ewc_loss": 0.008036894723773003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.036894723773003e-05, + "grad_norm": 3.9723639488220215, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8827751874923706, + "num_tokens": 535005869.0, + "step": 14027 + }, + { + "epoch": 1.7845057880676758, + "ewc_loss": 0.008068894036114216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068894385360181e-05, + "grad_norm": 3.983957290649414, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8742185831069946, + "num_tokens": 535042057.0, + "step": 14028 + }, + { + "epoch": 1.7846329983462663, + "ewc_loss": 0.008082644082605839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.082643762463704e-05, + "grad_norm": 4.048296928405762, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8698025941848755, + "num_tokens": 535074410.0, + "step": 14029 + }, + { + "epoch": 1.7847602086248568, + "ewc_loss": 0.008098343387246132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.098343823803589e-05, + "grad_norm": 3.9054670333862305, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8656296730041504, + "num_tokens": 
535120356.0, + "step": 14030 + }, + { + "epoch": 1.7848874189034474, + "ewc_loss": 0.008009579963982105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.009580051293597e-05, + "grad_norm": 4.133070468902588, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8671205639839172, + "num_tokens": 535156939.0, + "step": 14031 + }, + { + "epoch": 1.785014629182038, + "ewc_loss": 0.008214982226490974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.214982517529279e-05, + "grad_norm": 3.9588828086853027, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8749028444290161, + "num_tokens": 535195553.0, + "step": 14032 + }, + { + "epoch": 1.7851418394606284, + "ewc_loss": 0.008002337999641895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.002337563084438e-05, + "grad_norm": 3.9750216007232666, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8802734613418579, + "num_tokens": 535235692.0, + "step": 14033 + }, + { + "epoch": 1.785269049739219, + "ewc_loss": 0.008084407076239586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08440672699362e-05, + "grad_norm": 3.990741014480591, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8735385537147522, + "num_tokens": 535269265.0, + "step": 14034 + }, + { + "epoch": 1.7853962600178095, + "ewc_loss": 0.00808598380535841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.085983427008614e-05, + "grad_norm": 3.9307916164398193, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8901953101158142, + "num_tokens": 535303825.0, + "step": 14035 + }, + { + "epoch": 1.7855234702964, + "ewc_loss": 0.008030756376683712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.030756725929677e-05, + "grad_norm": 3.9794046878814697, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8702438473701477, + "num_tokens": 535338802.0, + "step": 14036 + }, + { + "epoch": 1.7856506805749905, + "ewc_loss": 0.00809209980070591, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.092099596979097e-05, + "grad_norm": 3.93745493888855, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8819887638092041, + "num_tokens": 535376592.0, + "step": 14037 + }, + { + "epoch": 1.785777890853581, + "ewc_loss": 0.008050922304391861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050922770053148e-05, + "grad_norm": 3.933803081512451, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8716403841972351, + "num_tokens": 535416528.0, + "step": 14038 + }, + { + "epoch": 1.7859051011321716, + "ewc_loss": 0.00805083941668272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050839096540585e-05, + "grad_norm": 4.016294002532959, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.870981752872467, + "num_tokens": 535450892.0, + "step": 14039 + }, + { + "epoch": 1.7860323114107621, + "ewc_loss": 0.008119761012494564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.119761332636699e-05, + "grad_norm": 3.983996629714966, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8778126239776611, + "num_tokens": 535488198.0, + "step": 14040 + }, + { + "epoch": 1.7861595216893527, + "ewc_loss": 0.008050190284848213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.0501900811214e-05, + "grad_norm": 4.010595321655273, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8739982843399048, + "num_tokens": 535526020.0, + "step": 14041 + }, + { + "epoch": 1.786286731967943, + "ewc_loss": 0.008086759597063065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.086759771686047e-05, + "grad_norm": 3.9575514793395996, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8798943758010864, + "num_tokens": 535560400.0, + "step": 14042 + }, + { + "epoch": 1.7864139422465335, + "ewc_loss": 0.008046477101743221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.046477159950882e-05, + "grad_norm": 4.035276412963867, + "learning_rate": 
1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8690074682235718, + "num_tokens": 535594699.0, + "step": 14043 + }, + { + "epoch": 1.786541152525124, + "ewc_loss": 0.008122452534735203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.122452709358186e-05, + "grad_norm": 3.96087384223938, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8796257972717285, + "num_tokens": 535627019.0, + "step": 14044 + }, + { + "epoch": 1.7866683628037145, + "ewc_loss": 0.008060187101364136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060187246883288e-05, + "grad_norm": 3.9288363456726074, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8842747211456299, + "num_tokens": 535667479.0, + "step": 14045 + }, + { + "epoch": 1.786795573082305, + "ewc_loss": 0.00805127527564764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.051274926401675e-05, + "grad_norm": 4.044490814208984, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8706948757171631, + "num_tokens": 535705211.0, + "step": 14046 + }, + { + "epoch": 1.7869227833608956, + "ewc_loss": 0.008136590011417866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.136589895002544e-05, + "grad_norm": 3.968203067779541, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.856299877166748, + "num_tokens": 535744907.0, + "step": 14047 + }, + { + "epoch": 1.787049993639486, + "ewc_loss": 0.008064426481723785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.064426947385073e-05, + "grad_norm": 3.942840337753296, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.874264121055603, + "num_tokens": 535782526.0, + "step": 14048 + }, + { + "epoch": 1.7871772039180764, + "ewc_loss": 0.008070243522524834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070243347901851e-05, + "grad_norm": 3.9043660163879395, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8898552656173706, + "num_tokens": 535824153.0, + "step": 14049 + }, + 
{ + "epoch": 1.787304414196667, + "ewc_loss": 0.008067682385444641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067682210821658e-05, + "grad_norm": 3.984665632247925, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8870952129364014, + "num_tokens": 535859019.0, + "step": 14050 + }, + { + "epoch": 1.7874316244752575, + "ewc_loss": 0.00811631977558136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.11631980468519e-05, + "grad_norm": 3.9431064128875732, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8862327337265015, + "num_tokens": 535896866.0, + "step": 14051 + }, + { + "epoch": 1.787558834753848, + "ewc_loss": 0.008058873005211353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.058873208938166e-05, + "grad_norm": 3.9456043243408203, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8757516741752625, + "num_tokens": 535937128.0, + "step": 14052 + }, + { + "epoch": 1.7876860450324386, + "ewc_loss": 0.008070141077041626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070140756899491e-05, + "grad_norm": 3.9283037185668945, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8908060789108276, + "num_tokens": 535972727.0, + "step": 14053 + }, + { + "epoch": 1.787813255311029, + "ewc_loss": 0.008055172860622406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055172656895593e-05, + "grad_norm": 3.9465503692626953, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8787912130355835, + "num_tokens": 536011491.0, + "step": 14054 + }, + { + "epoch": 1.7879404655896196, + "ewc_loss": 0.008060355670750141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060355321504176e-05, + "grad_norm": 3.949302911758423, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8774235248565674, + "num_tokens": 536048074.0, + "step": 14055 + }, + { + "epoch": 1.7880676758682101, + "ewc_loss": 0.008061885833740234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.061885455390438e-05, + "grad_norm": 3.9629878997802734, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8821111917495728, + "num_tokens": 536089554.0, + "step": 14056 + }, + { + "epoch": 1.7881948861468007, + "ewc_loss": 0.008056489750742912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.05648960522376e-05, + "grad_norm": 3.903865098953247, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8848473429679871, + "num_tokens": 536127672.0, + "step": 14057 + }, + { + "epoch": 1.7883220964253912, + "ewc_loss": 0.008017128333449364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01712812972255e-05, + "grad_norm": 4.037303447723389, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8778812289237976, + "num_tokens": 536157517.0, + "step": 14058 + }, + { + "epoch": 1.7884493067039817, + "ewc_loss": 0.008102504536509514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.102504216367379e-05, + "grad_norm": 3.9799857139587402, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8876131772994995, + "num_tokens": 536192855.0, + "step": 14059 + }, + { + "epoch": 1.7885765169825723, + "ewc_loss": 0.00801947433501482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.019474626053125e-05, + "grad_norm": 3.9508659839630127, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8776360750198364, + "num_tokens": 536231727.0, + "step": 14060 + }, + { + "epoch": 1.7887037272611628, + "ewc_loss": 0.00803921464830637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039214299060404e-05, + "grad_norm": 3.9353373050689697, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8887161612510681, + "num_tokens": 536270738.0, + "step": 14061 + }, + { + "epoch": 1.7888309375397533, + "ewc_loss": 0.00802604015916586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026039722608402e-05, + "grad_norm": 3.906400203704834, + "learning_rate": 1e-06, + "loss": 0.3143, + 
"mean_token_accuracy": 0.8941915035247803, + "num_tokens": 536313006.0, + "step": 14062 + }, + { + "epoch": 1.7889581478183438, + "ewc_loss": 0.008010671474039555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.010671444935724e-05, + "grad_norm": 3.9998347759246826, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8704524040222168, + "num_tokens": 536350697.0, + "step": 14063 + }, + { + "epoch": 1.7890853580969344, + "ewc_loss": 0.008062859997153282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.062859706114978e-05, + "grad_norm": 3.892547607421875, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8841851353645325, + "num_tokens": 536394252.0, + "step": 14064 + }, + { + "epoch": 1.789212568375525, + "ewc_loss": 0.007977931760251522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977931818459183e-05, + "grad_norm": 4.078484535217285, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8748636841773987, + "num_tokens": 536426591.0, + "step": 14065 + }, + { + "epoch": 1.7893397786541152, + "ewc_loss": 0.008111394010484219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.111393981380388e-05, + "grad_norm": 3.9569640159606934, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8641225099563599, + "num_tokens": 536468567.0, + "step": 14066 + }, + { + "epoch": 1.7894669889327057, + "ewc_loss": 0.00794548075646162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.945481047499925e-05, + "grad_norm": 3.971667766571045, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8664506673812866, + "num_tokens": 536506352.0, + "step": 14067 + }, + { + "epoch": 1.7895941992112963, + "ewc_loss": 0.0080082593485713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.008259464986622e-05, + "grad_norm": 4.031116962432861, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8695486187934875, + "num_tokens": 536538122.0, + "step": 14068 + }, + { + "epoch": 
1.7897214094898868, + "ewc_loss": 0.008044668473303318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.044668356887996e-05, + "grad_norm": 3.963890314102173, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8652960658073425, + "num_tokens": 536574617.0, + "step": 14069 + }, + { + "epoch": 1.7898486197684773, + "ewc_loss": 0.007982113398611546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.982113311300054e-05, + "grad_norm": 3.958113670349121, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8864718675613403, + "num_tokens": 536611895.0, + "step": 14070 + }, + { + "epoch": 1.7899758300470678, + "ewc_loss": 0.008023896254599094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023896225495264e-05, + "grad_norm": 3.980004072189331, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.868847131729126, + "num_tokens": 536651336.0, + "step": 14071 + }, + { + "epoch": 1.7901030403256584, + "ewc_loss": 0.008035920560359955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035920473048463e-05, + "grad_norm": 3.9616730213165283, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8713808059692383, + "num_tokens": 536692566.0, + "step": 14072 + }, + { + "epoch": 1.7902302506042487, + "ewc_loss": 0.008024219423532486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.024219278013334e-05, + "grad_norm": 3.9631192684173584, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8807033896446228, + "num_tokens": 536730855.0, + "step": 14073 + }, + { + "epoch": 1.7903574608828392, + "ewc_loss": 0.0080271540209651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.027154399314895e-05, + "grad_norm": 3.9795002937316895, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8742420077323914, + "num_tokens": 536766478.0, + "step": 14074 + }, + { + "epoch": 1.7904846711614297, + "ewc_loss": 0.008027782663702965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.027782314456999e-05, + "grad_norm": 3.9958558082580566, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8715930581092834, + "num_tokens": 536799900.0, + "step": 14075 + }, + { + "epoch": 1.7906118814400203, + "ewc_loss": 0.008040251210331917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.040251123020425e-05, + "grad_norm": 3.901015520095825, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8823199272155762, + "num_tokens": 536843597.0, + "step": 14076 + }, + { + "epoch": 1.7907390917186108, + "ewc_loss": 0.00798964686691761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.98964683781378e-05, + "grad_norm": 3.986689329147339, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.886366605758667, + "num_tokens": 536879189.0, + "step": 14077 + }, + { + "epoch": 1.7908663019972013, + "ewc_loss": 0.008072476834058762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.072477066889405e-05, + "grad_norm": 3.9351725578308105, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8803988695144653, + "num_tokens": 536918099.0, + "step": 14078 + }, + { + "epoch": 1.7909935122757918, + "ewc_loss": 0.008018741384148598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018741209525615e-05, + "grad_norm": 4.001840114593506, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8678340911865234, + "num_tokens": 536954410.0, + "step": 14079 + }, + { + "epoch": 1.7911207225543824, + "ewc_loss": 0.00807826779782772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.078268001554534e-05, + "grad_norm": 3.948592185974121, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8968164920806885, + "num_tokens": 536986688.0, + "step": 14080 + }, + { + "epoch": 1.791247932832973, + "ewc_loss": 0.008017944172024727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.017944492166862e-05, + "grad_norm": 3.9649195671081543, + "learning_rate": 1e-06, + "loss": 0.3627, + 
"mean_token_accuracy": 0.8765506148338318, + "num_tokens": 537024181.0, + "step": 14081 + }, + { + "epoch": 1.7913751431115634, + "ewc_loss": 0.008076325990259647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.076326048467308e-05, + "grad_norm": 3.9617531299591064, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8802282810211182, + "num_tokens": 537064174.0, + "step": 14082 + }, + { + "epoch": 1.791502353390154, + "ewc_loss": 0.008058367297053337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.058367529883981e-05, + "grad_norm": 3.9770641326904297, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8684582114219666, + "num_tokens": 537100279.0, + "step": 14083 + }, + { + "epoch": 1.7916295636687445, + "ewc_loss": 0.008066658861935139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066659211181104e-05, + "grad_norm": 3.9877636432647705, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8541613817214966, + "num_tokens": 537135339.0, + "step": 14084 + }, + { + "epoch": 1.791756773947335, + "ewc_loss": 0.008073867298662663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.073867502389476e-05, + "grad_norm": 3.940371513366699, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.870044469833374, + "num_tokens": 537169608.0, + "step": 14085 + }, + { + "epoch": 1.7918839842259255, + "ewc_loss": 0.0080400500446558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.040049578994513e-05, + "grad_norm": 3.9514946937561035, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8857549428939819, + "num_tokens": 537205560.0, + "step": 14086 + }, + { + "epoch": 1.792011194504516, + "ewc_loss": 0.008080781437456608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080781844910234e-05, + "grad_norm": 4.001842021942139, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8662070631980896, + "num_tokens": 537238697.0, + "step": 14087 + }, + { + "epoch": 
1.7921384047831066, + "ewc_loss": 0.008103803731501102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.103803702397272e-05, + "grad_norm": 3.8999807834625244, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8707996606826782, + "num_tokens": 537282365.0, + "step": 14088 + }, + { + "epoch": 1.7922656150616971, + "ewc_loss": 0.008028180338442326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.028180309338495e-05, + "grad_norm": 3.9191665649414062, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8857148885726929, + "num_tokens": 537320805.0, + "step": 14089 + }, + { + "epoch": 1.7923928253402877, + "ewc_loss": 0.008089299313724041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.089299080893397e-05, + "grad_norm": 4.003403663635254, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8798191547393799, + "num_tokens": 537354607.0, + "step": 14090 + }, + { + "epoch": 1.792520035618878, + "ewc_loss": 0.008131212554872036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.131212234729901e-05, + "grad_norm": 3.9832544326782227, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8855206966400146, + "num_tokens": 537386672.0, + "step": 14091 + }, + { + "epoch": 1.7926472458974685, + "ewc_loss": 0.008080258965492249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080259431153536e-05, + "grad_norm": 3.913553237915039, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.878727912902832, + "num_tokens": 537430882.0, + "step": 14092 + }, + { + "epoch": 1.792774456176059, + "ewc_loss": 0.008071835152804852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.071835327427834e-05, + "grad_norm": 3.976044178009033, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8794425129890442, + "num_tokens": 537465114.0, + "step": 14093 + }, + { + "epoch": 1.7929016664546495, + "ewc_loss": 0.008128532208502293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.128532499540597e-05, + "grad_norm": 3.9884612560272217, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8702243566513062, + "num_tokens": 537502820.0, + "step": 14094 + }, + { + "epoch": 1.79302887673324, + "ewc_loss": 0.008090230636298656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.090230403468013e-05, + "grad_norm": 3.9663047790527344, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8846242427825928, + "num_tokens": 537537422.0, + "step": 14095 + }, + { + "epoch": 1.7931560870118306, + "ewc_loss": 0.00807801354676485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.078013343038037e-05, + "grad_norm": 4.106610298156738, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.881922721862793, + "num_tokens": 537566828.0, + "step": 14096 + }, + { + "epoch": 1.793283297290421, + "ewc_loss": 0.008178632706403732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.17863256088458e-05, + "grad_norm": 3.9488766193389893, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8729255199432373, + "num_tokens": 537606102.0, + "step": 14097 + }, + { + "epoch": 1.7934105075690114, + "ewc_loss": 0.008039407432079315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03940711193718e-05, + "grad_norm": 4.012691497802734, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8812201023101807, + "num_tokens": 537638527.0, + "step": 14098 + }, + { + "epoch": 1.793537717847602, + "ewc_loss": 0.008124134503304958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.124134183162823e-05, + "grad_norm": 3.8937227725982666, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8750178813934326, + "num_tokens": 537683375.0, + "step": 14099 + }, + { + "epoch": 1.7936649281261925, + "ewc_loss": 0.008033983409404755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.033983613131568e-05, + "grad_norm": 3.9088685512542725, + "learning_rate": 1e-06, + "loss": 0.3767, + 
"mean_token_accuracy": 0.870718777179718, + "num_tokens": 537731354.0, + "step": 14100 + }, + { + "epoch": 1.793792138404783, + "ewc_loss": 0.008066347800195217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066347800195217e-05, + "grad_norm": 3.9610235691070557, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8719152212142944, + "num_tokens": 537770055.0, + "step": 14101 + }, + { + "epoch": 1.7939193486833735, + "ewc_loss": 0.008090437389910221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.090437768260017e-05, + "grad_norm": 4.031026363372803, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8566200137138367, + "num_tokens": 537807977.0, + "step": 14102 + }, + { + "epoch": 1.794046558961964, + "ewc_loss": 0.008099349215626717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.099349361145869e-05, + "grad_norm": 3.944944143295288, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8697673678398132, + "num_tokens": 537847809.0, + "step": 14103 + }, + { + "epoch": 1.7941737692405546, + "ewc_loss": 0.008046879433095455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.046879520406947e-05, + "grad_norm": 3.9793190956115723, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8570944666862488, + "num_tokens": 537885506.0, + "step": 14104 + }, + { + "epoch": 1.7943009795191451, + "ewc_loss": 0.008092708885669708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.092709322227165e-05, + "grad_norm": 4.042476177215576, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8716753125190735, + "num_tokens": 537918936.0, + "step": 14105 + }, + { + "epoch": 1.7944281897977357, + "ewc_loss": 0.008116485550999641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.116485696518794e-05, + "grad_norm": 3.9793035984039307, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8806240558624268, + "num_tokens": 537954205.0, + "step": 14106 + }, + { + "epoch": 
1.7945554000763262, + "ewc_loss": 0.008047560229897499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.047559822443873e-05, + "grad_norm": 3.9551470279693604, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8776761889457703, + "num_tokens": 537989190.0, + "step": 14107 + }, + { + "epoch": 1.7946826103549167, + "ewc_loss": 0.00806623324751854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066232840064913e-05, + "grad_norm": 3.956242561340332, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8665543794631958, + "num_tokens": 538028852.0, + "step": 14108 + }, + { + "epoch": 1.7948098206335072, + "ewc_loss": 0.008083191700279713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.083191642072052e-05, + "grad_norm": 3.9954094886779785, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.880847692489624, + "num_tokens": 538061245.0, + "step": 14109 + }, + { + "epoch": 1.7949370309120978, + "ewc_loss": 0.008079789578914642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.07978940429166e-05, + "grad_norm": 3.8883440494537354, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8878538608551025, + "num_tokens": 538103238.0, + "step": 14110 + }, + { + "epoch": 1.7950642411906883, + "ewc_loss": 0.008012001402676105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.012001489987597e-05, + "grad_norm": 4.012195110321045, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8626090884208679, + "num_tokens": 538142692.0, + "step": 14111 + }, + { + "epoch": 1.7951914514692788, + "ewc_loss": 0.008137891069054604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.13789083622396e-05, + "grad_norm": 3.967737913131714, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8720178604125977, + "num_tokens": 538183155.0, + "step": 14112 + }, + { + "epoch": 1.7953186617478694, + "ewc_loss": 0.008045870810747147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.045871072681621e-05, + "grad_norm": 3.975088357925415, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8746945261955261, + "num_tokens": 538222063.0, + "step": 14113 + }, + { + "epoch": 1.7954458720264599, + "ewc_loss": 0.008074448443949223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074448851402849e-05, + "grad_norm": 3.9914934635162354, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8791163563728333, + "num_tokens": 538259847.0, + "step": 14114 + }, + { + "epoch": 1.7955730823050502, + "ewc_loss": 0.008078952319920063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.078951941570267e-05, + "grad_norm": 3.9382946491241455, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8825608491897583, + "num_tokens": 538298557.0, + "step": 14115 + }, + { + "epoch": 1.7957002925836407, + "ewc_loss": 0.008042617700994015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.042617992032319e-05, + "grad_norm": 3.9612667560577393, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8723800182342529, + "num_tokens": 538337926.0, + "step": 14116 + }, + { + "epoch": 1.7958275028622313, + "ewc_loss": 0.008067522197961807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067522139754146e-05, + "grad_norm": 3.9346625804901123, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8787654638290405, + "num_tokens": 538382755.0, + "step": 14117 + }, + { + "epoch": 1.7959547131408218, + "ewc_loss": 0.008020196110010147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02019567345269e-05, + "grad_norm": 4.015325546264648, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.859636127948761, + "num_tokens": 538417639.0, + "step": 14118 + }, + { + "epoch": 1.7960819234194123, + "ewc_loss": 0.008088701404631138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.088701724773273e-05, + "grad_norm": 3.9532644748687744, + "learning_rate": 1e-06, + "loss": 0.3806, + 
"mean_token_accuracy": 0.869143545627594, + "num_tokens": 538458444.0, + "step": 14119 + }, + { + "epoch": 1.7962091336980028, + "ewc_loss": 0.00801655650138855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.016556239454076e-05, + "grad_norm": 3.932497024536133, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8762987852096558, + "num_tokens": 538496580.0, + "step": 14120 + }, + { + "epoch": 1.7963363439765934, + "ewc_loss": 0.008029559627175331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.029559830902144e-05, + "grad_norm": 3.9542760848999023, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8839311599731445, + "num_tokens": 538538474.0, + "step": 14121 + }, + { + "epoch": 1.7964635542551837, + "ewc_loss": 0.00802597776055336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025977876968682e-05, + "grad_norm": 3.9136672019958496, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8770555257797241, + "num_tokens": 538576828.0, + "step": 14122 + }, + { + "epoch": 1.7965907645337742, + "ewc_loss": 0.008006708696484566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00670895841904e-05, + "grad_norm": 3.994004726409912, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8829596638679504, + "num_tokens": 538611630.0, + "step": 14123 + }, + { + "epoch": 1.7967179748123647, + "ewc_loss": 0.008065647445619106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.065647125476971e-05, + "grad_norm": 3.9107182025909424, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8508903980255127, + "num_tokens": 538659537.0, + "step": 14124 + }, + { + "epoch": 1.7968451850909553, + "ewc_loss": 0.00798910390585661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.989104051375762e-05, + "grad_norm": 3.930905818939209, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8831012845039368, + "num_tokens": 538700216.0, + "step": 14125 + }, + { + "epoch": 
1.7969723953695458, + "ewc_loss": 0.0080322390422225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.032238838495687e-05, + "grad_norm": 3.981180191040039, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8725053668022156, + "num_tokens": 538737719.0, + "step": 14126 + }, + { + "epoch": 1.7970996056481363, + "ewc_loss": 0.008027685806155205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.027686271816492e-05, + "grad_norm": 3.9691946506500244, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8832257986068726, + "num_tokens": 538774723.0, + "step": 14127 + }, + { + "epoch": 1.7972268159267268, + "ewc_loss": 0.00800632406026125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00632406026125e-05, + "grad_norm": 3.9171571731567383, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8854964971542358, + "num_tokens": 538815288.0, + "step": 14128 + }, + { + "epoch": 1.7973540262053174, + "ewc_loss": 0.007985251024365425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.98525070422329e-05, + "grad_norm": 3.969661235809326, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.874527633190155, + "num_tokens": 538858239.0, + "step": 14129 + }, + { + "epoch": 1.797481236483908, + "ewc_loss": 0.008032224141061306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.032224286580458e-05, + "grad_norm": 3.9717161655426025, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8799276947975159, + "num_tokens": 538896947.0, + "step": 14130 + }, + { + "epoch": 1.7976084467624984, + "ewc_loss": 0.007999696768820286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99969639047049e-05, + "grad_norm": 3.9997997283935547, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8732216954231262, + "num_tokens": 538936511.0, + "step": 14131 + }, + { + "epoch": 1.797735657041089, + "ewc_loss": 0.008022909052670002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.022908878047019e-05, + "grad_norm": 4.000253200531006, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8707542419433594, + "num_tokens": 538969198.0, + "step": 14132 + }, + { + "epoch": 1.7978628673196795, + "ewc_loss": 0.008000990375876427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.000990055734292e-05, + "grad_norm": 3.9946255683898926, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8870159387588501, + "num_tokens": 539002258.0, + "step": 14133 + }, + { + "epoch": 1.79799007759827, + "ewc_loss": 0.007974890060722828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.974890468176454e-05, + "grad_norm": 3.9681732654571533, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8617203831672668, + "num_tokens": 539039993.0, + "step": 14134 + }, + { + "epoch": 1.7981172878768605, + "ewc_loss": 0.007981532253324986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.981532689882442e-05, + "grad_norm": 3.915961980819702, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8712602853775024, + "num_tokens": 539080713.0, + "step": 14135 + }, + { + "epoch": 1.798244498155451, + "ewc_loss": 0.00797894038259983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.978940266184509e-05, + "grad_norm": 3.9165279865264893, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8860913515090942, + "num_tokens": 539121115.0, + "step": 14136 + }, + { + "epoch": 1.7983717084340416, + "ewc_loss": 0.007996490225195885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99649060354568e-05, + "grad_norm": 3.8991150856018066, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8927061557769775, + "num_tokens": 539159889.0, + "step": 14137 + }, + { + "epoch": 1.7984989187126321, + "ewc_loss": 0.00796925462782383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.969254511408508e-05, + "grad_norm": 3.95564341545105, + "learning_rate": 1e-06, + "loss": 0.3616, + 
"mean_token_accuracy": 0.8761181831359863, + "num_tokens": 539201508.0, + "step": 14138 + }, + { + "epoch": 1.7986261289912227, + "ewc_loss": 0.008026354014873505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026354043977335e-05, + "grad_norm": 3.9998955726623535, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8572410345077515, + "num_tokens": 539238640.0, + "step": 14139 + }, + { + "epoch": 1.798753339269813, + "ewc_loss": 0.008039720356464386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039720705710351e-05, + "grad_norm": 3.9991912841796875, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8723804354667664, + "num_tokens": 539274797.0, + "step": 14140 + }, + { + "epoch": 1.7988805495484035, + "ewc_loss": 0.008010261692106724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.010261808522046e-05, + "grad_norm": 4.034951210021973, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8661831617355347, + "num_tokens": 539308410.0, + "step": 14141 + }, + { + "epoch": 1.799007759826994, + "ewc_loss": 0.00804892834275961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.0489284300711e-05, + "grad_norm": 3.971410036087036, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8682430386543274, + "num_tokens": 539345578.0, + "step": 14142 + }, + { + "epoch": 1.7991349701055845, + "ewc_loss": 0.007990273647010326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.990273297764361e-05, + "grad_norm": 4.052947044372559, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8609876036643982, + "num_tokens": 539378946.0, + "step": 14143 + }, + { + "epoch": 1.799262180384175, + "ewc_loss": 0.008074795827269554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074795914581046e-05, + "grad_norm": 3.925992250442505, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8878750205039978, + "num_tokens": 539417663.0, + "step": 14144 + }, + { + "epoch": 
1.7993893906627656, + "ewc_loss": 0.007979068905115128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.97906905063428e-05, + "grad_norm": 3.914278030395508, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8911861181259155, + "num_tokens": 539459597.0, + "step": 14145 + }, + { + "epoch": 1.799516600941356, + "ewc_loss": 0.008027835749089718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.027835428947583e-05, + "grad_norm": 3.938626527786255, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8779832720756531, + "num_tokens": 539499579.0, + "step": 14146 + }, + { + "epoch": 1.7996438112199464, + "ewc_loss": 0.008031208999454975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03120929049328e-05, + "grad_norm": 3.955836534500122, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8851637840270996, + "num_tokens": 539538505.0, + "step": 14147 + }, + { + "epoch": 1.799771021498537, + "ewc_loss": 0.008044769056141376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.044769492698833e-05, + "grad_norm": 4.00934362411499, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8705680966377258, + "num_tokens": 539574824.0, + "step": 14148 + }, + { + "epoch": 1.7998982317771275, + "ewc_loss": 0.00805717334151268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.057173545239493e-05, + "grad_norm": 3.9597327709198, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8625005483627319, + "num_tokens": 539614181.0, + "step": 14149 + }, + { + "epoch": 1.800025442055718, + "ewc_loss": 0.008032071404159069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03207149147056e-05, + "grad_norm": 3.9802451133728027, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8638044595718384, + "num_tokens": 539656713.0, + "step": 14150 + }, + { + "epoch": 1.8001526523343085, + "ewc_loss": 0.00806518830358982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.065188012551516e-05, 
+ "grad_norm": 3.9947099685668945, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8731538653373718, + "num_tokens": 539695436.0, + "step": 14151 + }, + { + "epoch": 1.800279862612899, + "ewc_loss": 0.0080604562535882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060456457315013e-05, + "grad_norm": 3.9404489994049072, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8773282170295715, + "num_tokens": 539736549.0, + "step": 14152 + }, + { + "epoch": 1.8004070728914896, + "ewc_loss": 0.00801857654005289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018576772883534e-05, + "grad_norm": 3.971762180328369, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8812568187713623, + "num_tokens": 539776326.0, + "step": 14153 + }, + { + "epoch": 1.8005342831700801, + "ewc_loss": 0.008038249798119068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.038249507080764e-05, + "grad_norm": 3.900574207305908, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8820305466651917, + "num_tokens": 539819665.0, + "step": 14154 + }, + { + "epoch": 1.8006614934486707, + "ewc_loss": 0.007983807474374771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.983807154232636e-05, + "grad_norm": 3.9971506595611572, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8742668032646179, + "num_tokens": 539858175.0, + "step": 14155 + }, + { + "epoch": 1.8007887037272612, + "ewc_loss": 0.008074022829532623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074023207882419e-05, + "grad_norm": 3.9644269943237305, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8739845156669617, + "num_tokens": 539895152.0, + "step": 14156 + }, + { + "epoch": 1.8009159140058517, + "ewc_loss": 0.008009001612663269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.009001612663269e-05, + "grad_norm": 3.934117317199707, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 
0.880387544631958, + "num_tokens": 539941033.0, + "step": 14157 + }, + { + "epoch": 1.8010431242844422, + "ewc_loss": 0.007983429357409477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.98342953203246e-05, + "grad_norm": 3.979901075363159, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8775627613067627, + "num_tokens": 539977959.0, + "step": 14158 + }, + { + "epoch": 1.8011703345630328, + "ewc_loss": 0.00804579071700573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.045791037147865e-05, + "grad_norm": 4.011907577514648, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.874999463558197, + "num_tokens": 540014857.0, + "step": 14159 + }, + { + "epoch": 1.8012975448416233, + "ewc_loss": 0.008017925545573235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.017925574677065e-05, + "grad_norm": 3.9502928256988525, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8754218816757202, + "num_tokens": 540055775.0, + "step": 14160 + }, + { + "epoch": 1.8014247551202138, + "ewc_loss": 0.007979579269886017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.979579095263034e-05, + "grad_norm": 4.016768455505371, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8856689929962158, + "num_tokens": 540090287.0, + "step": 14161 + }, + { + "epoch": 1.8015519653988044, + "ewc_loss": 0.008043388836085796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.043389243539423e-05, + "grad_norm": 3.9626967906951904, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8821130990982056, + "num_tokens": 540129139.0, + "step": 14162 + }, + { + "epoch": 1.8016791756773949, + "ewc_loss": 0.007984351366758347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.984351395862177e-05, + "grad_norm": 4.042961120605469, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8768709301948547, + "num_tokens": 540165187.0, + "step": 14163 + }, + { + "epoch": 1.8018063859559852, + 
"ewc_loss": 0.008033616468310356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.033616904867813e-05, + "grad_norm": 3.9310364723205566, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8887319564819336, + "num_tokens": 540201113.0, + "step": 14164 + }, + { + "epoch": 1.8019335962345757, + "ewc_loss": 0.007950475439429283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9504752648063e-05, + "grad_norm": 3.968832015991211, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8774989247322083, + "num_tokens": 540237448.0, + "step": 14165 + }, + { + "epoch": 1.8020608065131662, + "ewc_loss": 0.008010097779333591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.010098099475726e-05, + "grad_norm": 3.9445371627807617, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8692103028297424, + "num_tokens": 540279703.0, + "step": 14166 + }, + { + "epoch": 1.8021880167917568, + "ewc_loss": 0.007978641428053379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.978641224326566e-05, + "grad_norm": 4.054549217224121, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8593915700912476, + "num_tokens": 540313071.0, + "step": 14167 + }, + { + "epoch": 1.8023152270703473, + "ewc_loss": 0.008050029166042805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050029282458127e-05, + "grad_norm": 3.927700996398926, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8795977830886841, + "num_tokens": 540353940.0, + "step": 14168 + }, + { + "epoch": 1.8024424373489378, + "ewc_loss": 0.007941843010485172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.941843068692833e-05, + "grad_norm": 3.9683566093444824, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8729956746101379, + "num_tokens": 540391241.0, + "step": 14169 + }, + { + "epoch": 1.8025696476275284, + "ewc_loss": 0.008024739101529121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.024739508982748e-05, + 
"grad_norm": 4.074517250061035, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8868151903152466, + "num_tokens": 540420205.0, + "step": 14170 + }, + { + "epoch": 1.8026968579061187, + "ewc_loss": 0.008062700740993023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.062700362643227e-05, + "grad_norm": 3.9884467124938965, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8798656463623047, + "num_tokens": 540452121.0, + "step": 14171 + }, + { + "epoch": 1.8028240681847092, + "ewc_loss": 0.00799267552793026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992675818968564e-05, + "grad_norm": 3.987339735031128, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.876599907875061, + "num_tokens": 540487415.0, + "step": 14172 + }, + { + "epoch": 1.8029512784632997, + "ewc_loss": 0.008055424317717552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055424405029044e-05, + "grad_norm": 4.018622875213623, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8646105527877808, + "num_tokens": 540522293.0, + "step": 14173 + }, + { + "epoch": 1.8030784887418903, + "ewc_loss": 0.008070118725299835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070118929026648e-05, + "grad_norm": 3.9457292556762695, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8884789943695068, + "num_tokens": 540560345.0, + "step": 14174 + }, + { + "epoch": 1.8032056990204808, + "ewc_loss": 0.008024646900594234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.024647104321048e-05, + "grad_norm": 3.9648916721343994, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8845618367195129, + "num_tokens": 540598299.0, + "step": 14175 + }, + { + "epoch": 1.8033329092990713, + "ewc_loss": 0.008071210235357285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.071210322668776e-05, + "grad_norm": 3.954956531524658, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 
0.8871959447860718, + "num_tokens": 540634480.0, + "step": 14176 + }, + { + "epoch": 1.8034601195776618, + "ewc_loss": 0.00805200356990099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.052003977354616e-05, + "grad_norm": 3.947305679321289, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8667985796928406, + "num_tokens": 540679473.0, + "step": 14177 + }, + { + "epoch": 1.8035873298562524, + "ewc_loss": 0.008036622777581215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.036622602958232e-05, + "grad_norm": 3.9270055294036865, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.869312047958374, + "num_tokens": 540722259.0, + "step": 14178 + }, + { + "epoch": 1.803714540134843, + "ewc_loss": 0.008043459616601467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.04345982032828e-05, + "grad_norm": 3.9698894023895264, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8788742423057556, + "num_tokens": 540758058.0, + "step": 14179 + }, + { + "epoch": 1.8038417504134334, + "ewc_loss": 0.008068887516856194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068887836998329e-05, + "grad_norm": 3.990739583969116, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8714169263839722, + "num_tokens": 540798535.0, + "step": 14180 + }, + { + "epoch": 1.803968960692024, + "ewc_loss": 0.008072219789028168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.072219497989863e-05, + "grad_norm": 3.9485044479370117, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8759855628013611, + "num_tokens": 540839423.0, + "step": 14181 + }, + { + "epoch": 1.8040961709706145, + "ewc_loss": 0.008021451532840729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.021451503736898e-05, + "grad_norm": 3.926790237426758, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8818125128746033, + "num_tokens": 540887512.0, + "step": 14182 + }, + { + "epoch": 1.804223381249205, + "ewc_loss": 
0.008021842688322067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02184222266078e-05, + "grad_norm": 3.9818265438079834, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8789057731628418, + "num_tokens": 540922292.0, + "step": 14183 + }, + { + "epoch": 1.8043505915277955, + "ewc_loss": 0.00804046355187893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.04046358098276e-05, + "grad_norm": 3.910815954208374, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.88219153881073, + "num_tokens": 540964105.0, + "step": 14184 + }, + { + "epoch": 1.804477801806386, + "ewc_loss": 0.007991720922291279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.991720485733822e-05, + "grad_norm": 3.9878499507904053, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8748619556427002, + "num_tokens": 541001793.0, + "step": 14185 + }, + { + "epoch": 1.8046050120849766, + "ewc_loss": 0.00805430207401514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.054302452364936e-05, + "grad_norm": 3.975701093673706, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8824918270111084, + "num_tokens": 541041104.0, + "step": 14186 + }, + { + "epoch": 1.8047322223635671, + "ewc_loss": 0.00799430999904871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99430999904871e-05, + "grad_norm": 3.9863953590393066, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.870111346244812, + "num_tokens": 541079643.0, + "step": 14187 + }, + { + "epoch": 1.8048594326421576, + "ewc_loss": 0.008011055178940296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.011055615497753e-05, + "grad_norm": 3.9512548446655273, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.879734992980957, + "num_tokens": 541118554.0, + "step": 14188 + }, + { + "epoch": 1.804986642920748, + "ewc_loss": 0.007977055385708809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977055065566674e-05, + "grad_norm": 
3.938786268234253, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8650721311569214, + "num_tokens": 541156893.0, + "step": 14189 + }, + { + "epoch": 1.8051138531993385, + "ewc_loss": 0.007994487881660461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.994488260010257e-05, + "grad_norm": 3.9513871669769287, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8908461332321167, + "num_tokens": 541197283.0, + "step": 14190 + }, + { + "epoch": 1.805241063477929, + "ewc_loss": 0.00799473561346531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99473564256914e-05, + "grad_norm": 3.978470802307129, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8890522122383118, + "num_tokens": 541232343.0, + "step": 14191 + }, + { + "epoch": 1.8053682737565195, + "ewc_loss": 0.00801694393157959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.016944047994912e-05, + "grad_norm": 4.015182971954346, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8682898283004761, + "num_tokens": 541270274.0, + "step": 14192 + }, + { + "epoch": 1.80549548403511, + "ewc_loss": 0.0080189760774374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018976222956553e-05, + "grad_norm": 3.9952900409698486, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8714141249656677, + "num_tokens": 541306018.0, + "step": 14193 + }, + { + "epoch": 1.8056226943137006, + "ewc_loss": 0.007990939542651176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99093977548182e-05, + "grad_norm": 3.9819321632385254, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8765705823898315, + "num_tokens": 541344974.0, + "step": 14194 + }, + { + "epoch": 1.805749904592291, + "ewc_loss": 0.008003807626664639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003808034118265e-05, + "grad_norm": 3.9836549758911133, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.885924220085144, + 
"num_tokens": 541378590.0, + "step": 14195 + }, + { + "epoch": 1.8058771148708814, + "ewc_loss": 0.008000551722943783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.000552043085918e-05, + "grad_norm": 3.981900691986084, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.876885712146759, + "num_tokens": 541415945.0, + "step": 14196 + }, + { + "epoch": 1.806004325149472, + "ewc_loss": 0.008004439994692802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.004439587239176e-05, + "grad_norm": 3.9318413734436035, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8728699684143066, + "num_tokens": 541459577.0, + "step": 14197 + }, + { + "epoch": 1.8061315354280625, + "ewc_loss": 0.007990885525941849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.990885205799714e-05, + "grad_norm": 3.9506397247314453, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8763953447341919, + "num_tokens": 541501041.0, + "step": 14198 + }, + { + "epoch": 1.806258745706653, + "ewc_loss": 0.008006046526134014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.006046118680388e-05, + "grad_norm": 3.973708152770996, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8695429563522339, + "num_tokens": 541538587.0, + "step": 14199 + }, + { + "epoch": 1.8063859559852435, + "ewc_loss": 0.008016861043870449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01686110207811e-05, + "grad_norm": 3.970299243927002, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.861886203289032, + "num_tokens": 541582225.0, + "step": 14200 + }, + { + "epoch": 1.806513166263834, + "ewc_loss": 0.008012805134057999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.012805483303964e-05, + "grad_norm": 3.9422643184661865, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.886092483997345, + "num_tokens": 541616956.0, + "step": 14201 + }, + { + "epoch": 1.8066403765424246, + "ewc_loss": 0.00800293404608965, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.002934191608801e-05, + "grad_norm": 4.028169631958008, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8659452199935913, + "num_tokens": 541650814.0, + "step": 14202 + }, + { + "epoch": 1.8067675868210151, + "ewc_loss": 0.008068718016147614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068717579590157e-05, + "grad_norm": 3.947232246398926, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.884972870349884, + "num_tokens": 541692167.0, + "step": 14203 + }, + { + "epoch": 1.8068947970996057, + "ewc_loss": 0.007990357466042042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.990357698872685e-05, + "grad_norm": 3.9521501064300537, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8672103881835938, + "num_tokens": 541732427.0, + "step": 14204 + }, + { + "epoch": 1.8070220073781962, + "ewc_loss": 0.008017093874514103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.017093932721764e-05, + "grad_norm": 3.9333860874176025, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8776549100875854, + "num_tokens": 541773256.0, + "step": 14205 + }, + { + "epoch": 1.8071492176567867, + "ewc_loss": 0.008019472472369671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01947244326584e-05, + "grad_norm": 3.953504800796509, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8814113140106201, + "num_tokens": 541815008.0, + "step": 14206 + }, + { + "epoch": 1.8072764279353772, + "ewc_loss": 0.008013347163796425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01334681455046e-05, + "grad_norm": 3.9833948612213135, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8770339488983154, + "num_tokens": 541853462.0, + "step": 14207 + }, + { + "epoch": 1.8074036382139678, + "ewc_loss": 0.00802390743046999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023907139431685e-05, + "grad_norm": 3.997368335723877, + 
"learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8709043860435486, + "num_tokens": 541890487.0, + "step": 14208 + }, + { + "epoch": 1.8075308484925583, + "ewc_loss": 0.008029755204916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.029755554161966e-05, + "grad_norm": 3.938023567199707, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8803258538246155, + "num_tokens": 541929857.0, + "step": 14209 + }, + { + "epoch": 1.8076580587711488, + "ewc_loss": 0.00799102894961834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.991028542164713e-05, + "grad_norm": 3.976109743118286, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8683057427406311, + "num_tokens": 541969923.0, + "step": 14210 + }, + { + "epoch": 1.8077852690497394, + "ewc_loss": 0.00801362656056881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.013626211322844e-05, + "grad_norm": 3.918656349182129, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8839168548583984, + "num_tokens": 542008802.0, + "step": 14211 + }, + { + "epoch": 1.8079124793283299, + "ewc_loss": 0.007976396009325981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976395863806829e-05, + "grad_norm": 3.936049222946167, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8890299201011658, + "num_tokens": 542049276.0, + "step": 14212 + }, + { + "epoch": 1.8080396896069202, + "ewc_loss": 0.008007436990737915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007436554180458e-05, + "grad_norm": 4.001684665679932, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8697676658630371, + "num_tokens": 542086921.0, + "step": 14213 + }, + { + "epoch": 1.8081668998855107, + "ewc_loss": 0.008026426658034325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026426803553477e-05, + "grad_norm": 3.971832752227783, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8760732412338257, + "num_tokens": 542124291.0, + 
"step": 14214 + }, + { + "epoch": 1.8082941101641012, + "ewc_loss": 0.00799187645316124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.991876191226766e-05, + "grad_norm": 3.953920602798462, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8651190400123596, + "num_tokens": 542167600.0, + "step": 14215 + }, + { + "epoch": 1.8084213204426918, + "ewc_loss": 0.00799856148660183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.998561341082677e-05, + "grad_norm": 4.081775665283203, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8706477284431458, + "num_tokens": 542201134.0, + "step": 14216 + }, + { + "epoch": 1.8085485307212823, + "ewc_loss": 0.008060981519520283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060981781454757e-05, + "grad_norm": 3.953282117843628, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8632901310920715, + "num_tokens": 542242590.0, + "step": 14217 + }, + { + "epoch": 1.8086757409998728, + "ewc_loss": 0.007956554181873798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.95655432739295e-05, + "grad_norm": 3.972050428390503, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8765740394592285, + "num_tokens": 542281945.0, + "step": 14218 + }, + { + "epoch": 1.8088029512784631, + "ewc_loss": 0.008025221526622772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025221904972568e-05, + "grad_norm": 3.9640231132507324, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8834003210067749, + "num_tokens": 542319389.0, + "step": 14219 + }, + { + "epoch": 1.8089301615570537, + "ewc_loss": 0.008006601594388485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.006602001842111e-05, + "grad_norm": 4.011995315551758, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8650308847427368, + "num_tokens": 542356912.0, + "step": 14220 + }, + { + "epoch": 1.8090573718356442, + "ewc_loss": 0.008042738772928715, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.042738772928715e-05, + "grad_norm": 3.998082160949707, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8700349926948547, + "num_tokens": 542394054.0, + "step": 14221 + }, + { + "epoch": 1.8091845821142347, + "ewc_loss": 0.008017096668481827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01709684310481e-05, + "grad_norm": 4.0000505447387695, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8793779015541077, + "num_tokens": 542431063.0, + "step": 14222 + }, + { + "epoch": 1.8093117923928252, + "ewc_loss": 0.008023756556212902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023756527109072e-05, + "grad_norm": 3.9964256286621094, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8650429248809814, + "num_tokens": 542471398.0, + "step": 14223 + }, + { + "epoch": 1.8094390026714158, + "ewc_loss": 0.008017548359930515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.017547952476889e-05, + "grad_norm": 3.9374406337738037, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8607780933380127, + "num_tokens": 542516288.0, + "step": 14224 + }, + { + "epoch": 1.8095662129500063, + "ewc_loss": 0.00800313614308834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003136463230476e-05, + "grad_norm": 4.041336536407471, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8741132616996765, + "num_tokens": 542552292.0, + "step": 14225 + }, + { + "epoch": 1.8096934232285968, + "ewc_loss": 0.008083158172667027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.083158172667027e-05, + "grad_norm": 3.9245195388793945, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8781294822692871, + "num_tokens": 542592380.0, + "step": 14226 + }, + { + "epoch": 1.8098206335071874, + "ewc_loss": 0.007987572811543941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.987572462297976e-05, + "grad_norm": 3.9871087074279785, + "learning_rate": 1e-06, + 
"loss": 0.3798, + "mean_token_accuracy": 0.8694903254508972, + "num_tokens": 542632698.0, + "step": 14227 + }, + { + "epoch": 1.8099478437857779, + "ewc_loss": 0.008053836412727833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.053836791077629e-05, + "grad_norm": 4.015416145324707, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8655754327774048, + "num_tokens": 542668234.0, + "step": 14228 + }, + { + "epoch": 1.8100750540643684, + "ewc_loss": 0.008047576993703842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.047577284742147e-05, + "grad_norm": 3.9992153644561768, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8757385611534119, + "num_tokens": 542706205.0, + "step": 14229 + }, + { + "epoch": 1.810202264342959, + "ewc_loss": 0.008019786328077316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.019786764634773e-05, + "grad_norm": 4.105715274810791, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8739330768585205, + "num_tokens": 542737068.0, + "step": 14230 + }, + { + "epoch": 1.8103294746215495, + "ewc_loss": 0.008082419633865356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.082419662969187e-05, + "grad_norm": 3.9657304286956787, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8861393928527832, + "num_tokens": 542769776.0, + "step": 14231 + }, + { + "epoch": 1.81045668490014, + "ewc_loss": 0.007976589724421501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976589404279366e-05, + "grad_norm": 3.979128122329712, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.887200117111206, + "num_tokens": 542802440.0, + "step": 14232 + }, + { + "epoch": 1.8105838951787305, + "ewc_loss": 0.008055908605456352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055908256210387e-05, + "grad_norm": 3.9528284072875977, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8879653215408325, + "num_tokens": 542841158.0, + "step": 14233 + }, + { + 
"epoch": 1.810711105457321, + "ewc_loss": 0.008047575131058693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.047575101954862e-05, + "grad_norm": 4.007528781890869, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8724414706230164, + "num_tokens": 542876637.0, + "step": 14234 + }, + { + "epoch": 1.8108383157359116, + "ewc_loss": 0.008068977855145931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068978058872744e-05, + "grad_norm": 3.962430238723755, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8827909231185913, + "num_tokens": 542915087.0, + "step": 14235 + }, + { + "epoch": 1.8109655260145021, + "ewc_loss": 0.008019533939659595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.019534288905561e-05, + "grad_norm": 3.9813075065612793, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.868000864982605, + "num_tokens": 542950635.0, + "step": 14236 + }, + { + "epoch": 1.8110927362930926, + "ewc_loss": 0.008069639094173908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069638715824112e-05, + "grad_norm": 3.989718198776245, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8712925910949707, + "num_tokens": 542990078.0, + "step": 14237 + }, + { + "epoch": 1.811219946571683, + "ewc_loss": 0.008066116832196712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066116424743086e-05, + "grad_norm": 3.997164011001587, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8534743189811707, + "num_tokens": 543030333.0, + "step": 14238 + }, + { + "epoch": 1.8113471568502735, + "ewc_loss": 0.008085733279585838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.085733134066686e-05, + "grad_norm": 3.9399242401123047, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8699771761894226, + "num_tokens": 543070537.0, + "step": 14239 + }, + { + "epoch": 1.811474367128864, + "ewc_loss": 0.008045244961977005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.045245340326801e-05, + "grad_norm": 3.9596848487854004, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8739272952079773, + "num_tokens": 543109137.0, + "step": 14240 + }, + { + "epoch": 1.8116015774074545, + "ewc_loss": 0.008072853088378906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.072853233898059e-05, + "grad_norm": 3.966365337371826, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8697448968887329, + "num_tokens": 543146616.0, + "step": 14241 + }, + { + "epoch": 1.811728787686045, + "ewc_loss": 0.008068627677857876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068627357715741e-05, + "grad_norm": 4.009344100952148, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.887342095375061, + "num_tokens": 543178433.0, + "step": 14242 + }, + { + "epoch": 1.8118559979646356, + "ewc_loss": 0.008092544041574001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.092544157989323e-05, + "grad_norm": 3.950784206390381, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.886644184589386, + "num_tokens": 543215086.0, + "step": 14243 + }, + { + "epoch": 1.811983208243226, + "ewc_loss": 0.00803083460777998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.030834578676149e-05, + "grad_norm": 4.001606464385986, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8605049848556519, + "num_tokens": 543252111.0, + "step": 14244 + }, + { + "epoch": 1.8121104185218164, + "ewc_loss": 0.008105147629976273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.105147571768612e-05, + "grad_norm": 3.9269800186157227, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8832769393920898, + "num_tokens": 543291938.0, + "step": 14245 + }, + { + "epoch": 1.812237628800407, + "ewc_loss": 0.008036211133003235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03621078375727e-05, + "grad_norm": 3.9679627418518066, + "learning_rate": 1e-06, + "loss": 0.3538, + 
"mean_token_accuracy": 0.8791658878326416, + "num_tokens": 543335141.0, + "step": 14246 + }, + { + "epoch": 1.8123648390789975, + "ewc_loss": 0.008094148710370064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09414850664325e-05, + "grad_norm": 3.999361038208008, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8754706382751465, + "num_tokens": 543371305.0, + "step": 14247 + }, + { + "epoch": 1.812492049357588, + "ewc_loss": 0.008079678751528263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.079678809735924e-05, + "grad_norm": 3.9540605545043945, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8872070908546448, + "num_tokens": 543407431.0, + "step": 14248 + }, + { + "epoch": 1.8126192596361785, + "ewc_loss": 0.00806074496358633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060744585236534e-05, + "grad_norm": 3.95436692237854, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8754726648330688, + "num_tokens": 543450287.0, + "step": 14249 + }, + { + "epoch": 1.812746469914769, + "ewc_loss": 0.008061762899160385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.06176321930252e-05, + "grad_norm": 3.9488582611083984, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8880549669265747, + "num_tokens": 543486521.0, + "step": 14250 + }, + { + "epoch": 1.8128736801933596, + "ewc_loss": 0.008054157719016075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.054157660808414e-05, + "grad_norm": 4.0054216384887695, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8757814168930054, + "num_tokens": 543522157.0, + "step": 14251 + }, + { + "epoch": 1.8130008904719501, + "ewc_loss": 0.008092237636446953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.092237840173766e-05, + "grad_norm": 3.9733755588531494, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.878366231918335, + "num_tokens": 543560503.0, + "step": 14252 + }, + { + "epoch": 
1.8131281007505406, + "ewc_loss": 0.008037501946091652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.037502266233787e-05, + "grad_norm": 4.004803657531738, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8644654750823975, + "num_tokens": 543598943.0, + "step": 14253 + }, + { + "epoch": 1.8132553110291312, + "ewc_loss": 0.008065746165812016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.065746078500524e-05, + "grad_norm": 3.929170608520508, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8783456087112427, + "num_tokens": 543640466.0, + "step": 14254 + }, + { + "epoch": 1.8133825213077217, + "ewc_loss": 0.008009403012692928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.009403245523572e-05, + "grad_norm": 3.9728121757507324, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8778654336929321, + "num_tokens": 543677581.0, + "step": 14255 + }, + { + "epoch": 1.8135097315863122, + "ewc_loss": 0.008054358884692192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.054359204834327e-05, + "grad_norm": 4.064234733581543, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8693896532058716, + "num_tokens": 543711220.0, + "step": 14256 + }, + { + "epoch": 1.8136369418649028, + "ewc_loss": 0.008086259476840496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.086259185802191e-05, + "grad_norm": 3.970914840698242, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8687148094177246, + "num_tokens": 543750109.0, + "step": 14257 + }, + { + "epoch": 1.8137641521434933, + "ewc_loss": 0.007993226870894432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99322733655572e-05, + "grad_norm": 3.975423812866211, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8801318407058716, + "num_tokens": 543788040.0, + "step": 14258 + }, + { + "epoch": 1.8138913624220838, + "ewc_loss": 0.008051395416259766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.05139570729807e-05, + "grad_norm": 4.037334442138672, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8751500844955444, + "num_tokens": 543821037.0, + "step": 14259 + }, + { + "epoch": 1.8140185727006743, + "ewc_loss": 0.008076136000454426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.076136145973578e-05, + "grad_norm": 3.9389822483062744, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8883146047592163, + "num_tokens": 543856331.0, + "step": 14260 + }, + { + "epoch": 1.8141457829792649, + "ewc_loss": 0.008006804622709751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.006805001059547e-05, + "grad_norm": 3.9859588146209717, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8793729543685913, + "num_tokens": 543892898.0, + "step": 14261 + }, + { + "epoch": 1.8142729932578552, + "ewc_loss": 0.008067192509770393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067192538874224e-05, + "grad_norm": 3.9510459899902344, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8709282875061035, + "num_tokens": 543932858.0, + "step": 14262 + }, + { + "epoch": 1.8144002035364457, + "ewc_loss": 0.008048268966376781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.048269228311256e-05, + "grad_norm": 4.004522323608398, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8852719068527222, + "num_tokens": 543968145.0, + "step": 14263 + }, + { + "epoch": 1.8145274138150362, + "ewc_loss": 0.008074811659753323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074811921687797e-05, + "grad_norm": 4.011245250701904, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8734518885612488, + "num_tokens": 544000495.0, + "step": 14264 + }, + { + "epoch": 1.8146546240936268, + "ewc_loss": 0.00806482695043087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.064827125053853e-05, + "grad_norm": 3.932353973388672, + "learning_rate": 1e-06, + "loss": 0.3779, + 
"mean_token_accuracy": 0.8707574009895325, + "num_tokens": 544041345.0, + "step": 14265 + }, + { + "epoch": 1.8147818343722173, + "ewc_loss": 0.008021892048418522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.021892426768318e-05, + "grad_norm": 4.011193752288818, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.880047619342804, + "num_tokens": 544080084.0, + "step": 14266 + }, + { + "epoch": 1.8149090446508078, + "ewc_loss": 0.008093580603599548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093580981949344e-05, + "grad_norm": 3.997720241546631, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8680115938186646, + "num_tokens": 544115828.0, + "step": 14267 + }, + { + "epoch": 1.8150362549293981, + "ewc_loss": 0.008044810965657234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.044810965657234e-05, + "grad_norm": 3.962648391723633, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8718279600143433, + "num_tokens": 544153213.0, + "step": 14268 + }, + { + "epoch": 1.8151634652079887, + "ewc_loss": 0.008047385141253471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.047385199461132e-05, + "grad_norm": 4.004763603210449, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8788443803787231, + "num_tokens": 544192383.0, + "step": 14269 + }, + { + "epoch": 1.8152906754865792, + "ewc_loss": 0.008089662529528141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.089662151178345e-05, + "grad_norm": 3.969017744064331, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.876814603805542, + "num_tokens": 544232034.0, + "step": 14270 + }, + { + "epoch": 1.8154178857651697, + "ewc_loss": 0.008059648796916008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059648826019838e-05, + "grad_norm": 4.009252548217773, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.859940230846405, + "num_tokens": 544266242.0, + "step": 14271 + }, + { + "epoch": 
1.8155450960437602, + "ewc_loss": 0.008077399805188179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.077399979811162e-05, + "grad_norm": 3.9966952800750732, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8644402027130127, + "num_tokens": 544304995.0, + "step": 14272 + }, + { + "epoch": 1.8156723063223508, + "ewc_loss": 0.008066239766776562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066239388426766e-05, + "grad_norm": 3.9850449562072754, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8741134405136108, + "num_tokens": 544341950.0, + "step": 14273 + }, + { + "epoch": 1.8157995166009413, + "ewc_loss": 0.008059057407081127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059057290665805e-05, + "grad_norm": 3.958089590072632, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8715029954910278, + "num_tokens": 544379535.0, + "step": 14274 + }, + { + "epoch": 1.8159267268795318, + "ewc_loss": 0.008058903738856316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.058903767960146e-05, + "grad_norm": 3.996526002883911, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8748779296875, + "num_tokens": 544416939.0, + "step": 14275 + }, + { + "epoch": 1.8160539371581224, + "ewc_loss": 0.008094372227787971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.094372606137767e-05, + "grad_norm": 3.8861451148986816, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8859465718269348, + "num_tokens": 544461476.0, + "step": 14276 + }, + { + "epoch": 1.8161811474367129, + "ewc_loss": 0.00801989808678627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01989808678627e-05, + "grad_norm": 3.942868709564209, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8759016394615173, + "num_tokens": 544505051.0, + "step": 14277 + }, + { + "epoch": 1.8163083577153034, + "ewc_loss": 0.008092246949672699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.092246571322903e-05, + "grad_norm": 3.914729595184326, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.877081573009491, + "num_tokens": 544550408.0, + "step": 14278 + }, + { + "epoch": 1.816435567993894, + "ewc_loss": 0.008026372641324997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026372961467132e-05, + "grad_norm": 4.015700817108154, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8739481568336487, + "num_tokens": 544586645.0, + "step": 14279 + }, + { + "epoch": 1.8165627782724845, + "ewc_loss": 0.008093366399407387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093366341199726e-05, + "grad_norm": 3.9600653648376465, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.869604229927063, + "num_tokens": 544628087.0, + "step": 14280 + }, + { + "epoch": 1.816689988551075, + "ewc_loss": 0.00800369493663311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003695256775245e-05, + "grad_norm": 3.9300270080566406, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.89125657081604, + "num_tokens": 544667961.0, + "step": 14281 + }, + { + "epoch": 1.8168171988296655, + "ewc_loss": 0.008028126321732998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02812646725215e-05, + "grad_norm": 4.009698867797852, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8717637062072754, + "num_tokens": 544704919.0, + "step": 14282 + }, + { + "epoch": 1.816944409108256, + "ewc_loss": 0.008053554221987724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.053554483922198e-05, + "grad_norm": 3.9626572132110596, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8809678554534912, + "num_tokens": 544741421.0, + "step": 14283 + }, + { + "epoch": 1.8170716193868466, + "ewc_loss": 0.007997049018740654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997048669494689e-05, + "grad_norm": 3.951944589614868, + "learning_rate": 1e-06, + "loss": 0.3521, + 
"mean_token_accuracy": 0.8789952397346497, + "num_tokens": 544776414.0, + "step": 14284 + }, + { + "epoch": 1.817198829665437, + "ewc_loss": 0.00800877995789051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.008779695956036e-05, + "grad_norm": 3.967668056488037, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8700506091117859, + "num_tokens": 544814964.0, + "step": 14285 + }, + { + "epoch": 1.8173260399440276, + "ewc_loss": 0.008028010837733746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.028010779526085e-05, + "grad_norm": 3.9767239093780518, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8621348142623901, + "num_tokens": 544853381.0, + "step": 14286 + }, + { + "epoch": 1.817453250222618, + "ewc_loss": 0.008012702688574791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.012702892301604e-05, + "grad_norm": 3.9425554275512695, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8787876963615417, + "num_tokens": 544892474.0, + "step": 14287 + }, + { + "epoch": 1.8175804605012085, + "ewc_loss": 0.008022453635931015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.022453403100371e-05, + "grad_norm": 3.998957872390747, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8838860988616943, + "num_tokens": 544927667.0, + "step": 14288 + }, + { + "epoch": 1.817707670779799, + "ewc_loss": 0.00805191695690155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.051917393459007e-05, + "grad_norm": 4.047377109527588, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8814375400543213, + "num_tokens": 544959780.0, + "step": 14289 + }, + { + "epoch": 1.8178348810583895, + "ewc_loss": 0.008050670847296715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050671021919698e-05, + "grad_norm": 3.971186399459839, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8697729110717773, + "num_tokens": 545001832.0, + "step": 14290 + }, + { + "epoch": 
1.81796209133698, + "ewc_loss": 0.008002758026123047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.002758113434538e-05, + "grad_norm": 3.975719690322876, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8836899399757385, + "num_tokens": 545038489.0, + "step": 14291 + }, + { + "epoch": 1.8180893016155706, + "ewc_loss": 0.008040976710617542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.04097653599456e-05, + "grad_norm": 4.093629837036133, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8580182194709778, + "num_tokens": 545069086.0, + "step": 14292 + }, + { + "epoch": 1.818216511894161, + "ewc_loss": 0.008099363185465336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.099363185465336e-05, + "grad_norm": 3.989558219909668, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8788552284240723, + "num_tokens": 545104564.0, + "step": 14293 + }, + { + "epoch": 1.8183437221727514, + "ewc_loss": 0.008003813214600086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003813127288595e-05, + "grad_norm": 3.9494385719299316, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8792061805725098, + "num_tokens": 545149036.0, + "step": 14294 + }, + { + "epoch": 1.818470932451342, + "ewc_loss": 0.00801760982722044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01760979811661e-05, + "grad_norm": 3.953244686126709, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8863667249679565, + "num_tokens": 545183825.0, + "step": 14295 + }, + { + "epoch": 1.8185981427299325, + "ewc_loss": 0.008048340678215027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.048340532695875e-05, + "grad_norm": 3.9831817150115967, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8805320858955383, + "num_tokens": 545221274.0, + "step": 14296 + }, + { + "epoch": 1.818725353008523, + "ewc_loss": 0.008031297475099564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.031297329580411e-05, + "grad_norm": 3.948455810546875, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8831307888031006, + "num_tokens": 545259242.0, + "step": 14297 + }, + { + "epoch": 1.8188525632871135, + "ewc_loss": 0.00802534818649292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025348506635055e-05, + "grad_norm": 4.113712787628174, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8626324534416199, + "num_tokens": 545289380.0, + "step": 14298 + }, + { + "epoch": 1.818979773565704, + "ewc_loss": 0.00813353806734085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.133538358379155e-05, + "grad_norm": 4.01074743270874, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8799077868461609, + "num_tokens": 545321935.0, + "step": 14299 + }, + { + "epoch": 1.8191069838442946, + "ewc_loss": 0.008009855635464191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.009855810087174e-05, + "grad_norm": 3.8867757320404053, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8818888664245605, + "num_tokens": 545365714.0, + "step": 14300 + }, + { + "epoch": 1.8192341941228851, + "ewc_loss": 0.008001442067325115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.001441892702132e-05, + "grad_norm": 3.9668710231781006, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8658484816551208, + "num_tokens": 545406838.0, + "step": 14301 + }, + { + "epoch": 1.8193614044014756, + "ewc_loss": 0.008092348463833332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.092348434729502e-05, + "grad_norm": 3.990046739578247, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8725014925003052, + "num_tokens": 545443265.0, + "step": 14302 + }, + { + "epoch": 1.8194886146800662, + "ewc_loss": 0.008069561794400215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069561590673402e-05, + "grad_norm": 3.9853017330169678, + "learning_rate": 1e-06, + "loss": 0.3332, + 
"mean_token_accuracy": 0.8868786096572876, + "num_tokens": 545479903.0, + "step": 14303 + }, + { + "epoch": 1.8196158249586567, + "ewc_loss": 0.008038648404181004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.038648229558021e-05, + "grad_norm": 3.96578311920166, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8826122283935547, + "num_tokens": 545513767.0, + "step": 14304 + }, + { + "epoch": 1.8197430352372472, + "ewc_loss": 0.00806993804872036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069937757682055e-05, + "grad_norm": 4.0351786613464355, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8611375093460083, + "num_tokens": 545550541.0, + "step": 14305 + }, + { + "epoch": 1.8198702455158378, + "ewc_loss": 0.008094754070043564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09475386631675e-05, + "grad_norm": 3.9679062366485596, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.876517653465271, + "num_tokens": 545590739.0, + "step": 14306 + }, + { + "epoch": 1.8199974557944283, + "ewc_loss": 0.008036098442971706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03609800641425e-05, + "grad_norm": 4.058577060699463, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.862242579460144, + "num_tokens": 545627127.0, + "step": 14307 + }, + { + "epoch": 1.8201246660730188, + "ewc_loss": 0.008125671185553074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.1256715930067e-05, + "grad_norm": 3.938793182373047, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8802530765533447, + "num_tokens": 545665473.0, + "step": 14308 + }, + { + "epoch": 1.8202518763516093, + "ewc_loss": 0.008009168319404125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.009168232092634e-05, + "grad_norm": 3.9479336738586426, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8911834955215454, + "num_tokens": 545708018.0, + "step": 14309 + }, + { + "epoch": 
1.8203790866301999, + "ewc_loss": 0.008068995550274849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068995521171018e-05, + "grad_norm": 3.9964680671691895, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8815882802009583, + "num_tokens": 545745635.0, + "step": 14310 + }, + { + "epoch": 1.8205062969087902, + "ewc_loss": 0.008075025863945484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.075025834841654e-05, + "grad_norm": 3.9811694622039795, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8652591705322266, + "num_tokens": 545785260.0, + "step": 14311 + }, + { + "epoch": 1.8206335071873807, + "ewc_loss": 0.008048531599342823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.048531162785366e-05, + "grad_norm": 4.009826183319092, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8750967979431152, + "num_tokens": 545821696.0, + "step": 14312 + }, + { + "epoch": 1.8207607174659712, + "ewc_loss": 0.008064167574048042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.064167923294008e-05, + "grad_norm": 3.9723854064941406, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8884769082069397, + "num_tokens": 545857971.0, + "step": 14313 + }, + { + "epoch": 1.8208879277445618, + "ewc_loss": 0.008038513362407684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.038512896746397e-05, + "grad_norm": 4.075833797454834, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8660085797309875, + "num_tokens": 545891000.0, + "step": 14314 + }, + { + "epoch": 1.8210151380231523, + "ewc_loss": 0.008093545213341713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093545329757035e-05, + "grad_norm": 3.954939603805542, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8628218173980713, + "num_tokens": 545932825.0, + "step": 14315 + }, + { + "epoch": 1.8211423483017428, + "ewc_loss": 0.00800507701933384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.005076961126179e-05, + "grad_norm": 3.968040704727173, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8664371967315674, + "num_tokens": 545972849.0, + "step": 14316 + }, + { + "epoch": 1.8212695585803331, + "ewc_loss": 0.00808736402541399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.087364403763786e-05, + "grad_norm": 3.9595015048980713, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8874579071998596, + "num_tokens": 546011492.0, + "step": 14317 + }, + { + "epoch": 1.8213967688589237, + "ewc_loss": 0.00805690512061119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.05690506240353e-05, + "grad_norm": 4.026479721069336, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8703990578651428, + "num_tokens": 546048461.0, + "step": 14318 + }, + { + "epoch": 1.8215239791375142, + "ewc_loss": 0.00809216033667326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.092160715023056e-05, + "grad_norm": 3.938347101211548, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8846508860588074, + "num_tokens": 546087113.0, + "step": 14319 + }, + { + "epoch": 1.8216511894161047, + "ewc_loss": 0.008012568578124046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.012568287085742e-05, + "grad_norm": 4.008780479431152, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8786094188690186, + "num_tokens": 546124413.0, + "step": 14320 + }, + { + "epoch": 1.8217783996946952, + "ewc_loss": 0.008080778643488884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080778206931427e-05, + "grad_norm": 3.895472764968872, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.886491060256958, + "num_tokens": 546165763.0, + "step": 14321 + }, + { + "epoch": 1.8219056099732858, + "ewc_loss": 0.00798838771879673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.988388097146526e-05, + "grad_norm": 3.9680275917053223, + "learning_rate": 1e-06, + "loss": 0.3593, + 
"mean_token_accuracy": 0.8773490190505981, + "num_tokens": 546203218.0, + "step": 14322 + }, + { + "epoch": 1.8220328202518763, + "ewc_loss": 0.008070018142461777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070017793215811e-05, + "grad_norm": 3.9570720195770264, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8670573830604553, + "num_tokens": 546247610.0, + "step": 14323 + }, + { + "epoch": 1.8221600305304668, + "ewc_loss": 0.008026139810681343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026140130823478e-05, + "grad_norm": 3.9861254692077637, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8668435215950012, + "num_tokens": 546288952.0, + "step": 14324 + }, + { + "epoch": 1.8222872408090574, + "ewc_loss": 0.00804249756038189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.042497938731685e-05, + "grad_norm": 3.989983558654785, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8875788450241089, + "num_tokens": 546322449.0, + "step": 14325 + }, + { + "epoch": 1.8224144510876479, + "ewc_loss": 0.008039446547627449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039446402108297e-05, + "grad_norm": 3.9396557807922363, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8822104334831238, + "num_tokens": 546362692.0, + "step": 14326 + }, + { + "epoch": 1.8225416613662384, + "ewc_loss": 0.008001159876585007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.001160313142464e-05, + "grad_norm": 4.045745849609375, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8604009747505188, + "num_tokens": 546398856.0, + "step": 14327 + }, + { + "epoch": 1.822668871644829, + "ewc_loss": 0.008082129061222076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.082129352260381e-05, + "grad_norm": 3.97343111038208, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8677632212638855, + "num_tokens": 546439633.0, + "step": 14328 + }, + { + "epoch": 
1.8227960819234195, + "ewc_loss": 0.007989960722625256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.989960431586951e-05, + "grad_norm": 3.9478492736816406, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8793985247612, + "num_tokens": 546477282.0, + "step": 14329 + }, + { + "epoch": 1.82292329220201, + "ewc_loss": 0.008022533729672432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.022533438634127e-05, + "grad_norm": 3.958695411682129, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8858696222305298, + "num_tokens": 546514584.0, + "step": 14330 + }, + { + "epoch": 1.8230505024806005, + "ewc_loss": 0.008026892319321632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026892464840785e-05, + "grad_norm": 3.969106674194336, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8800891041755676, + "num_tokens": 546550761.0, + "step": 14331 + }, + { + "epoch": 1.823177712759191, + "ewc_loss": 0.008019981905817986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.019981760298833e-05, + "grad_norm": 3.969179153442383, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8627300262451172, + "num_tokens": 546594157.0, + "step": 14332 + }, + { + "epoch": 1.8233049230377816, + "ewc_loss": 0.008007623255252838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007622818695381e-05, + "grad_norm": 3.933598756790161, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8838748931884766, + "num_tokens": 546634671.0, + "step": 14333 + }, + { + "epoch": 1.823432133316372, + "ewc_loss": 0.007975024171173573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.975024345796555e-05, + "grad_norm": 3.9936883449554443, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8767719268798828, + "num_tokens": 546672084.0, + "step": 14334 + }, + { + "epoch": 1.8235593435949626, + "ewc_loss": 0.008045974187552929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.045974391279742e-05, + "grad_norm": 4.0686798095703125, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8699671626091003, + "num_tokens": 546701920.0, + "step": 14335 + }, + { + "epoch": 1.823686553873553, + "ewc_loss": 0.008067939430475235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067939052125439e-05, + "grad_norm": 3.9897449016571045, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8804051876068115, + "num_tokens": 546735776.0, + "step": 14336 + }, + { + "epoch": 1.8238137641521435, + "ewc_loss": 0.007984769530594349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.984769763424993e-05, + "grad_norm": 3.98659348487854, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8811256885528564, + "num_tokens": 546772991.0, + "step": 14337 + }, + { + "epoch": 1.823940974430734, + "ewc_loss": 0.008035083301365376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035083737922832e-05, + "grad_norm": 4.03616189956665, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8694737553596497, + "num_tokens": 546805310.0, + "step": 14338 + }, + { + "epoch": 1.8240681847093245, + "ewc_loss": 0.008041160181164742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.041160617722198e-05, + "grad_norm": 4.008354663848877, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8825404644012451, + "num_tokens": 546838240.0, + "step": 14339 + }, + { + "epoch": 1.824195394987915, + "ewc_loss": 0.008031737059354782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03173752501607e-05, + "grad_norm": 3.9802050590515137, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.870557427406311, + "num_tokens": 546878484.0, + "step": 14340 + }, + { + "epoch": 1.8243226052665056, + "ewc_loss": 0.008031321689486504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031321340240538e-05, + "grad_norm": 3.9382107257843018, + "learning_rate": 1e-06, + "loss": 0.3212, + 
"mean_token_accuracy": 0.8904410600662231, + "num_tokens": 546919747.0, + "step": 14341 + }, + { + "epoch": 1.8244498155450959, + "ewc_loss": 0.008019602857530117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.019602682907134e-05, + "grad_norm": 3.943479061126709, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8747018575668335, + "num_tokens": 546962614.0, + "step": 14342 + }, + { + "epoch": 1.8245770258236864, + "ewc_loss": 0.00804364588111639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.043646084843203e-05, + "grad_norm": 3.9721407890319824, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8698290586471558, + "num_tokens": 547000059.0, + "step": 14343 + }, + { + "epoch": 1.824704236102277, + "ewc_loss": 0.008047197945415974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.047198207350448e-05, + "grad_norm": 3.9504547119140625, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8761624097824097, + "num_tokens": 547037372.0, + "step": 14344 + }, + { + "epoch": 1.8248314463808675, + "ewc_loss": 0.008034848608076572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.034848724491894e-05, + "grad_norm": 4.035909175872803, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8624976873397827, + "num_tokens": 547071840.0, + "step": 14345 + }, + { + "epoch": 1.824958656659458, + "ewc_loss": 0.008093567565083504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093567885225639e-05, + "grad_norm": 3.986280679702759, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8523461222648621, + "num_tokens": 547108916.0, + "step": 14346 + }, + { + "epoch": 1.8250858669380485, + "ewc_loss": 0.00803286861628294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.032868208829314e-05, + "grad_norm": 3.9625821113586426, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8934659957885742, + "num_tokens": 547146991.0, + "step": 14347 + }, + { + "epoch": 
1.825213077216639, + "ewc_loss": 0.008048741146922112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.048740710364655e-05, + "grad_norm": 3.9711713790893555, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8847471475601196, + "num_tokens": 547184093.0, + "step": 14348 + }, + { + "epoch": 1.8253402874952296, + "ewc_loss": 0.008059712126851082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059712126851082e-05, + "grad_norm": 4.0196661949157715, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8822624683380127, + "num_tokens": 547217570.0, + "step": 14349 + }, + { + "epoch": 1.8254674977738201, + "ewc_loss": 0.008064337074756622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.064336725510657e-05, + "grad_norm": 3.918478488922119, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8764075040817261, + "num_tokens": 547259498.0, + "step": 14350 + }, + { + "epoch": 1.8255947080524106, + "ewc_loss": 0.00799760315567255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997603097464889e-05, + "grad_norm": 3.939085006713867, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8752765655517578, + "num_tokens": 547298419.0, + "step": 14351 + }, + { + "epoch": 1.8257219183310012, + "ewc_loss": 0.008058463223278522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.058462844928727e-05, + "grad_norm": 3.911259889602661, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8822823762893677, + "num_tokens": 547342984.0, + "step": 14352 + }, + { + "epoch": 1.8258491286095917, + "ewc_loss": 0.00803093146532774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.030931348912418e-05, + "grad_norm": 3.966254949569702, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8776392936706543, + "num_tokens": 547383162.0, + "step": 14353 + }, + { + "epoch": 1.8259763388881822, + "ewc_loss": 0.008065990172326565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.065989823080599e-05, + "grad_norm": 3.9822590351104736, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8960515260696411, + "num_tokens": 547416967.0, + "step": 14354 + }, + { + "epoch": 1.8261035491667728, + "ewc_loss": 0.008047095499932766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.047095616348088e-05, + "grad_norm": 3.970379114151001, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8790864944458008, + "num_tokens": 547454695.0, + "step": 14355 + }, + { + "epoch": 1.8262307594453633, + "ewc_loss": 0.008035934530198574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03593429736793e-05, + "grad_norm": 4.01383638381958, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8868852853775024, + "num_tokens": 547494258.0, + "step": 14356 + }, + { + "epoch": 1.8263579697239538, + "ewc_loss": 0.008050466887652874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.0504672951065e-05, + "grad_norm": 3.9645378589630127, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8718560934066772, + "num_tokens": 547531404.0, + "step": 14357 + }, + { + "epoch": 1.8264851800025443, + "ewc_loss": 0.007999683730304241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.999684021342546e-05, + "grad_norm": 3.996288537979126, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8813360333442688, + "num_tokens": 547566553.0, + "step": 14358 + }, + { + "epoch": 1.8266123902811349, + "ewc_loss": 0.00803880114108324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.038801024667919e-05, + "grad_norm": 3.9818975925445557, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8705145120620728, + "num_tokens": 547605505.0, + "step": 14359 + }, + { + "epoch": 1.8267396005597252, + "ewc_loss": 0.00801020860671997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.010208694031462e-05, + "grad_norm": 3.9426445960998535, + "learning_rate": 1e-06, + "loss": 0.3316, + 
"mean_token_accuracy": 0.8849167227745056, + "num_tokens": 547643146.0, + "step": 14360 + }, + { + "epoch": 1.8268668108383157, + "ewc_loss": 0.007997414097189903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99741392256692e-05, + "grad_norm": 3.9871788024902344, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.872237503528595, + "num_tokens": 547683373.0, + "step": 14361 + }, + { + "epoch": 1.8269940211169062, + "ewc_loss": 0.00804454367607832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.044543938012794e-05, + "grad_norm": 4.0785088539123535, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8723316192626953, + "num_tokens": 547716762.0, + "step": 14362 + }, + { + "epoch": 1.8271212313954968, + "ewc_loss": 0.008065450005233288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.065449947025627e-05, + "grad_norm": 3.9659314155578613, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8849183917045593, + "num_tokens": 547756907.0, + "step": 14363 + }, + { + "epoch": 1.8272484416740873, + "ewc_loss": 0.007976540364325047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976540655363351e-05, + "grad_norm": 3.9629323482513428, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8725630044937134, + "num_tokens": 547793545.0, + "step": 14364 + }, + { + "epoch": 1.8273756519526778, + "ewc_loss": 0.00802681129425764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026810974115506e-05, + "grad_norm": 3.9729185104370117, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8667494058609009, + "num_tokens": 547836845.0, + "step": 14365 + }, + { + "epoch": 1.8275028622312681, + "ewc_loss": 0.008031477220356464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031477045733482e-05, + "grad_norm": 3.9826722145080566, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8819965124130249, + "num_tokens": 547875849.0, + "step": 14366 + }, + { + "epoch": 
1.8276300725098586, + "ewc_loss": 0.008020129054784775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02012873464264e-05, + "grad_norm": 3.9376673698425293, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8711100816726685, + "num_tokens": 547918672.0, + "step": 14367 + }, + { + "epoch": 1.8277572827884492, + "ewc_loss": 0.007994177751243114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.994177576620132e-05, + "grad_norm": 4.057089805603027, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8631225824356079, + "num_tokens": 547958681.0, + "step": 14368 + }, + { + "epoch": 1.8278844930670397, + "ewc_loss": 0.008068036288022995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068036549957469e-05, + "grad_norm": 3.9693233966827393, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8718481063842773, + "num_tokens": 547996940.0, + "step": 14369 + }, + { + "epoch": 1.8280117033456302, + "ewc_loss": 0.007966272532939911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.966272096382454e-05, + "grad_norm": 4.016201972961426, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8849462270736694, + "num_tokens": 548030188.0, + "step": 14370 + }, + { + "epoch": 1.8281389136242208, + "ewc_loss": 0.008032659068703651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.032659388845786e-05, + "grad_norm": 3.9295318126678467, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8913587927818298, + "num_tokens": 548066891.0, + "step": 14371 + }, + { + "epoch": 1.8282661239028113, + "ewc_loss": 0.007963817566633224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.96381791587919e-05, + "grad_norm": 3.956141948699951, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8811744451522827, + "num_tokens": 548104664.0, + "step": 14372 + }, + { + "epoch": 1.8283933341814018, + "ewc_loss": 0.00802161917090416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.021618850762025e-05, + "grad_norm": 3.975217580795288, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8716601133346558, + "num_tokens": 548143938.0, + "step": 14373 + }, + { + "epoch": 1.8285205444599923, + "ewc_loss": 0.008008183911442757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.008183795027435e-05, + "grad_norm": 3.9532532691955566, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8746916055679321, + "num_tokens": 548183213.0, + "step": 14374 + }, + { + "epoch": 1.8286477547385829, + "ewc_loss": 0.008000441826879978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.000442176125944e-05, + "grad_norm": 4.043869495391846, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8606398105621338, + "num_tokens": 548221304.0, + "step": 14375 + }, + { + "epoch": 1.8287749650171734, + "ewc_loss": 0.008053860627114773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.053860801737756e-05, + "grad_norm": 3.9352080821990967, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8731915950775146, + "num_tokens": 548261123.0, + "step": 14376 + }, + { + "epoch": 1.828902175295764, + "ewc_loss": 0.007967902347445488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.967901910888031e-05, + "grad_norm": 3.9646267890930176, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.857447624206543, + "num_tokens": 548305040.0, + "step": 14377 + }, + { + "epoch": 1.8290293855743545, + "ewc_loss": 0.008020115084946156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.020114910323173e-05, + "grad_norm": 3.977567195892334, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8842665553092957, + "num_tokens": 548338538.0, + "step": 14378 + }, + { + "epoch": 1.829156595852945, + "ewc_loss": 0.008024848997592926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02484864834696e-05, + "grad_norm": 3.951810121536255, + "learning_rate": 1e-06, + "loss": 0.3434, + 
"mean_token_accuracy": 0.8796066641807556, + "num_tokens": 548378510.0, + "step": 14379 + }, + { + "epoch": 1.8292838061315355, + "ewc_loss": 0.007994420826435089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.994420593604445e-05, + "grad_norm": 3.99245548248291, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8706150650978088, + "num_tokens": 548415790.0, + "step": 14380 + }, + { + "epoch": 1.829411016410126, + "ewc_loss": 0.008044383488595486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.04438313934952e-05, + "grad_norm": 4.039621353149414, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8678483366966248, + "num_tokens": 548447294.0, + "step": 14381 + }, + { + "epoch": 1.8295382266887166, + "ewc_loss": 0.008055276237428188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055275975493714e-05, + "grad_norm": 3.976933002471924, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8831601142883301, + "num_tokens": 548484617.0, + "step": 14382 + }, + { + "epoch": 1.829665436967307, + "ewc_loss": 0.008013315498828888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.013315527932718e-05, + "grad_norm": 3.933298110961914, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8730639815330505, + "num_tokens": 548527552.0, + "step": 14383 + }, + { + "epoch": 1.8297926472458976, + "ewc_loss": 0.008028116077184677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02811628091149e-05, + "grad_norm": 3.980659246444702, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8939067125320435, + "num_tokens": 548561322.0, + "step": 14384 + }, + { + "epoch": 1.829919857524488, + "ewc_loss": 0.008048361167311668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.048361632972956e-05, + "grad_norm": 3.993820905685425, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8861303329467773, + "num_tokens": 548594810.0, + "step": 14385 + }, + { + "epoch": 
1.8300470678030785, + "ewc_loss": 0.00804455578327179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.044555579544976e-05, + "grad_norm": 4.0004563331604, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8697695136070251, + "num_tokens": 548631094.0, + "step": 14386 + }, + { + "epoch": 1.830174278081669, + "ewc_loss": 0.008049262687563896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.049262396525592e-05, + "grad_norm": 4.027559757232666, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8777972459793091, + "num_tokens": 548669162.0, + "step": 14387 + }, + { + "epoch": 1.8303014883602595, + "ewc_loss": 0.008061842992901802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061842527240515e-05, + "grad_norm": 3.9944393634796143, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8763997554779053, + "num_tokens": 548704555.0, + "step": 14388 + }, + { + "epoch": 1.83042869863885, + "ewc_loss": 0.00803749542683363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.037495717871934e-05, + "grad_norm": 3.9403340816497803, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8879945278167725, + "num_tokens": 548741852.0, + "step": 14389 + }, + { + "epoch": 1.8305559089174406, + "ewc_loss": 0.00802954938262701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.029549644561484e-05, + "grad_norm": 3.964000940322876, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8638059496879578, + "num_tokens": 548788455.0, + "step": 14390 + }, + { + "epoch": 1.8306831191960309, + "ewc_loss": 0.008047112263739109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.0471123510506e-05, + "grad_norm": 3.986100435256958, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8828279972076416, + "num_tokens": 548826347.0, + "step": 14391 + }, + { + "epoch": 1.8308103294746214, + "ewc_loss": 0.008061805739998817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.061805419856682e-05, + "grad_norm": 3.9778223037719727, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8687670230865479, + "num_tokens": 548865428.0, + "step": 14392 + }, + { + "epoch": 1.830937539753212, + "ewc_loss": 0.008014131337404251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.014131162781268e-05, + "grad_norm": 3.9672491550445557, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8657114505767822, + "num_tokens": 548901671.0, + "step": 14393 + }, + { + "epoch": 1.8310647500318025, + "ewc_loss": 0.008037756197154522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.037756197154522e-05, + "grad_norm": 3.9903340339660645, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8729795813560486, + "num_tokens": 548939234.0, + "step": 14394 + }, + { + "epoch": 1.831191960310393, + "ewc_loss": 0.008051139302551746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.05113886599429e-05, + "grad_norm": 3.963392496109009, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8805716037750244, + "num_tokens": 548975334.0, + "step": 14395 + }, + { + "epoch": 1.8313191705889835, + "ewc_loss": 0.008062236942350864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.062236884143203e-05, + "grad_norm": 3.9619052410125732, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8687796592712402, + "num_tokens": 549016372.0, + "step": 14396 + }, + { + "epoch": 1.831446380867574, + "ewc_loss": 0.008045732043683529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.04573210189119e-05, + "grad_norm": 3.9182989597320557, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8845125436782837, + "num_tokens": 549058049.0, + "step": 14397 + }, + { + "epoch": 1.8315735911461646, + "ewc_loss": 0.008013689890503883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.013689512154087e-05, + "grad_norm": 4.027935028076172, + "learning_rate": 1e-06, + "loss": 0.402, + 
"mean_token_accuracy": 0.864287793636322, + "num_tokens": 549097855.0, + "step": 14398 + }, + { + "epoch": 1.831700801424755, + "ewc_loss": 0.008094960823655128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.094960503512993e-05, + "grad_norm": 3.9967663288116455, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8960564136505127, + "num_tokens": 549130906.0, + "step": 14399 + }, + { + "epoch": 1.8318280117033456, + "ewc_loss": 0.008005953393876553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.005953714018688e-05, + "grad_norm": 3.9407238960266113, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8581408262252808, + "num_tokens": 549176574.0, + "step": 14400 + }, + { + "epoch": 1.8319552219819362, + "ewc_loss": 0.008005788549780846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.005788549780846e-05, + "grad_norm": 4.002152919769287, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.87446129322052, + "num_tokens": 549213945.0, + "step": 14401 + }, + { + "epoch": 1.8320824322605267, + "ewc_loss": 0.008059424348175526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.05942399892956e-05, + "grad_norm": 3.941303253173828, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8675523996353149, + "num_tokens": 549255843.0, + "step": 14402 + }, + { + "epoch": 1.8322096425391172, + "ewc_loss": 0.008010542951524258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.010542660485953e-05, + "grad_norm": 3.9965028762817383, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8794884085655212, + "num_tokens": 549292573.0, + "step": 14403 + }, + { + "epoch": 1.8323368528177078, + "ewc_loss": 0.008060378022491932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.06037787697278e-05, + "grad_norm": 3.885114908218384, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8890501856803894, + "num_tokens": 549335291.0, + "step": 14404 + }, + { + "epoch": 
1.8324640630962983, + "ewc_loss": 0.007968399673700333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.968399586388841e-05, + "grad_norm": 3.954960584640503, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8632622957229614, + "num_tokens": 549381337.0, + "step": 14405 + }, + { + "epoch": 1.8325912733748888, + "ewc_loss": 0.008046713657677174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.046713628573343e-05, + "grad_norm": 3.9817261695861816, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8856849670410156, + "num_tokens": 549418180.0, + "step": 14406 + }, + { + "epoch": 1.8327184836534793, + "ewc_loss": 0.008024186827242374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02418653620407e-05, + "grad_norm": 3.9202115535736084, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8829615116119385, + "num_tokens": 549461623.0, + "step": 14407 + }, + { + "epoch": 1.8328456939320699, + "ewc_loss": 0.007979925721883774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.97992615844123e-05, + "grad_norm": 3.9845757484436035, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8799678087234497, + "num_tokens": 549500791.0, + "step": 14408 + }, + { + "epoch": 1.8329729042106602, + "ewc_loss": 0.008051946759223938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.051946497289464e-05, + "grad_norm": 4.021050930023193, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8831771612167358, + "num_tokens": 549533343.0, + "step": 14409 + }, + { + "epoch": 1.8331001144892507, + "ewc_loss": 0.008023083209991455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02308350102976e-05, + "grad_norm": 3.964240789413452, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8692089319229126, + "num_tokens": 549572707.0, + "step": 14410 + }, + { + "epoch": 1.8332273247678412, + "ewc_loss": 0.007986397482454777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.986397395143285e-05, + "grad_norm": 3.9859578609466553, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8825390338897705, + "num_tokens": 549609772.0, + "step": 14411 + }, + { + "epoch": 1.8333545350464318, + "ewc_loss": 0.00802610069513321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026100840652362e-05, + "grad_norm": 4.002768039703369, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8646061420440674, + "num_tokens": 549648626.0, + "step": 14412 + }, + { + "epoch": 1.8334817453250223, + "ewc_loss": 0.008013397455215454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.013397018657997e-05, + "grad_norm": 3.9520299434661865, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.864607572555542, + "num_tokens": 549693211.0, + "step": 14413 + }, + { + "epoch": 1.8336089556036128, + "ewc_loss": 0.007985533215105534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.98553301137872e-05, + "grad_norm": 4.032618999481201, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8710647225379944, + "num_tokens": 549729463.0, + "step": 14414 + }, + { + "epoch": 1.8337361658822031, + "ewc_loss": 0.00803814735263586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.038147643674165e-05, + "grad_norm": 3.958223819732666, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.880855143070221, + "num_tokens": 549769139.0, + "step": 14415 + }, + { + "epoch": 1.8338633761607936, + "ewc_loss": 0.007991543971002102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.991544407559559e-05, + "grad_norm": 3.9669597148895264, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.879449188709259, + "num_tokens": 549807067.0, + "step": 14416 + }, + { + "epoch": 1.8339905864393842, + "ewc_loss": 0.008007396943867207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00739653641358e-05, + "grad_norm": 3.9955670833587646, + "learning_rate": 1e-06, + "loss": 0.3732, + 
"mean_token_accuracy": 0.8752595782279968, + "num_tokens": 549842689.0, + "step": 14417 + }, + { + "epoch": 1.8341177967179747, + "ewc_loss": 0.008045442402362823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.045442518778145e-05, + "grad_norm": 4.001235485076904, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8953709602355957, + "num_tokens": 549876108.0, + "step": 14418 + }, + { + "epoch": 1.8342450069965652, + "ewc_loss": 0.008018190041184425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01818969193846e-05, + "grad_norm": 3.964124917984009, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8894287347793579, + "num_tokens": 549909656.0, + "step": 14419 + }, + { + "epoch": 1.8343722172751558, + "ewc_loss": 0.008009042590856552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.009042358025908e-05, + "grad_norm": 3.9748241901397705, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8681365251541138, + "num_tokens": 549949156.0, + "step": 14420 + }, + { + "epoch": 1.8344994275537463, + "ewc_loss": 0.008026694878935814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026694558793679e-05, + "grad_norm": 3.9333717823028564, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8822280168533325, + "num_tokens": 549990256.0, + "step": 14421 + }, + { + "epoch": 1.8346266378323368, + "ewc_loss": 0.008004758507013321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.004758274182677e-05, + "grad_norm": 3.999539613723755, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8855791091918945, + "num_tokens": 550023557.0, + "step": 14422 + }, + { + "epoch": 1.8347538481109273, + "ewc_loss": 0.008061396889388561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061396511038765e-05, + "grad_norm": 3.977396011352539, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8723233938217163, + "num_tokens": 550069402.0, + "step": 14423 + }, + { + "epoch": 
1.8348810583895179, + "ewc_loss": 0.008020277135074139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.020277164177969e-05, + "grad_norm": 3.9410641193389893, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8837127089500427, + "num_tokens": 550110860.0, + "step": 14424 + }, + { + "epoch": 1.8350082686681084, + "ewc_loss": 0.007997258566319942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997258217073977e-05, + "grad_norm": 3.9986491203308105, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8601465225219727, + "num_tokens": 550151736.0, + "step": 14425 + }, + { + "epoch": 1.835135478946699, + "ewc_loss": 0.008063208311796188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.063208224484697e-05, + "grad_norm": 3.994885206222534, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8793689012527466, + "num_tokens": 550187731.0, + "step": 14426 + }, + { + "epoch": 1.8352626892252895, + "ewc_loss": 0.00801868923008442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018688822630793e-05, + "grad_norm": 4.044010162353516, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8780754804611206, + "num_tokens": 550220545.0, + "step": 14427 + }, + { + "epoch": 1.83538989950388, + "ewc_loss": 0.008070424199104309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070424519246444e-05, + "grad_norm": 3.9854516983032227, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8798699975013733, + "num_tokens": 550254755.0, + "step": 14428 + }, + { + "epoch": 1.8355171097824705, + "ewc_loss": 0.008018437772989273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018437802093104e-05, + "grad_norm": 3.9592819213867188, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8842814564704895, + "num_tokens": 550295191.0, + "step": 14429 + }, + { + "epoch": 1.835644320061061, + "ewc_loss": 0.008024157956242561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.024158159969375e-05, + "grad_norm": 4.020264625549316, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8719392418861389, + "num_tokens": 550332984.0, + "step": 14430 + }, + { + "epoch": 1.8357715303396516, + "ewc_loss": 0.008070813491940498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070813055383042e-05, + "grad_norm": 3.9735751152038574, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8780901432037354, + "num_tokens": 550377470.0, + "step": 14431 + }, + { + "epoch": 1.835898740618242, + "ewc_loss": 0.007993376813828945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99337649368681e-05, + "grad_norm": 3.9543514251708984, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8809041380882263, + "num_tokens": 550416123.0, + "step": 14432 + }, + { + "epoch": 1.8360259508968326, + "ewc_loss": 0.008031465113162994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031464676605538e-05, + "grad_norm": 3.960538625717163, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8815298676490784, + "num_tokens": 550458062.0, + "step": 14433 + }, + { + "epoch": 1.836153161175423, + "ewc_loss": 0.00801336020231247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.013359911274165e-05, + "grad_norm": 4.012604236602783, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8705199956893921, + "num_tokens": 550496485.0, + "step": 14434 + }, + { + "epoch": 1.8362803714540135, + "ewc_loss": 0.00804359745234251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.043597335927188e-05, + "grad_norm": 3.9582340717315674, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8789594173431396, + "num_tokens": 550536500.0, + "step": 14435 + }, + { + "epoch": 1.836407581732604, + "ewc_loss": 0.007968474179506302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.968474528752267e-05, + "grad_norm": 3.9921176433563232, + "learning_rate": 1e-06, + "loss": 0.3907, + 
"mean_token_accuracy": 0.8706466555595398, + "num_tokens": 550575611.0, + "step": 14436 + }, + { + "epoch": 1.8365347920111945, + "ewc_loss": 0.008033175021409988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.033175254240632e-05, + "grad_norm": 3.981058120727539, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.869169294834137, + "num_tokens": 550616431.0, + "step": 14437 + }, + { + "epoch": 1.836662002289785, + "ewc_loss": 0.008001661859452724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.001661626622081e-05, + "grad_norm": 3.9680795669555664, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8863122463226318, + "num_tokens": 550653626.0, + "step": 14438 + }, + { + "epoch": 1.8367892125683756, + "ewc_loss": 0.008001747541129589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.001747482921928e-05, + "grad_norm": 4.085880756378174, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8727298378944397, + "num_tokens": 550690899.0, + "step": 14439 + }, + { + "epoch": 1.8369164228469659, + "ewc_loss": 0.008064272813498974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.064272697083652e-05, + "grad_norm": 3.9987146854400635, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8769837617874146, + "num_tokens": 550734408.0, + "step": 14440 + }, + { + "epoch": 1.8370436331255564, + "ewc_loss": 0.00795096904039383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.950969302328303e-05, + "grad_norm": 4.05393123626709, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.864177942276001, + "num_tokens": 550767888.0, + "step": 14441 + }, + { + "epoch": 1.837170843404147, + "ewc_loss": 0.008031627163290977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031626930460334e-05, + "grad_norm": 3.9625229835510254, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8774113059043884, + "num_tokens": 550807192.0, + "step": 14442 + }, + { + "epoch": 
1.8372980536827375, + "ewc_loss": 0.007972386665642262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.972386811161414e-05, + "grad_norm": 3.9572129249572754, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8859108090400696, + "num_tokens": 550848006.0, + "step": 14443 + }, + { + "epoch": 1.837425263961328, + "ewc_loss": 0.007975822314620018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.975822518346831e-05, + "grad_norm": 3.9413461685180664, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8758640289306641, + "num_tokens": 550888350.0, + "step": 14444 + }, + { + "epoch": 1.8375524742399185, + "ewc_loss": 0.007995713502168655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.995713531272486e-05, + "grad_norm": 4.004329681396484, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8830270171165466, + "num_tokens": 550925389.0, + "step": 14445 + }, + { + "epoch": 1.837679684518509, + "ewc_loss": 0.008023512549698353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023512782528996e-05, + "grad_norm": 4.006978511810303, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8812727928161621, + "num_tokens": 550962610.0, + "step": 14446 + }, + { + "epoch": 1.8378068947970996, + "ewc_loss": 0.008000788278877735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.000788511708379e-05, + "grad_norm": 3.998051166534424, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8720009922981262, + "num_tokens": 551001018.0, + "step": 14447 + }, + { + "epoch": 1.83793410507569, + "ewc_loss": 0.007992176339030266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992176688276231e-05, + "grad_norm": 4.023508071899414, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8784676790237427, + "num_tokens": 551038505.0, + "step": 14448 + }, + { + "epoch": 1.8380613153542806, + "ewc_loss": 0.00800225418061018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.002254617167637e-05, + "grad_norm": 3.9669361114501953, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8857635855674744, + "num_tokens": 551072235.0, + "step": 14449 + }, + { + "epoch": 1.8381885256328712, + "ewc_loss": 0.007980606518685818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.980606460478157e-05, + "grad_norm": 3.9389684200286865, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8890355825424194, + "num_tokens": 551110129.0, + "step": 14450 + }, + { + "epoch": 1.8383157359114617, + "ewc_loss": 0.007982520386576653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.982520764926448e-05, + "grad_norm": 3.941729784011841, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8778811693191528, + "num_tokens": 551152380.0, + "step": 14451 + }, + { + "epoch": 1.8384429461900522, + "ewc_loss": 0.007997730746865273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997731154318899e-05, + "grad_norm": 3.9425840377807617, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8872636556625366, + "num_tokens": 551195942.0, + "step": 14452 + }, + { + "epoch": 1.8385701564686427, + "ewc_loss": 0.007982905954122543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.982905663084239e-05, + "grad_norm": 3.989255666732788, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8700209259986877, + "num_tokens": 551238502.0, + "step": 14453 + }, + { + "epoch": 1.8386973667472333, + "ewc_loss": 0.007996953092515469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.996953354449943e-05, + "grad_norm": 3.939314842224121, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8861998915672302, + "num_tokens": 551278091.0, + "step": 14454 + }, + { + "epoch": 1.8388245770258238, + "ewc_loss": 0.007955504581332207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.955504406709224e-05, + "grad_norm": 4.03248929977417, + "learning_rate": 1e-06, + "loss": 0.4188, + 
"mean_token_accuracy": 0.8586286902427673, + "num_tokens": 551314940.0, + "step": 14455 + }, + { + "epoch": 1.8389517873044143, + "ewc_loss": 0.008011290803551674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.011290628928691e-05, + "grad_norm": 3.9577882289886475, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8647923469543457, + "num_tokens": 551356427.0, + "step": 14456 + }, + { + "epoch": 1.8390789975830049, + "ewc_loss": 0.007949527353048325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.949527207529172e-05, + "grad_norm": 3.9569265842437744, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8818267583847046, + "num_tokens": 551400436.0, + "step": 14457 + }, + { + "epoch": 1.8392062078615952, + "ewc_loss": 0.00795953068882227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.959530921652913e-05, + "grad_norm": 3.9889352321624756, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8755626678466797, + "num_tokens": 551438335.0, + "step": 14458 + }, + { + "epoch": 1.8393334181401857, + "ewc_loss": 0.007975742220878601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.975741755217314e-05, + "grad_norm": 4.0136799812316895, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8798409700393677, + "num_tokens": 551471214.0, + "step": 14459 + }, + { + "epoch": 1.8394606284187762, + "ewc_loss": 0.007987801916897297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.987801654962823e-05, + "grad_norm": 4.0555806159973145, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8637955188751221, + "num_tokens": 551504071.0, + "step": 14460 + }, + { + "epoch": 1.8395878386973668, + "ewc_loss": 0.008018119260668755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018119115149602e-05, + "grad_norm": 4.005390644073486, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8719459176063538, + "num_tokens": 551538456.0, + "step": 14461 + }, + { + "epoch": 
1.8397150489759573, + "ewc_loss": 0.007952778600156307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.952778832986951e-05, + "grad_norm": 3.9613544940948486, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.880752682685852, + "num_tokens": 551575334.0, + "step": 14462 + }, + { + "epoch": 1.8398422592545478, + "ewc_loss": 0.008003275841474533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003275434020907e-05, + "grad_norm": 3.9663941860198975, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8842910528182983, + "num_tokens": 551617127.0, + "step": 14463 + }, + { + "epoch": 1.8399694695331381, + "ewc_loss": 0.008006162941455841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.006162534002215e-05, + "grad_norm": 3.9658074378967285, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8806700706481934, + "num_tokens": 551655548.0, + "step": 14464 + }, + { + "epoch": 1.8400966798117286, + "ewc_loss": 0.00799848884344101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.998489309102297e-05, + "grad_norm": 3.9826645851135254, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8737943172454834, + "num_tokens": 551691807.0, + "step": 14465 + }, + { + "epoch": 1.8402238900903192, + "ewc_loss": 0.008025235496461391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025235729292035e-05, + "grad_norm": 3.9847755432128906, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.884903609752655, + "num_tokens": 551729805.0, + "step": 14466 + }, + { + "epoch": 1.8403511003689097, + "ewc_loss": 0.00802559033036232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025590796023607e-05, + "grad_norm": 3.986943483352661, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8554288744926453, + "num_tokens": 551775220.0, + "step": 14467 + }, + { + "epoch": 1.8404783106475002, + "ewc_loss": 0.008019938133656979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.019938104553148e-05, + "grad_norm": 3.9802510738372803, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.869611382484436, + "num_tokens": 551816242.0, + "step": 14468 + }, + { + "epoch": 1.8406055209260908, + "ewc_loss": 0.007999763824045658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.999764056876302e-05, + "grad_norm": 3.9604504108428955, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8659384250640869, + "num_tokens": 551854929.0, + "step": 14469 + }, + { + "epoch": 1.8407327312046813, + "ewc_loss": 0.00804079044610262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.040790271479636e-05, + "grad_norm": 3.949773073196411, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8743017911911011, + "num_tokens": 551898877.0, + "step": 14470 + }, + { + "epoch": 1.8408599414832718, + "ewc_loss": 0.008037334308028221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.037334191612899e-05, + "grad_norm": 3.972705602645874, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8922170400619507, + "num_tokens": 551934252.0, + "step": 14471 + }, + { + "epoch": 1.8409871517618623, + "ewc_loss": 0.008042186498641968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.042186527745798e-05, + "grad_norm": 3.9367575645446777, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8878244161605835, + "num_tokens": 551974779.0, + "step": 14472 + }, + { + "epoch": 1.8411143620404529, + "ewc_loss": 0.008025048300623894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025048009585589e-05, + "grad_norm": 10.523521423339844, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8762800693511963, + "num_tokens": 552011627.0, + "step": 14473 + }, + { + "epoch": 1.8412415723190434, + "ewc_loss": 0.011348269879817963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 0.00011348269617883489, + "grad_norm": 5.090199947357178, + "learning_rate": 1e-06, + "loss": 0.4283, + 
"mean_token_accuracy": 0.8573894500732422, + "num_tokens": 552045333.0, + "step": 14474 + }, + { + "epoch": 1.841368782597634, + "ewc_loss": 0.008772003464400768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.772003639023751e-05, + "grad_norm": 3.581841230392456, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8710135221481323, + "num_tokens": 552079974.0, + "step": 14475 + }, + { + "epoch": 1.8414959928762245, + "ewc_loss": 0.008273164741694927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.273164712591097e-05, + "grad_norm": 4.466869354248047, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8839899897575378, + "num_tokens": 552117507.0, + "step": 14476 + }, + { + "epoch": 1.841623203154815, + "ewc_loss": 0.009683001786470413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.68300155363977e-05, + "grad_norm": 4.166859149932861, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.87476646900177, + "num_tokens": 552156009.0, + "step": 14477 + }, + { + "epoch": 1.8417504134334055, + "ewc_loss": 0.008693420328199863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.693420386407524e-05, + "grad_norm": 4.150207042694092, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8846733570098877, + "num_tokens": 552189092.0, + "step": 14478 + }, + { + "epoch": 1.841877623711996, + "ewc_loss": 0.008711914531886578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.711914415471256e-05, + "grad_norm": 4.074409484863281, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8765460848808289, + "num_tokens": 552228305.0, + "step": 14479 + }, + { + "epoch": 1.8420048339905866, + "ewc_loss": 0.00869003962725401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.690039976499975e-05, + "grad_norm": 4.058661460876465, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.878606915473938, + "num_tokens": 552269402.0, + "step": 14480 + }, + { + "epoch": 
1.842132044269177, + "ewc_loss": 0.008599678054451942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.599678403697908e-05, + "grad_norm": 4.149377822875977, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8747493624687195, + "num_tokens": 552304870.0, + "step": 14481 + }, + { + "epoch": 1.8422592545477676, + "ewc_loss": 0.008611627854406834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611627708887681e-05, + "grad_norm": 4.11558723449707, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8732147216796875, + "num_tokens": 552336802.0, + "step": 14482 + }, + { + "epoch": 1.842386464826358, + "ewc_loss": 0.00852544791996479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525447628926486e-05, + "grad_norm": 4.08021354675293, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8893774151802063, + "num_tokens": 552376124.0, + "step": 14483 + }, + { + "epoch": 1.8425136751049485, + "ewc_loss": 0.008454021997749805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454021735815331e-05, + "grad_norm": 4.049119472503662, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8671700358390808, + "num_tokens": 552412285.0, + "step": 14484 + }, + { + "epoch": 1.842640885383539, + "ewc_loss": 0.008434628136456013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.434628398390487e-05, + "grad_norm": 4.0754899978637695, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8788833022117615, + "num_tokens": 552447527.0, + "step": 14485 + }, + { + "epoch": 1.8427680956621295, + "ewc_loss": 0.008410167880356312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410168084083125e-05, + "grad_norm": 4.013015270233154, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8795619010925293, + "num_tokens": 552486569.0, + "step": 14486 + }, + { + "epoch": 1.84289530594072, + "ewc_loss": 0.008342914283275604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.342914225067943e-05, + "grad_norm": 4.046510696411133, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8406343460083008, + "num_tokens": 552529158.0, + "step": 14487 + }, + { + "epoch": 1.8430225162193106, + "ewc_loss": 0.008354151621460915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354151214007288e-05, + "grad_norm": 3.9905266761779785, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8748754858970642, + "num_tokens": 552572183.0, + "step": 14488 + }, + { + "epoch": 1.8431497264979009, + "ewc_loss": 0.008280378766357899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28037882456556e-05, + "grad_norm": 4.004045009613037, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8739079833030701, + "num_tokens": 552609785.0, + "step": 14489 + }, + { + "epoch": 1.8432769367764914, + "ewc_loss": 0.008277536369860172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.277536107925698e-05, + "grad_norm": 4.025030612945557, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8832238912582397, + "num_tokens": 552641243.0, + "step": 14490 + }, + { + "epoch": 1.843404147055082, + "ewc_loss": 0.008262949995696545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.262949995696545e-05, + "grad_norm": 3.996575355529785, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.868965208530426, + "num_tokens": 552677595.0, + "step": 14491 + }, + { + "epoch": 1.8435313573336725, + "ewc_loss": 0.008214087225496769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.214086847146973e-05, + "grad_norm": 3.999502420425415, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8762252330780029, + "num_tokens": 552715492.0, + "step": 14492 + }, + { + "epoch": 1.843658567612263, + "ewc_loss": 0.008224815130233765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.224815246649086e-05, + "grad_norm": 3.9582302570343018, + "learning_rate": 1e-06, + "loss": 0.3594, + 
"mean_token_accuracy": 0.8752812743186951, + "num_tokens": 552757410.0, + "step": 14493 + }, + { + "epoch": 1.8437857778908535, + "ewc_loss": 0.00816817581653595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.168175554601476e-05, + "grad_norm": 3.9553635120391846, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.891279935836792, + "num_tokens": 552793794.0, + "step": 14494 + }, + { + "epoch": 1.843912988169444, + "ewc_loss": 0.008155457675457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.155457908287644e-05, + "grad_norm": 3.967115879058838, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8605728149414062, + "num_tokens": 552836010.0, + "step": 14495 + }, + { + "epoch": 1.8440401984480346, + "ewc_loss": 0.008160351775586605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.160351717378944e-05, + "grad_norm": 4.023988246917725, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8780366778373718, + "num_tokens": 552874136.0, + "step": 14496 + }, + { + "epoch": 1.844167408726625, + "ewc_loss": 0.008172696456313133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.172696107067168e-05, + "grad_norm": 3.9819979667663574, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.873896598815918, + "num_tokens": 552912314.0, + "step": 14497 + }, + { + "epoch": 1.8442946190052156, + "ewc_loss": 0.008105365559458733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.105365122901276e-05, + "grad_norm": 4.006002426147461, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8633438348770142, + "num_tokens": 552953998.0, + "step": 14498 + }, + { + "epoch": 1.8444218292838062, + "ewc_loss": 0.008129355497658253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.129355410346761e-05, + "grad_norm": 4.009678840637207, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8946348428726196, + "num_tokens": 552987310.0, + "step": 14499 + }, + { + "epoch": 
1.8445490395623967, + "ewc_loss": 0.008093616925179958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093616634141654e-05, + "grad_norm": 4.040477275848389, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.875529944896698, + "num_tokens": 553019830.0, + "step": 14500 + }, + { + "epoch": 1.8446762498409872, + "ewc_loss": 0.008121407590806484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.121407881844789e-05, + "grad_norm": 3.922924280166626, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8747477531433105, + "num_tokens": 553066695.0, + "step": 14501 + }, + { + "epoch": 1.8448034601195777, + "ewc_loss": 0.008026608265936375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02660797489807e-05, + "grad_norm": 4.063374042510986, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8683943152427673, + "num_tokens": 553098943.0, + "step": 14502 + }, + { + "epoch": 1.8449306703981683, + "ewc_loss": 0.008151519112288952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.151518704835325e-05, + "grad_norm": 3.975257635116577, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8799326419830322, + "num_tokens": 553136106.0, + "step": 14503 + }, + { + "epoch": 1.8450578806767588, + "ewc_loss": 0.008049910888075829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.049910684349015e-05, + "grad_norm": 3.9771578311920166, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8822348713874817, + "num_tokens": 553170014.0, + "step": 14504 + }, + { + "epoch": 1.8451850909553493, + "ewc_loss": 0.008080110885202885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080111001618207e-05, + "grad_norm": 3.9669289588928223, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8707228899002075, + "num_tokens": 553213297.0, + "step": 14505 + }, + { + "epoch": 1.8453123012339399, + "ewc_loss": 0.008057639002799988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.057639206526801e-05, + "grad_norm": 3.9991815090179443, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8785405158996582, + "num_tokens": 553248559.0, + "step": 14506 + }, + { + "epoch": 1.8454395115125302, + "ewc_loss": 0.008089276030659676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.089275797829032e-05, + "grad_norm": 3.9475882053375244, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8784984350204468, + "num_tokens": 553289140.0, + "step": 14507 + }, + { + "epoch": 1.8455667217911207, + "ewc_loss": 0.008046960458159447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.046960283536464e-05, + "grad_norm": 3.9729971885681152, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8694184422492981, + "num_tokens": 553330619.0, + "step": 14508 + }, + { + "epoch": 1.8456939320697112, + "ewc_loss": 0.008079534396529198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.079534745775163e-05, + "grad_norm": 3.9712493419647217, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8876782059669495, + "num_tokens": 553371295.0, + "step": 14509 + }, + { + "epoch": 1.8458211423483017, + "ewc_loss": 0.008058173581957817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.058173261815682e-05, + "grad_norm": 3.9986796379089355, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8725630044937134, + "num_tokens": 553410237.0, + "step": 14510 + }, + { + "epoch": 1.8459483526268923, + "ewc_loss": 0.008067579939961433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067579619819298e-05, + "grad_norm": 3.9994702339172363, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8835898637771606, + "num_tokens": 553446727.0, + "step": 14511 + }, + { + "epoch": 1.8460755629054828, + "ewc_loss": 0.00805480033159256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.054800127865747e-05, + "grad_norm": 3.947183132171631, + "learning_rate": 1e-06, + "loss": 0.3409, + 
"mean_token_accuracy": 0.8829370141029358, + "num_tokens": 553488626.0, + "step": 14512 + }, + { + "epoch": 1.846202773184073, + "ewc_loss": 0.008026734925806522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026734576560557e-05, + "grad_norm": 3.9818737506866455, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8694730997085571, + "num_tokens": 553528294.0, + "step": 14513 + }, + { + "epoch": 1.8463299834626636, + "ewc_loss": 0.00804225541651249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.042255649343133e-05, + "grad_norm": 3.9856693744659424, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8792855739593506, + "num_tokens": 553563738.0, + "step": 14514 + }, + { + "epoch": 1.8464571937412542, + "ewc_loss": 0.008042857982218266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.042858098633587e-05, + "grad_norm": 4.059817314147949, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8449865579605103, + "num_tokens": 553603351.0, + "step": 14515 + }, + { + "epoch": 1.8465844040198447, + "ewc_loss": 0.008074402809143066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.07440301286988e-05, + "grad_norm": 3.9902446269989014, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8737657070159912, + "num_tokens": 553643655.0, + "step": 14516 + }, + { + "epoch": 1.8467116142984352, + "ewc_loss": 0.00801035389304161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.010353485587984e-05, + "grad_norm": 4.000682353973389, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8870782256126404, + "num_tokens": 553677944.0, + "step": 14517 + }, + { + "epoch": 1.8468388245770258, + "ewc_loss": 0.008053367957472801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.053367491811514e-05, + "grad_norm": 4.022457122802734, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8715165257453918, + "num_tokens": 553712120.0, + "step": 14518 + }, + { + "epoch": 
1.8469660348556163, + "ewc_loss": 0.008056783117353916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.056783553911373e-05, + "grad_norm": 3.9524118900299072, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.880178689956665, + "num_tokens": 553751442.0, + "step": 14519 + }, + { + "epoch": 1.8470932451342068, + "ewc_loss": 0.007997831329703331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997831562533975e-05, + "grad_norm": 3.9941158294677734, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8819319009780884, + "num_tokens": 553782924.0, + "step": 14520 + }, + { + "epoch": 1.8472204554127973, + "ewc_loss": 0.008069786243140697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.06978641776368e-05, + "grad_norm": 4.003209114074707, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8680121302604675, + "num_tokens": 553818861.0, + "step": 14521 + }, + { + "epoch": 1.8473476656913879, + "ewc_loss": 0.008038700558245182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.038700616452843e-05, + "grad_norm": 3.9844377040863037, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8764970898628235, + "num_tokens": 553855593.0, + "step": 14522 + }, + { + "epoch": 1.8474748759699784, + "ewc_loss": 0.008032980374991894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.032980258576572e-05, + "grad_norm": 3.9574358463287354, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8761460781097412, + "num_tokens": 553896028.0, + "step": 14523 + }, + { + "epoch": 1.847602086248569, + "ewc_loss": 0.008041808381676674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.041808177949861e-05, + "grad_norm": 3.957688331604004, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8798967599868774, + "num_tokens": 553935212.0, + "step": 14524 + }, + { + "epoch": 1.8477292965271594, + "ewc_loss": 0.008035289123654366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.035288919927552e-05, + "grad_norm": 3.982755184173584, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8752691149711609, + "num_tokens": 553971231.0, + "step": 14525 + }, + { + "epoch": 1.84785650680575, + "ewc_loss": 0.008067772723734379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067773160291836e-05, + "grad_norm": 3.983210325241089, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8849020004272461, + "num_tokens": 554006782.0, + "step": 14526 + }, + { + "epoch": 1.8479837170843405, + "ewc_loss": 0.008058623410761356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.058623643592e-05, + "grad_norm": 4.024272441864014, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8686742782592773, + "num_tokens": 554043427.0, + "step": 14527 + }, + { + "epoch": 1.848110927362931, + "ewc_loss": 0.008084140717983246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.084141154540703e-05, + "grad_norm": 3.982668161392212, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8649836182594299, + "num_tokens": 554084187.0, + "step": 14528 + }, + { + "epoch": 1.8482381376415216, + "ewc_loss": 0.008041897788643837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.041897672228515e-05, + "grad_norm": 3.9915878772735596, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8768042325973511, + "num_tokens": 554122186.0, + "step": 14529 + }, + { + "epoch": 1.848365347920112, + "ewc_loss": 0.008066650480031967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066650480031967e-05, + "grad_norm": 3.9560773372650146, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8759399652481079, + "num_tokens": 554161388.0, + "step": 14530 + }, + { + "epoch": 1.8484925581987026, + "ewc_loss": 0.008039035834372044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039036038098857e-05, + "grad_norm": 3.9848906993865967, + "learning_rate": 1e-06, + "loss": 0.3923, + 
"mean_token_accuracy": 0.8692504167556763, + "num_tokens": 554199837.0, + "step": 14531 + }, + { + "epoch": 1.848619768477293, + "ewc_loss": 0.008085113018751144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.085113222477958e-05, + "grad_norm": 3.978203773498535, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8658093214035034, + "num_tokens": 554239567.0, + "step": 14532 + }, + { + "epoch": 1.8487469787558835, + "ewc_loss": 0.008057559840381145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.057559898588806e-05, + "grad_norm": 3.9699175357818604, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8793935775756836, + "num_tokens": 554277040.0, + "step": 14533 + }, + { + "epoch": 1.848874189034474, + "ewc_loss": 0.008062368258833885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.062367851380259e-05, + "grad_norm": 3.9504079818725586, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8808954954147339, + "num_tokens": 554318994.0, + "step": 14534 + }, + { + "epoch": 1.8490013993130645, + "ewc_loss": 0.008074509911239147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074509969446808e-05, + "grad_norm": 4.030431270599365, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8703309297561646, + "num_tokens": 554355334.0, + "step": 14535 + }, + { + "epoch": 1.849128609591655, + "ewc_loss": 0.008086678571999073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.086678280960768e-05, + "grad_norm": 3.99782133102417, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8653435707092285, + "num_tokens": 554396504.0, + "step": 14536 + }, + { + "epoch": 1.8492558198702456, + "ewc_loss": 0.008039453066885471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03945295047015e-05, + "grad_norm": 4.029347896575928, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8822526931762695, + "num_tokens": 554429772.0, + "step": 14537 + }, + { + "epoch": 
1.8493830301488359, + "ewc_loss": 0.008081583306193352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.081583655439317e-05, + "grad_norm": 3.959925413131714, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8777265548706055, + "num_tokens": 554470102.0, + "step": 14538 + }, + { + "epoch": 1.8495102404274264, + "ewc_loss": 0.008009218610823154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.009218436200172e-05, + "grad_norm": 3.9485981464385986, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8675066232681274, + "num_tokens": 554511632.0, + "step": 14539 + }, + { + "epoch": 1.849637450706017, + "ewc_loss": 0.00802590698003769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025906572584063e-05, + "grad_norm": 3.9625484943389893, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8714882731437683, + "num_tokens": 554553028.0, + "step": 14540 + }, + { + "epoch": 1.8497646609846075, + "ewc_loss": 0.008054978214204311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.054978388827294e-05, + "grad_norm": 4.110896110534668, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.869747519493103, + "num_tokens": 554585109.0, + "step": 14541 + }, + { + "epoch": 1.849891871263198, + "ewc_loss": 0.008122175931930542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.122176222968847e-05, + "grad_norm": 3.9939205646514893, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8776542544364929, + "num_tokens": 554621141.0, + "step": 14542 + }, + { + "epoch": 1.8500190815417885, + "ewc_loss": 0.007993904873728752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.9939047282096e-05, + "grad_norm": 3.9806551933288574, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8736852407455444, + "num_tokens": 554660838.0, + "step": 14543 + }, + { + "epoch": 1.850146291820379, + "ewc_loss": 0.00803424697369337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.034247002797201e-05, + "grad_norm": 4.01530647277832, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8780137896537781, + "num_tokens": 554696338.0, + "step": 14544 + }, + { + "epoch": 1.8502735020989696, + "ewc_loss": 0.008065221831202507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.065221481956542e-05, + "grad_norm": 4.015025615692139, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8871460556983948, + "num_tokens": 554729782.0, + "step": 14545 + }, + { + "epoch": 1.85040071237756, + "ewc_loss": 0.008036092855036259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03609291324392e-05, + "grad_norm": 3.9403932094573975, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8916141986846924, + "num_tokens": 554767566.0, + "step": 14546 + }, + { + "epoch": 1.8505279226561506, + "ewc_loss": 0.007984455674886703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.98445544205606e-05, + "grad_norm": 3.9748029708862305, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8815315961837769, + "num_tokens": 554806063.0, + "step": 14547 + }, + { + "epoch": 1.8506551329347412, + "ewc_loss": 0.00803489238023758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03489238023758e-05, + "grad_norm": 4.033538818359375, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8739113807678223, + "num_tokens": 554839418.0, + "step": 14548 + }, + { + "epoch": 1.8507823432133317, + "ewc_loss": 0.008050641976296902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050641918089241e-05, + "grad_norm": 3.9403512477874756, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8714653253555298, + "num_tokens": 554881888.0, + "step": 14549 + }, + { + "epoch": 1.8509095534919222, + "ewc_loss": 0.007984003983438015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.98400433268398e-05, + "grad_norm": 3.936917781829834, + "learning_rate": 1e-06, + "loss": 0.3095, + 
"mean_token_accuracy": 0.8937209248542786, + "num_tokens": 554921561.0, + "step": 14550 + }, + { + "epoch": 1.8510367637705127, + "ewc_loss": 0.008028759621083736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.028759475564584e-05, + "grad_norm": 4.060068130493164, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8530359268188477, + "num_tokens": 554954380.0, + "step": 14551 + }, + { + "epoch": 1.8511639740491033, + "ewc_loss": 0.008091108873486519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.091108611552045e-05, + "grad_norm": 3.9494941234588623, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8769504427909851, + "num_tokens": 554994781.0, + "step": 14552 + }, + { + "epoch": 1.8512911843276938, + "ewc_loss": 0.007978598587214947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.978599023772404e-05, + "grad_norm": 4.031245708465576, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8864590525627136, + "num_tokens": 555024938.0, + "step": 14553 + }, + { + "epoch": 1.8514183946062843, + "ewc_loss": 0.00807740818709135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.077407983364537e-05, + "grad_norm": 4.02923059463501, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8641421794891357, + "num_tokens": 555059785.0, + "step": 14554 + }, + { + "epoch": 1.8515456048848749, + "ewc_loss": 0.00805942714214325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059426909312606e-05, + "grad_norm": 4.0352091789245605, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8513921499252319, + "num_tokens": 555097502.0, + "step": 14555 + }, + { + "epoch": 1.8516728151634652, + "ewc_loss": 0.008061090484261513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061090193223208e-05, + "grad_norm": 3.9328274726867676, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8755437135696411, + "num_tokens": 555139976.0, + "step": 14556 + }, + { + "epoch": 
1.8518000254420557, + "ewc_loss": 0.008006855845451355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.006855932762846e-05, + "grad_norm": 3.9586615562438965, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8785605430603027, + "num_tokens": 555181250.0, + "step": 14557 + }, + { + "epoch": 1.8519272357206462, + "ewc_loss": 0.008064365945756435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.064365829341114e-05, + "grad_norm": 4.060182094573975, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8572511672973633, + "num_tokens": 555214927.0, + "step": 14558 + }, + { + "epoch": 1.8520544459992367, + "ewc_loss": 0.008113785646855831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.11378558864817e-05, + "grad_norm": 3.9559361934661865, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8726730942726135, + "num_tokens": 555255795.0, + "step": 14559 + }, + { + "epoch": 1.8521816562778273, + "ewc_loss": 0.008012425154447556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.012424950720742e-05, + "grad_norm": 3.9590516090393066, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8791910409927368, + "num_tokens": 555294177.0, + "step": 14560 + }, + { + "epoch": 1.8523088665564178, + "ewc_loss": 0.008069310337305069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069310570135713e-05, + "grad_norm": 3.9564507007598877, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8876655101776123, + "num_tokens": 555331548.0, + "step": 14561 + }, + { + "epoch": 1.852436076835008, + "ewc_loss": 0.008052170276641846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.052170596783981e-05, + "grad_norm": 3.9920361042022705, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8889874219894409, + "num_tokens": 555368426.0, + "step": 14562 + }, + { + "epoch": 1.8525632871135986, + "ewc_loss": 0.008069637231528759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.06963726063259e-05, + "grad_norm": 3.9432387351989746, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8935730457305908, + "num_tokens": 555411594.0, + "step": 14563 + }, + { + "epoch": 1.8526904973921892, + "ewc_loss": 0.00804011058062315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.040110697038472e-05, + "grad_norm": 3.995615005493164, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8688082695007324, + "num_tokens": 555450412.0, + "step": 14564 + }, + { + "epoch": 1.8528177076707797, + "ewc_loss": 0.008067220449447632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067220187513158e-05, + "grad_norm": 3.964503288269043, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8845879435539246, + "num_tokens": 555489067.0, + "step": 14565 + }, + { + "epoch": 1.8529449179493702, + "ewc_loss": 0.008021998219192028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.021998655749485e-05, + "grad_norm": 3.9922096729278564, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8827919960021973, + "num_tokens": 555524552.0, + "step": 14566 + }, + { + "epoch": 1.8530721282279607, + "ewc_loss": 0.00805213674902916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.052137127378955e-05, + "grad_norm": 4.036422252655029, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8492114543914795, + "num_tokens": 555563766.0, + "step": 14567 + }, + { + "epoch": 1.8531993385065513, + "ewc_loss": 0.008043534122407436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.043534035095945e-05, + "grad_norm": 3.961549758911133, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8833310604095459, + "num_tokens": 555601410.0, + "step": 14568 + }, + { + "epoch": 1.8533265487851418, + "ewc_loss": 0.007996548898518085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.996548811206594e-05, + "grad_norm": 3.9779911041259766, + "learning_rate": 1e-06, + "loss": 0.3536, + 
"mean_token_accuracy": 0.8761045932769775, + "num_tokens": 555641346.0, + "step": 14569 + }, + { + "epoch": 1.8534537590637323, + "ewc_loss": 0.008031659759581089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031659672269598e-05, + "grad_norm": 4.030872821807861, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8853121995925903, + "num_tokens": 555676537.0, + "step": 14570 + }, + { + "epoch": 1.8535809693423229, + "ewc_loss": 0.008063012734055519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.063012501224875e-05, + "grad_norm": 3.9647414684295654, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8624337315559387, + "num_tokens": 555721634.0, + "step": 14571 + }, + { + "epoch": 1.8537081796209134, + "ewc_loss": 0.007984848693013191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.984848343767226e-05, + "grad_norm": 3.962305784225464, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.875252902507782, + "num_tokens": 555765447.0, + "step": 14572 + }, + { + "epoch": 1.853835389899504, + "ewc_loss": 0.008007030934095383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007030555745587e-05, + "grad_norm": 4.068015098571777, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8763062357902527, + "num_tokens": 555798598.0, + "step": 14573 + }, + { + "epoch": 1.8539626001780944, + "ewc_loss": 0.008077085949480534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.077085658442229e-05, + "grad_norm": 4.042665004730225, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8849896192550659, + "num_tokens": 555829465.0, + "step": 14574 + }, + { + "epoch": 1.854089810456685, + "ewc_loss": 0.008009440265595913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.009440352907404e-05, + "grad_norm": 3.956515312194824, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8783190250396729, + "num_tokens": 555866577.0, + "step": 14575 + }, + { + "epoch": 
1.8542170207352755, + "ewc_loss": 0.007994248531758785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.994248153408989e-05, + "grad_norm": 4.041553497314453, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8795238733291626, + "num_tokens": 555900472.0, + "step": 14576 + }, + { + "epoch": 1.854344231013866, + "ewc_loss": 0.008056823164224625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.056823571678251e-05, + "grad_norm": 3.9843227863311768, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8718129396438599, + "num_tokens": 555936356.0, + "step": 14577 + }, + { + "epoch": 1.8544714412924566, + "ewc_loss": 0.008007105439901352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007105498109013e-05, + "grad_norm": 4.0232954025268555, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8690889477729797, + "num_tokens": 555975338.0, + "step": 14578 + }, + { + "epoch": 1.854598651571047, + "ewc_loss": 0.008041813969612122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.041813998715952e-05, + "grad_norm": 3.991166830062866, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8659391403198242, + "num_tokens": 556019651.0, + "step": 14579 + }, + { + "epoch": 1.8547258618496376, + "ewc_loss": 0.00800412055104971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.004120900295675e-05, + "grad_norm": 4.034398555755615, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.872135579586029, + "num_tokens": 556054530.0, + "step": 14580 + }, + { + "epoch": 1.854853072128228, + "ewc_loss": 0.008057438768446445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.057439117692411e-05, + "grad_norm": 4.002301216125488, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8709020614624023, + "num_tokens": 556091985.0, + "step": 14581 + }, + { + "epoch": 1.8549802824068184, + "ewc_loss": 0.008028104901313782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.028104639379308e-05, + "grad_norm": 4.006346702575684, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8796030282974243, + "num_tokens": 556128777.0, + "step": 14582 + }, + { + "epoch": 1.855107492685409, + "ewc_loss": 0.008036069571971893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.036069630179554e-05, + "grad_norm": 3.9759581089019775, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8953094482421875, + "num_tokens": 556164286.0, + "step": 14583 + }, + { + "epoch": 1.8552347029639995, + "ewc_loss": 0.008027458563446999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.027458534343168e-05, + "grad_norm": 3.996666669845581, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8844456672668457, + "num_tokens": 556199141.0, + "step": 14584 + }, + { + "epoch": 1.85536191324259, + "ewc_loss": 0.008052685298025608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.052685006987303e-05, + "grad_norm": 3.994741201400757, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8814387321472168, + "num_tokens": 556237104.0, + "step": 14585 + }, + { + "epoch": 1.8554891235211806, + "ewc_loss": 0.008052433840930462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.052433986449614e-05, + "grad_norm": 3.993770122528076, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8659994602203369, + "num_tokens": 556275343.0, + "step": 14586 + }, + { + "epoch": 1.8556163337997709, + "ewc_loss": 0.008044579066336155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.044578862609342e-05, + "grad_norm": 3.9445583820343018, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8910247087478638, + "num_tokens": 556316539.0, + "step": 14587 + }, + { + "epoch": 1.8557435440783614, + "ewc_loss": 0.008027014322578907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.027013973332942e-05, + "grad_norm": 4.0140557289123535, + "learning_rate": 1e-06, + "loss": 0.3201, + 
"mean_token_accuracy": 0.8890303373336792, + "num_tokens": 556351101.0, + "step": 14588 + }, + { + "epoch": 1.855870754356952, + "ewc_loss": 0.008099258877336979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.099259139271453e-05, + "grad_norm": 4.005552768707275, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.889008641242981, + "num_tokens": 556382458.0, + "step": 14589 + }, + { + "epoch": 1.8559979646355425, + "ewc_loss": 0.008041531778872013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.041531691560522e-05, + "grad_norm": 3.999082088470459, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8792877197265625, + "num_tokens": 556416980.0, + "step": 14590 + }, + { + "epoch": 1.856125174914133, + "ewc_loss": 0.008063171058893204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.063171117100865e-05, + "grad_norm": 4.087233543395996, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.882366955280304, + "num_tokens": 556451868.0, + "step": 14591 + }, + { + "epoch": 1.8562523851927235, + "ewc_loss": 0.008103998377919197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.103997970465571e-05, + "grad_norm": 4.023382186889648, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8658640384674072, + "num_tokens": 556485682.0, + "step": 14592 + }, + { + "epoch": 1.856379595471314, + "ewc_loss": 0.008035300299525261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035299833863974e-05, + "grad_norm": 3.9655590057373047, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8875124454498291, + "num_tokens": 556524857.0, + "step": 14593 + }, + { + "epoch": 1.8565068057499046, + "ewc_loss": 0.008042951114475727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.042951230891049e-05, + "grad_norm": 3.994643211364746, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8803114891052246, + "num_tokens": 556565763.0, + "step": 14594 + }, + { + "epoch": 
1.856634016028495, + "ewc_loss": 0.00807233527302742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.072335185715929e-05, + "grad_norm": 3.9888322353363037, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.864606499671936, + "num_tokens": 556606202.0, + "step": 14595 + }, + { + "epoch": 1.8567612263070856, + "ewc_loss": 0.008061987347900867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061987318797037e-05, + "grad_norm": 3.968120813369751, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8849148154258728, + "num_tokens": 556646906.0, + "step": 14596 + }, + { + "epoch": 1.8568884365856761, + "ewc_loss": 0.008053204044699669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.053204510360956e-05, + "grad_norm": 3.9974722862243652, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8785967230796814, + "num_tokens": 556684249.0, + "step": 14597 + }, + { + "epoch": 1.8570156468642667, + "ewc_loss": 0.008071372285485268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.071372576523572e-05, + "grad_norm": 3.94677996635437, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8674669861793518, + "num_tokens": 556726840.0, + "step": 14598 + }, + { + "epoch": 1.8571428571428572, + "ewc_loss": 0.008014007471501827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.014007471501827e-05, + "grad_norm": 3.959031820297241, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8796220421791077, + "num_tokens": 556764822.0, + "step": 14599 + }, + { + "epoch": 1.8572700674214477, + "ewc_loss": 0.008060986176133156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060986147029325e-05, + "grad_norm": 4.022359848022461, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8743892908096313, + "num_tokens": 556800884.0, + "step": 14600 + }, + { + "epoch": 1.8573972777000383, + "ewc_loss": 0.008067593909800053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.067594171734527e-05, + "grad_norm": 3.94372296333313, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.880944013595581, + "num_tokens": 556841233.0, + "step": 14601 + }, + { + "epoch": 1.8575244879786288, + "ewc_loss": 0.007999498397111893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.999498484423384e-05, + "grad_norm": 3.9935860633850098, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8693755865097046, + "num_tokens": 556880995.0, + "step": 14602 + }, + { + "epoch": 1.8576516982572193, + "ewc_loss": 0.008059194311499596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059194078668952e-05, + "grad_norm": 4.0145978927612305, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8816586136817932, + "num_tokens": 556914367.0, + "step": 14603 + }, + { + "epoch": 1.8577789085358098, + "ewc_loss": 0.008050067350268364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.05006711743772e-05, + "grad_norm": 3.9976108074188232, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8582676649093628, + "num_tokens": 556955944.0, + "step": 14604 + }, + { + "epoch": 1.8579061188144002, + "ewc_loss": 0.008034458383917809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.034458733163774e-05, + "grad_norm": 4.077751159667969, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8594059348106384, + "num_tokens": 556989851.0, + "step": 14605 + }, + { + "epoch": 1.8580333290929907, + "ewc_loss": 0.008074138313531876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074138168012723e-05, + "grad_norm": 3.9290413856506348, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8774373531341553, + "num_tokens": 557033662.0, + "step": 14606 + }, + { + "epoch": 1.8581605393715812, + "ewc_loss": 0.00798275787383318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.982757961144671e-05, + "grad_norm": 3.970921516418457, + "learning_rate": 1e-06, + "loss": 0.3777, + 
"mean_token_accuracy": 0.8693366050720215, + "num_tokens": 557074869.0, + "step": 14607 + }, + { + "epoch": 1.8582877496501717, + "ewc_loss": 0.008048908784985542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.048908784985542e-05, + "grad_norm": 4.007850170135498, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.866642415523529, + "num_tokens": 557110164.0, + "step": 14608 + }, + { + "epoch": 1.8584149599287623, + "ewc_loss": 0.00805818010121584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.058179810177535e-05, + "grad_norm": 3.9789555072784424, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8764760494232178, + "num_tokens": 557149309.0, + "step": 14609 + }, + { + "epoch": 1.8585421702073528, + "ewc_loss": 0.0080194640904665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.019463712116703e-05, + "grad_norm": 4.074189186096191, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8713411092758179, + "num_tokens": 557180510.0, + "step": 14610 + }, + { + "epoch": 1.858669380485943, + "ewc_loss": 0.008083609864115715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.083610009634867e-05, + "grad_norm": 3.97430157661438, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8823448419570923, + "num_tokens": 557218062.0, + "step": 14611 + }, + { + "epoch": 1.8587965907645336, + "ewc_loss": 0.00800898764282465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.008987788343802e-05, + "grad_norm": 3.9615228176116943, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8679808378219604, + "num_tokens": 557258963.0, + "step": 14612 + }, + { + "epoch": 1.8589238010431242, + "ewc_loss": 0.008045374415814877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.045374852372333e-05, + "grad_norm": 4.019308567047119, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8773332834243774, + "num_tokens": 557296046.0, + "step": 14613 + }, + { + "epoch": 
1.8590510113217147, + "ewc_loss": 0.008069063536822796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069063915172592e-05, + "grad_norm": 3.9989688396453857, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8733965158462524, + "num_tokens": 557328395.0, + "step": 14614 + }, + { + "epoch": 1.8591782216003052, + "ewc_loss": 0.008039320819079876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039321255637333e-05, + "grad_norm": 3.970076560974121, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8789405822753906, + "num_tokens": 557363888.0, + "step": 14615 + }, + { + "epoch": 1.8593054318788957, + "ewc_loss": 0.008051013574004173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.051013719523326e-05, + "grad_norm": 3.9740374088287354, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8689284920692444, + "num_tokens": 557402944.0, + "step": 14616 + }, + { + "epoch": 1.8594326421574863, + "ewc_loss": 0.00806994829326868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069948671618477e-05, + "grad_norm": 3.951916217803955, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8831695318222046, + "num_tokens": 557442247.0, + "step": 14617 + }, + { + "epoch": 1.8595598524360768, + "ewc_loss": 0.008069640025496483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069640171015635e-05, + "grad_norm": 3.9689126014709473, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8706285953521729, + "num_tokens": 557483580.0, + "step": 14618 + }, + { + "epoch": 1.8596870627146673, + "ewc_loss": 0.008078783750534058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.07878386694938e-05, + "grad_norm": 4.001863956451416, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8804582357406616, + "num_tokens": 557520134.0, + "step": 14619 + }, + { + "epoch": 1.8598142729932579, + "ewc_loss": 0.00807721633464098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.077216625679284e-05, + "grad_norm": 3.93988037109375, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8719617128372192, + "num_tokens": 557565726.0, + "step": 14620 + }, + { + "epoch": 1.8599414832718484, + "ewc_loss": 0.008033962920308113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.033962512854487e-05, + "grad_norm": 3.971884250640869, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8759210109710693, + "num_tokens": 557602871.0, + "step": 14621 + }, + { + "epoch": 1.860068693550439, + "ewc_loss": 0.008071305230259895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.071305637713522e-05, + "grad_norm": 3.970761775970459, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8674881458282471, + "num_tokens": 557640873.0, + "step": 14622 + }, + { + "epoch": 1.8601959038290294, + "ewc_loss": 0.00805387832224369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.05387826403603e-05, + "grad_norm": 3.9861295223236084, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.869707465171814, + "num_tokens": 557684233.0, + "step": 14623 + }, + { + "epoch": 1.86032311410762, + "ewc_loss": 0.00803464837372303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.034647908061743e-05, + "grad_norm": 4.006419658660889, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8797513842582703, + "num_tokens": 557716895.0, + "step": 14624 + }, + { + "epoch": 1.8604503243862105, + "ewc_loss": 0.008062317036092281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.06231691967696e-05, + "grad_norm": 3.9874746799468994, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8720121383666992, + "num_tokens": 557758197.0, + "step": 14625 + }, + { + "epoch": 1.860577534664801, + "ewc_loss": 0.008040674962103367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.040675311349332e-05, + "grad_norm": 4.0384602546691895, + "learning_rate": 1e-06, + "loss": 0.4563, + 
"mean_token_accuracy": 0.8504256010055542, + "num_tokens": 557794196.0, + "step": 14626 + }, + { + "epoch": 1.8607047449433916, + "ewc_loss": 0.008073773235082626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.073773642536253e-05, + "grad_norm": 3.9817426204681396, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8609886169433594, + "num_tokens": 557833856.0, + "step": 14627 + }, + { + "epoch": 1.860831955221982, + "ewc_loss": 0.008021285757422447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.021285611903295e-05, + "grad_norm": 4.060702800750732, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.865028440952301, + "num_tokens": 557866304.0, + "step": 14628 + }, + { + "epoch": 1.8609591655005726, + "ewc_loss": 0.008085401728749275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.085402077995241e-05, + "grad_norm": 3.9284377098083496, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8897689580917358, + "num_tokens": 557907065.0, + "step": 14629 + }, + { + "epoch": 1.861086375779163, + "ewc_loss": 0.007991168648004532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.991168240550905e-05, + "grad_norm": 3.9700872898101807, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8733874559402466, + "num_tokens": 557947326.0, + "step": 14630 + }, + { + "epoch": 1.8612135860577534, + "ewc_loss": 0.008061551488935947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061551488935947e-05, + "grad_norm": 3.9593727588653564, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8750435709953308, + "num_tokens": 557990059.0, + "step": 14631 + }, + { + "epoch": 1.861340796336344, + "ewc_loss": 0.008052639663219452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.052639896050096e-05, + "grad_norm": 4.0052313804626465, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8778064846992493, + "num_tokens": 558028406.0, + "step": 14632 + }, + { + "epoch": 
1.8614680066149345, + "ewc_loss": 0.008056689985096455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.05668969405815e-05, + "grad_norm": 3.9770071506500244, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.870872974395752, + "num_tokens": 558070239.0, + "step": 14633 + }, + { + "epoch": 1.861595216893525, + "ewc_loss": 0.008045503869652748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.045503636822104e-05, + "grad_norm": 4.016087532043457, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8805516362190247, + "num_tokens": 558107685.0, + "step": 14634 + }, + { + "epoch": 1.8617224271721156, + "ewc_loss": 0.008064323104918003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.06432290119119e-05, + "grad_norm": 4.013378143310547, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8706678748130798, + "num_tokens": 558143185.0, + "step": 14635 + }, + { + "epoch": 1.8618496374507059, + "ewc_loss": 0.008043925277888775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.043925481615588e-05, + "grad_norm": 4.014467239379883, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8835067749023438, + "num_tokens": 558176207.0, + "step": 14636 + }, + { + "epoch": 1.8619768477292964, + "ewc_loss": 0.008050594478845596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050594624364749e-05, + "grad_norm": 4.000320911407471, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8919448852539062, + "num_tokens": 558212215.0, + "step": 14637 + }, + { + "epoch": 1.862104058007887, + "ewc_loss": 0.00804201140999794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.042011177167296e-05, + "grad_norm": 4.026782512664795, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8852341175079346, + "num_tokens": 558248471.0, + "step": 14638 + }, + { + "epoch": 1.8622312682864774, + "ewc_loss": 0.008056456223130226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.056456135818735e-05, + "grad_norm": 4.005336284637451, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8647247552871704, + "num_tokens": 558285211.0, + "step": 14639 + }, + { + "epoch": 1.862358478565068, + "ewc_loss": 0.008035040460526943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035040082177147e-05, + "grad_norm": 4.139545917510986, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8705355525016785, + "num_tokens": 558314434.0, + "step": 14640 + }, + { + "epoch": 1.8624856888436585, + "ewc_loss": 0.00811355747282505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.113557851174846e-05, + "grad_norm": 3.981017827987671, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8815910220146179, + "num_tokens": 558353710.0, + "step": 14641 + }, + { + "epoch": 1.862612899122249, + "ewc_loss": 0.007987438701093197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.987438584677875e-05, + "grad_norm": 4.108548164367676, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8640638589859009, + "num_tokens": 558390476.0, + "step": 14642 + }, + { + "epoch": 1.8627401094008396, + "ewc_loss": 0.008139647543430328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.139647252392024e-05, + "grad_norm": 3.973039150238037, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8849933743476868, + "num_tokens": 558424917.0, + "step": 14643 + }, + { + "epoch": 1.86286731967943, + "ewc_loss": 0.008015336468815804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.015336788957939e-05, + "grad_norm": 4.011881351470947, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8708090782165527, + "num_tokens": 558459831.0, + "step": 14644 + }, + { + "epoch": 1.8629945299580206, + "ewc_loss": 0.008085085079073906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.085084846243262e-05, + "grad_norm": 3.9286484718322754, + "learning_rate": 1e-06, + "loss": 0.3165, + 
"mean_token_accuracy": 0.8906912803649902, + "num_tokens": 558498600.0, + "step": 14645 + }, + { + "epoch": 1.8631217402366111, + "ewc_loss": 0.008042111061513424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.042110857786611e-05, + "grad_norm": 3.9709279537200928, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8671388626098633, + "num_tokens": 558537465.0, + "step": 14646 + }, + { + "epoch": 1.8632489505152017, + "ewc_loss": 0.008115950040519238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.11595018603839e-05, + "grad_norm": 3.9586613178253174, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8950332403182983, + "num_tokens": 558575357.0, + "step": 14647 + }, + { + "epoch": 1.8633761607937922, + "ewc_loss": 0.008077492006123066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.077492384472862e-05, + "grad_norm": 3.9485974311828613, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8743591904640198, + "num_tokens": 558619422.0, + "step": 14648 + }, + { + "epoch": 1.8635033710723827, + "ewc_loss": 0.00808488205075264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.084881847025827e-05, + "grad_norm": 3.987636089324951, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8675109148025513, + "num_tokens": 558657677.0, + "step": 14649 + }, + { + "epoch": 1.8636305813509733, + "ewc_loss": 0.00809907354414463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.099073602352291e-05, + "grad_norm": 3.9624104499816895, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8836784362792969, + "num_tokens": 558692912.0, + "step": 14650 + }, + { + "epoch": 1.8637577916295638, + "ewc_loss": 0.008081428706645966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.081428677542135e-05, + "grad_norm": 3.973083257675171, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8873774409294128, + "num_tokens": 558733470.0, + "step": 14651 + }, + { + "epoch": 
1.8638850019081543, + "ewc_loss": 0.008086576126515865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.086575689958408e-05, + "grad_norm": 3.9941771030426025, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8729950189590454, + "num_tokens": 558771899.0, + "step": 14652 + }, + { + "epoch": 1.8640122121867448, + "ewc_loss": 0.008089998736977577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08999830042012e-05, + "grad_norm": 3.9643054008483887, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8825864791870117, + "num_tokens": 558813002.0, + "step": 14653 + }, + { + "epoch": 1.8641394224653351, + "ewc_loss": 0.008052324876189232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.052324847085401e-05, + "grad_norm": 4.020544528961182, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8611472249031067, + "num_tokens": 558850827.0, + "step": 14654 + }, + { + "epoch": 1.8642666327439257, + "ewc_loss": 0.008085373789072037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.085373701760545e-05, + "grad_norm": 3.9989054203033447, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8737602233886719, + "num_tokens": 558889003.0, + "step": 14655 + }, + { + "epoch": 1.8643938430225162, + "ewc_loss": 0.008039098232984543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039098611334339e-05, + "grad_norm": 4.013530731201172, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8831599950790405, + "num_tokens": 558925849.0, + "step": 14656 + }, + { + "epoch": 1.8645210533011067, + "ewc_loss": 0.00804612971842289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.046129369176924e-05, + "grad_norm": 3.955089807510376, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8760122656822205, + "num_tokens": 558967471.0, + "step": 14657 + }, + { + "epoch": 1.8646482635796973, + "ewc_loss": 0.008017933927476406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.017934305826202e-05, + "grad_norm": 4.011325359344482, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8796778917312622, + "num_tokens": 559001938.0, + "step": 14658 + }, + { + "epoch": 1.8647754738582878, + "ewc_loss": 0.008059771731495857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059771789703518e-05, + "grad_norm": 3.968261957168579, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8814723491668701, + "num_tokens": 559040515.0, + "step": 14659 + }, + { + "epoch": 1.864902684136878, + "ewc_loss": 0.008017449639737606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.017449727049097e-05, + "grad_norm": 3.9685158729553223, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8866817951202393, + "num_tokens": 559077702.0, + "step": 14660 + }, + { + "epoch": 1.8650298944154686, + "ewc_loss": 0.008043169975280762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.043169509619474e-05, + "grad_norm": 4.014120101928711, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8688645362854004, + "num_tokens": 559114741.0, + "step": 14661 + }, + { + "epoch": 1.8651571046940592, + "ewc_loss": 0.008071154356002808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.071154297795147e-05, + "grad_norm": 4.010969638824463, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8558505773544312, + "num_tokens": 559155844.0, + "step": 14662 + }, + { + "epoch": 1.8652843149726497, + "ewc_loss": 0.008039855398237705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039855310926214e-05, + "grad_norm": 3.966325283050537, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8776452541351318, + "num_tokens": 559194964.0, + "step": 14663 + }, + { + "epoch": 1.8654115252512402, + "ewc_loss": 0.008023629896342754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023629925446585e-05, + "grad_norm": 3.967775583267212, + "learning_rate": 1e-06, + "loss": 0.3378, + 
"mean_token_accuracy": 0.8826577067375183, + "num_tokens": 559230083.0, + "step": 14664 + }, + { + "epoch": 1.8655387355298307, + "ewc_loss": 0.008049318566918373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.049318421399221e-05, + "grad_norm": 3.9835760593414307, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.887332558631897, + "num_tokens": 559268397.0, + "step": 14665 + }, + { + "epoch": 1.8656659458084213, + "ewc_loss": 0.008051436394453049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.05143645266071e-05, + "grad_norm": 4.049382209777832, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8674631118774414, + "num_tokens": 559302024.0, + "step": 14666 + }, + { + "epoch": 1.8657931560870118, + "ewc_loss": 0.008100339211523533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.100338891381398e-05, + "grad_norm": 4.015746593475342, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.874984622001648, + "num_tokens": 559343192.0, + "step": 14667 + }, + { + "epoch": 1.8659203663656023, + "ewc_loss": 0.008047970943152905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.047970914049074e-05, + "grad_norm": 3.9223079681396484, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8976256251335144, + "num_tokens": 559380616.0, + "step": 14668 + }, + { + "epoch": 1.8660475766441929, + "ewc_loss": 0.008029358461499214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.029358286876231e-05, + "grad_norm": 4.0405402183532715, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8858168721199036, + "num_tokens": 559413621.0, + "step": 14669 + }, + { + "epoch": 1.8661747869227834, + "ewc_loss": 0.008113596588373184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.113596413750201e-05, + "grad_norm": 3.980677843093872, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8843474984169006, + "num_tokens": 559447274.0, + "step": 14670 + }, + { + "epoch": 
1.866301997201374, + "ewc_loss": 0.00805733259767294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.057332888711244e-05, + "grad_norm": 3.9677536487579346, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8848903179168701, + "num_tokens": 559484306.0, + "step": 14671 + }, + { + "epoch": 1.8664292074799644, + "ewc_loss": 0.008067797869443893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067797898547724e-05, + "grad_norm": 3.932537078857422, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8817856907844543, + "num_tokens": 559527126.0, + "step": 14672 + }, + { + "epoch": 1.866556417758555, + "ewc_loss": 0.008045122027397156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.045122376643121e-05, + "grad_norm": 3.913530111312866, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8802692890167236, + "num_tokens": 559568342.0, + "step": 14673 + }, + { + "epoch": 1.8666836280371455, + "ewc_loss": 0.00805656798183918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.056568185565993e-05, + "grad_norm": 3.9879820346832275, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8742969632148743, + "num_tokens": 559609597.0, + "step": 14674 + }, + { + "epoch": 1.866810838315736, + "ewc_loss": 0.008085940033197403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08594049885869e-05, + "grad_norm": 3.9986391067504883, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8752611875534058, + "num_tokens": 559646392.0, + "step": 14675 + }, + { + "epoch": 1.8669380485943265, + "ewc_loss": 0.008058140985667706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.05814124760218e-05, + "grad_norm": 4.001251220703125, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8796648979187012, + "num_tokens": 559684836.0, + "step": 14676 + }, + { + "epoch": 1.867065258872917, + "ewc_loss": 0.008058474399149418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.058474486460909e-05, + "grad_norm": 3.953676462173462, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8807232975959778, + "num_tokens": 559727793.0, + "step": 14677 + }, + { + "epoch": 1.8671924691515076, + "ewc_loss": 0.008034149184823036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.034149504965171e-05, + "grad_norm": 3.9747235774993896, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8646590709686279, + "num_tokens": 559768154.0, + "step": 14678 + }, + { + "epoch": 1.867319679430098, + "ewc_loss": 0.008044403977692127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.044404239626601e-05, + "grad_norm": 4.013216018676758, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8774252533912659, + "num_tokens": 559798732.0, + "step": 14679 + }, + { + "epoch": 1.8674468897086884, + "ewc_loss": 0.008042988367378712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.042988338274881e-05, + "grad_norm": 3.9934184551239014, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8748665452003479, + "num_tokens": 559842796.0, + "step": 14680 + }, + { + "epoch": 1.867574099987279, + "ewc_loss": 0.008004463277757168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.004463597899303e-05, + "grad_norm": 3.932229995727539, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8734636306762695, + "num_tokens": 559884198.0, + "step": 14681 + }, + { + "epoch": 1.8677013102658695, + "ewc_loss": 0.007985175587236881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.985175761859864e-05, + "grad_norm": 4.026848316192627, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8619909286499023, + "num_tokens": 559916749.0, + "step": 14682 + }, + { + "epoch": 1.86782852054446, + "ewc_loss": 0.008071374148130417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.071374031715095e-05, + "grad_norm": 3.967411756515503, + "learning_rate": 1e-06, + "loss": 0.3805, + 
"mean_token_accuracy": 0.8724132776260376, + "num_tokens": 559958430.0, + "step": 14683 + }, + { + "epoch": 1.8679557308230506, + "ewc_loss": 0.007990087382495403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.990087760845199e-05, + "grad_norm": 4.005077362060547, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.893994152545929, + "num_tokens": 559995720.0, + "step": 14684 + }, + { + "epoch": 1.8680829411016409, + "ewc_loss": 0.00803926307708025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03926304797642e-05, + "grad_norm": 3.9785656929016113, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8808417320251465, + "num_tokens": 560033304.0, + "step": 14685 + }, + { + "epoch": 1.8682101513802314, + "ewc_loss": 0.008015945553779602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.015945786610246e-05, + "grad_norm": 4.025371551513672, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8731131553649902, + "num_tokens": 560069879.0, + "step": 14686 + }, + { + "epoch": 1.868337361658822, + "ewc_loss": 0.00804197695106268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.04197698016651e-05, + "grad_norm": 4.035251617431641, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8737852573394775, + "num_tokens": 560106853.0, + "step": 14687 + }, + { + "epoch": 1.8684645719374124, + "ewc_loss": 0.008024024777114391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.024025009945035e-05, + "grad_norm": 4.029687404632568, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8785107135772705, + "num_tokens": 560144225.0, + "step": 14688 + }, + { + "epoch": 1.868591782216003, + "ewc_loss": 0.008001737296581268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.001737296581268e-05, + "grad_norm": 3.963991641998291, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8697659969329834, + "num_tokens": 560186520.0, + "step": 14689 + }, + { + "epoch": 
1.8687189924945935, + "ewc_loss": 0.007988658733665943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.988658762769774e-05, + "grad_norm": 4.0106353759765625, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8793516755104065, + "num_tokens": 560225458.0, + "step": 14690 + }, + { + "epoch": 1.868846202773184, + "ewc_loss": 0.008050160482525826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050160249695182e-05, + "grad_norm": 3.9994029998779297, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.880479097366333, + "num_tokens": 560263401.0, + "step": 14691 + }, + { + "epoch": 1.8689734130517746, + "ewc_loss": 0.008009728975594044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.009729208424687e-05, + "grad_norm": 3.942570686340332, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8734542727470398, + "num_tokens": 560305026.0, + "step": 14692 + }, + { + "epoch": 1.869100623330365, + "ewc_loss": 0.007981906644999981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.981906674103811e-05, + "grad_norm": 4.010544776916504, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8461491465568542, + "num_tokens": 560354823.0, + "step": 14693 + }, + { + "epoch": 1.8692278336089556, + "ewc_loss": 0.008054029196500778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.054029603954405e-05, + "grad_norm": 4.002158164978027, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8920917510986328, + "num_tokens": 560391888.0, + "step": 14694 + }, + { + "epoch": 1.8693550438875461, + "ewc_loss": 0.008001760579645634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.001760579645634e-05, + "grad_norm": 4.028066635131836, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8784741163253784, + "num_tokens": 560425350.0, + "step": 14695 + }, + { + "epoch": 1.8694822541661367, + "ewc_loss": 0.00803101621568203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.031015750020742e-05, + "grad_norm": 3.9845199584960938, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8872913122177124, + "num_tokens": 560458521.0, + "step": 14696 + }, + { + "epoch": 1.8696094644447272, + "ewc_loss": 0.007999388501048088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.99938861746341e-05, + "grad_norm": 3.9624686241149902, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8704167008399963, + "num_tokens": 560506378.0, + "step": 14697 + }, + { + "epoch": 1.8697366747233177, + "ewc_loss": 0.008005903102457523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00590350991115e-05, + "grad_norm": 3.966935634613037, + "learning_rate": 1e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.8979736566543579, + "num_tokens": 560544055.0, + "step": 14698 + }, + { + "epoch": 1.8698638850019083, + "ewc_loss": 0.008002374321222305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00237394287251e-05, + "grad_norm": 4.025859355926514, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8856178522109985, + "num_tokens": 560577240.0, + "step": 14699 + }, + { + "epoch": 1.8699910952804988, + "ewc_loss": 0.008039278909564018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03927905508317e-05, + "grad_norm": 3.988597869873047, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.872665286064148, + "num_tokens": 560615662.0, + "step": 14700 + }, + { + "epoch": 1.8701183055590893, + "ewc_loss": 0.008007198572158813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007198630366474e-05, + "grad_norm": 3.957702398300171, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8714851140975952, + "num_tokens": 560658386.0, + "step": 14701 + }, + { + "epoch": 1.8702455158376798, + "ewc_loss": 0.008000039495527744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.000039815669879e-05, + "grad_norm": 3.9951703548431396, + "learning_rate": 1e-06, + "loss": 0.3675, + 
"mean_token_accuracy": 0.875164806842804, + "num_tokens": 560696366.0, + "step": 14702 + }, + { + "epoch": 1.8703727261162701, + "ewc_loss": 0.008038725703954697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.038726082304493e-05, + "grad_norm": 4.022064208984375, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8664513826370239, + "num_tokens": 560736419.0, + "step": 14703 + }, + { + "epoch": 1.8704999363948607, + "ewc_loss": 0.008051179349422455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.051179611356929e-05, + "grad_norm": 3.966984272003174, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8752206563949585, + "num_tokens": 560777839.0, + "step": 14704 + }, + { + "epoch": 1.8706271466734512, + "ewc_loss": 0.007993998937308788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.993999315658584e-05, + "grad_norm": 3.9889702796936035, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8865879774093628, + "num_tokens": 560816167.0, + "step": 14705 + }, + { + "epoch": 1.8707543569520417, + "ewc_loss": 0.008032023906707764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.032023470150307e-05, + "grad_norm": 4.016457557678223, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8639139533042908, + "num_tokens": 560854524.0, + "step": 14706 + }, + { + "epoch": 1.8708815672306323, + "ewc_loss": 0.008041676133871078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.041675755521283e-05, + "grad_norm": 3.972888231277466, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8748908042907715, + "num_tokens": 560894393.0, + "step": 14707 + }, + { + "epoch": 1.8710087775092228, + "ewc_loss": 0.008004231378436089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00423149485141e-05, + "grad_norm": 3.914736032485962, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8877376914024353, + "num_tokens": 560938640.0, + "step": 14708 + }, + { + "epoch": 
1.871135987787813, + "ewc_loss": 0.007977358065545559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.977357745403424e-05, + "grad_norm": 3.9999260902404785, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8754559755325317, + "num_tokens": 560976805.0, + "step": 14709 + }, + { + "epoch": 1.8712631980664036, + "ewc_loss": 0.008060535416007042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060535765253007e-05, + "grad_norm": 3.954397201538086, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8733389973640442, + "num_tokens": 561018940.0, + "step": 14710 + }, + { + "epoch": 1.8713904083449941, + "ewc_loss": 0.007995973341166973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.995973282959312e-05, + "grad_norm": 3.9500534534454346, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8818983435630798, + "num_tokens": 561065228.0, + "step": 14711 + }, + { + "epoch": 1.8715176186235847, + "ewc_loss": 0.008002074435353279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.002074173418805e-05, + "grad_norm": 3.9697635173797607, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8699319362640381, + "num_tokens": 561107540.0, + "step": 14712 + }, + { + "epoch": 1.8716448289021752, + "ewc_loss": 0.008028477430343628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.028477896004915e-05, + "grad_norm": 4.097822666168213, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8593994975090027, + "num_tokens": 561144774.0, + "step": 14713 + }, + { + "epoch": 1.8717720391807657, + "ewc_loss": 0.008070563897490501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070564217632636e-05, + "grad_norm": 3.9687001705169678, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8564236164093018, + "num_tokens": 561186774.0, + "step": 14714 + }, + { + "epoch": 1.8718992494593563, + "ewc_loss": 0.007973060011863708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.973059837240726e-05, + "grad_norm": 3.974079132080078, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8877816796302795, + "num_tokens": 561223997.0, + "step": 14715 + }, + { + "epoch": 1.8720264597379468, + "ewc_loss": 0.008019746281206608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.019746019272134e-05, + "grad_norm": 3.9820399284362793, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.883402943611145, + "num_tokens": 561261583.0, + "step": 14716 + }, + { + "epoch": 1.8721536700165373, + "ewc_loss": 0.008014172315597534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.014171908143908e-05, + "grad_norm": 3.950805425643921, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8815656900405884, + "num_tokens": 561302681.0, + "step": 14717 + }, + { + "epoch": 1.8722808802951278, + "ewc_loss": 0.007997035048902035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997034845175222e-05, + "grad_norm": 4.060896873474121, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8750375509262085, + "num_tokens": 561338402.0, + "step": 14718 + }, + { + "epoch": 1.8724080905737184, + "ewc_loss": 0.00807960331439972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.079603139776736e-05, + "grad_norm": 4.025487899780273, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8795711994171143, + "num_tokens": 561372892.0, + "step": 14719 + }, + { + "epoch": 1.872535300852309, + "ewc_loss": 0.008006309159100056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.006309508346021e-05, + "grad_norm": 3.974774122238159, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8726062774658203, + "num_tokens": 561409474.0, + "step": 14720 + }, + { + "epoch": 1.8726625111308994, + "ewc_loss": 0.008012661710381508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.012662146938965e-05, + "grad_norm": 4.009364128112793, + "learning_rate": 1e-06, + "loss": 0.4373, + 
"mean_token_accuracy": 0.8510842323303223, + "num_tokens": 561450152.0, + "step": 14721 + }, + { + "epoch": 1.87278972140949, + "ewc_loss": 0.008060634136199951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060633990680799e-05, + "grad_norm": 3.958620548248291, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8708927631378174, + "num_tokens": 561489448.0, + "step": 14722 + }, + { + "epoch": 1.8729169316880805, + "ewc_loss": 0.00802499521523714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.024994895095006e-05, + "grad_norm": 4.008208274841309, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8550151586532593, + "num_tokens": 561529684.0, + "step": 14723 + }, + { + "epoch": 1.873044141966671, + "ewc_loss": 0.008089287206530571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.089287439361215e-05, + "grad_norm": 3.996119976043701, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8781547546386719, + "num_tokens": 561564259.0, + "step": 14724 + }, + { + "epoch": 1.8731713522452615, + "ewc_loss": 0.008061546832323074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061547123361379e-05, + "grad_norm": 3.9207754135131836, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8842501640319824, + "num_tokens": 561609650.0, + "step": 14725 + }, + { + "epoch": 1.873298562523852, + "ewc_loss": 0.00802527368068695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025273564271629e-05, + "grad_norm": 3.9978525638580322, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8819485306739807, + "num_tokens": 561645520.0, + "step": 14726 + }, + { + "epoch": 1.8734257728024426, + "ewc_loss": 0.008087088353931904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.087087917374447e-05, + "grad_norm": 3.9906563758850098, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.881180465221405, + "num_tokens": 561682771.0, + "step": 14727 + }, + { + "epoch": 
1.873552983081033, + "ewc_loss": 0.00806655827909708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066558075370267e-05, + "grad_norm": 4.001596927642822, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8672369718551636, + "num_tokens": 561721317.0, + "step": 14728 + }, + { + "epoch": 1.8736801933596234, + "ewc_loss": 0.008070728741586208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070728654274717e-05, + "grad_norm": 3.9880008697509766, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8786005973815918, + "num_tokens": 561763178.0, + "step": 14729 + }, + { + "epoch": 1.873807403638214, + "ewc_loss": 0.008049340918660164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.049340976867825e-05, + "grad_norm": 3.998455047607422, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.877727210521698, + "num_tokens": 561802292.0, + "step": 14730 + }, + { + "epoch": 1.8739346139168045, + "ewc_loss": 0.008046042174100876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.046042057685554e-05, + "grad_norm": 3.9544453620910645, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8837717175483704, + "num_tokens": 561838610.0, + "step": 14731 + }, + { + "epoch": 1.874061824195395, + "ewc_loss": 0.008033809252083302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.033808990148827e-05, + "grad_norm": 3.9697225093841553, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8810805082321167, + "num_tokens": 561881992.0, + "step": 14732 + }, + { + "epoch": 1.8741890344739855, + "ewc_loss": 0.008057309314608574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.057309605646878e-05, + "grad_norm": 4.042332172393799, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8676725029945374, + "num_tokens": 561917661.0, + "step": 14733 + }, + { + "epoch": 1.8743162447525759, + "ewc_loss": 0.008076434023678303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.076433732639998e-05, + "grad_norm": 3.95391845703125, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8775122165679932, + "num_tokens": 561958218.0, + "step": 14734 + }, + { + "epoch": 1.8744434550311664, + "ewc_loss": 0.008013849146664143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.013848855625838e-05, + "grad_norm": 3.9507675170898438, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8885658979415894, + "num_tokens": 561993882.0, + "step": 14735 + }, + { + "epoch": 1.874570665309757, + "ewc_loss": 0.008040674962103367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.040675311349332e-05, + "grad_norm": 4.0056047439575195, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8782142400741577, + "num_tokens": 562035443.0, + "step": 14736 + }, + { + "epoch": 1.8746978755883474, + "ewc_loss": 0.008067446760833263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067446469794959e-05, + "grad_norm": 3.9996275901794434, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8739058971405029, + "num_tokens": 562072806.0, + "step": 14737 + }, + { + "epoch": 1.874825085866938, + "ewc_loss": 0.008039849810302258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039849490160123e-05, + "grad_norm": 3.999288558959961, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8598502278327942, + "num_tokens": 562113652.0, + "step": 14738 + }, + { + "epoch": 1.8749522961455285, + "ewc_loss": 0.008047603070735931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.047602750593796e-05, + "grad_norm": 4.0225725173950195, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8690081238746643, + "num_tokens": 562150600.0, + "step": 14739 + }, + { + "epoch": 1.875079506424119, + "ewc_loss": 0.00804448127746582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.044481364777312e-05, + "grad_norm": 3.9789443016052246, + "learning_rate": 1e-06, + "loss": 0.3461, + 
"mean_token_accuracy": 0.8800978064537048, + "num_tokens": 562187730.0, + "step": 14740 + }, + { + "epoch": 1.8752067167027096, + "ewc_loss": 0.008025932125747204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025932038435712e-05, + "grad_norm": 3.9897167682647705, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8766806721687317, + "num_tokens": 562228229.0, + "step": 14741 + }, + { + "epoch": 1.8753339269813, + "ewc_loss": 0.008031723089516163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031722973100841e-05, + "grad_norm": 4.0274882316589355, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8820744752883911, + "num_tokens": 562262293.0, + "step": 14742 + }, + { + "epoch": 1.8754611372598906, + "ewc_loss": 0.008071922697126865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.071922638919204e-05, + "grad_norm": 3.9988043308258057, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8744848370552063, + "num_tokens": 562303389.0, + "step": 14743 + }, + { + "epoch": 1.8755883475384811, + "ewc_loss": 0.008032228797674179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.032228652155027e-05, + "grad_norm": 3.9515695571899414, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8711291551589966, + "num_tokens": 562344461.0, + "step": 14744 + }, + { + "epoch": 1.8757155578170717, + "ewc_loss": 0.008017241954803467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.017241634661332e-05, + "grad_norm": 3.9953932762145996, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8581511974334717, + "num_tokens": 562387068.0, + "step": 14745 + }, + { + "epoch": 1.8758427680956622, + "ewc_loss": 0.008065454661846161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.065454312600195e-05, + "grad_norm": 3.978026866912842, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8745318651199341, + "num_tokens": 562428225.0, + "step": 14746 + }, + { + "epoch": 
1.8759699783742527, + "ewc_loss": 0.008017514832317829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.017514483071864e-05, + "grad_norm": 4.033690929412842, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8766475915908813, + "num_tokens": 562463879.0, + "step": 14747 + }, + { + "epoch": 1.8760971886528433, + "ewc_loss": 0.008056554943323135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.056555088842288e-05, + "grad_norm": 4.620720863342285, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8710072040557861, + "num_tokens": 562502327.0, + "step": 14748 + }, + { + "epoch": 1.8762243989314338, + "ewc_loss": 0.008379624225199223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.379624341614544e-05, + "grad_norm": 3.9566569328308105, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8811900615692139, + "num_tokens": 562542162.0, + "step": 14749 + }, + { + "epoch": 1.8763516092100243, + "ewc_loss": 0.007812771946191788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.812771946191788e-05, + "grad_norm": 3.9163193702697754, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8715760707855225, + "num_tokens": 562583364.0, + "step": 14750 + }, + { + "epoch": 1.8764788194886148, + "ewc_loss": 0.008051291108131409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.051290933508426e-05, + "grad_norm": 4.04177188873291, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8776618838310242, + "num_tokens": 562615870.0, + "step": 14751 + }, + { + "epoch": 1.8766060297672051, + "ewc_loss": 0.008099035359919071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.099035767372698e-05, + "grad_norm": 3.9665374755859375, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8631026148796082, + "num_tokens": 562655086.0, + "step": 14752 + }, + { + "epoch": 1.8767332400457957, + "ewc_loss": 0.008020631968975067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.020632230909541e-05, + "grad_norm": 3.985854387283325, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8696305155754089, + "num_tokens": 562697532.0, + "step": 14753 + }, + { + "epoch": 1.8768604503243862, + "ewc_loss": 0.008080728352069855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08072873041965e-05, + "grad_norm": 4.060598850250244, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8565450310707092, + "num_tokens": 562731057.0, + "step": 14754 + }, + { + "epoch": 1.8769876606029767, + "ewc_loss": 0.008120992220938206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.120992424665019e-05, + "grad_norm": 3.9753544330596924, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8659684658050537, + "num_tokens": 562770402.0, + "step": 14755 + }, + { + "epoch": 1.8771148708815673, + "ewc_loss": 0.008071520365774632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.07152027846314e-05, + "grad_norm": 4.032528877258301, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8725064992904663, + "num_tokens": 562806191.0, + "step": 14756 + }, + { + "epoch": 1.8772420811601578, + "ewc_loss": 0.008131243288516998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.131243521347642e-05, + "grad_norm": 3.980980157852173, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8744512796401978, + "num_tokens": 562846864.0, + "step": 14757 + }, + { + "epoch": 1.877369291438748, + "ewc_loss": 0.00807131640613079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.071316551649943e-05, + "grad_norm": 3.9904592037200928, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8708867430686951, + "num_tokens": 562882887.0, + "step": 14758 + }, + { + "epoch": 1.8774965017173386, + "ewc_loss": 0.008117731660604477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.117732068058103e-05, + "grad_norm": 4.032716751098633, + "learning_rate": 1e-06, + "loss": 0.3699, + 
"mean_token_accuracy": 0.8747388124465942, + "num_tokens": 562919096.0, + "step": 14759 + }, + { + "epoch": 1.8776237119959291, + "ewc_loss": 0.008145502768456936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.145502943079919e-05, + "grad_norm": 4.016748428344727, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8492107391357422, + "num_tokens": 562959336.0, + "step": 14760 + }, + { + "epoch": 1.8777509222745197, + "ewc_loss": 0.00808655098080635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08655095170252e-05, + "grad_norm": 3.986182451248169, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8743700981140137, + "num_tokens": 562997883.0, + "step": 14761 + }, + { + "epoch": 1.8778781325531102, + "ewc_loss": 0.008087088353931904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.087088644970208e-05, + "grad_norm": 4.0091657638549805, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8846380710601807, + "num_tokens": 563036763.0, + "step": 14762 + }, + { + "epoch": 1.8780053428317007, + "ewc_loss": 0.008121879771351814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.121879363898188e-05, + "grad_norm": 3.960753917694092, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8913190364837646, + "num_tokens": 563076815.0, + "step": 14763 + }, + { + "epoch": 1.8781325531102913, + "ewc_loss": 0.008073522709310055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.073522621998563e-05, + "grad_norm": 4.012530326843262, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8857095241546631, + "num_tokens": 563110511.0, + "step": 14764 + }, + { + "epoch": 1.8782597633888818, + "ewc_loss": 0.008132967166602612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.132967195706442e-05, + "grad_norm": 4.078354835510254, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8704867362976074, + "num_tokens": 563140757.0, + "step": 14765 + }, + { + "epoch": 
1.8783869736674723, + "ewc_loss": 0.008128858171403408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.128857734845951e-05, + "grad_norm": 3.954273223876953, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.875174880027771, + "num_tokens": 563179961.0, + "step": 14766 + }, + { + "epoch": 1.8785141839460628, + "ewc_loss": 0.008070367388427258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070367039181292e-05, + "grad_norm": 3.993969202041626, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.883516788482666, + "num_tokens": 563221355.0, + "step": 14767 + }, + { + "epoch": 1.8786413942246534, + "ewc_loss": 0.008136386051774025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.136386168189347e-05, + "grad_norm": 3.964169502258301, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8961062431335449, + "num_tokens": 563257427.0, + "step": 14768 + }, + { + "epoch": 1.878768604503244, + "ewc_loss": 0.008080415427684784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080415864242241e-05, + "grad_norm": 4.009528160095215, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8738555312156677, + "num_tokens": 563294371.0, + "step": 14769 + }, + { + "epoch": 1.8788958147818344, + "ewc_loss": 0.008125045336782932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.125045133056119e-05, + "grad_norm": 3.940089702606201, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8854370713233948, + "num_tokens": 563338592.0, + "step": 14770 + }, + { + "epoch": 1.879023025060425, + "ewc_loss": 0.008072227239608765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.072226773947477e-05, + "grad_norm": 4.071815490722656, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8776792287826538, + "num_tokens": 563373853.0, + "step": 14771 + }, + { + "epoch": 1.8791502353390155, + "ewc_loss": 0.008138008415699005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.138007979141548e-05, + "grad_norm": 4.020386219024658, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.873948335647583, + "num_tokens": 563411137.0, + "step": 14772 + }, + { + "epoch": 1.879277445617606, + "ewc_loss": 0.008064801804721355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.064801659202203e-05, + "grad_norm": 4.016950607299805, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8796629905700684, + "num_tokens": 563450294.0, + "step": 14773 + }, + { + "epoch": 1.8794046558961965, + "ewc_loss": 0.008067470043897629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067470480455086e-05, + "grad_norm": 3.966212511062622, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8766999244689941, + "num_tokens": 563496858.0, + "step": 14774 + }, + { + "epoch": 1.879531866174787, + "ewc_loss": 0.00803022738546133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.030227763811126e-05, + "grad_norm": 4.0225749015808105, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8720882534980774, + "num_tokens": 563539566.0, + "step": 14775 + }, + { + "epoch": 1.8796590764533776, + "ewc_loss": 0.008077790029346943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.077789971139282e-05, + "grad_norm": 4.030210971832275, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8839671611785889, + "num_tokens": 563574363.0, + "step": 14776 + }, + { + "epoch": 1.879786286731968, + "ewc_loss": 0.00805091392248869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050914038904011e-05, + "grad_norm": 4.055789947509766, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8772170543670654, + "num_tokens": 563605540.0, + "step": 14777 + }, + { + "epoch": 1.8799134970105584, + "ewc_loss": 0.008061513304710388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061513653956354e-05, + "grad_norm": 4.076683521270752, + "learning_rate": 1e-06, + "loss": 0.4072, + 
"mean_token_accuracy": 0.8628851175308228, + "num_tokens": 563646640.0, + "step": 14778 + }, + { + "epoch": 1.880040707289149, + "ewc_loss": 0.008080176077783108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080175757640973e-05, + "grad_norm": 3.983928918838501, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8872352838516235, + "num_tokens": 563683576.0, + "step": 14779 + }, + { + "epoch": 1.8801679175677395, + "ewc_loss": 0.00803445652127266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03445655037649e-05, + "grad_norm": 4.0229291915893555, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8679805397987366, + "num_tokens": 563724088.0, + "step": 14780 + }, + { + "epoch": 1.88029512784633, + "ewc_loss": 0.00808943435549736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.089434413705021e-05, + "grad_norm": 4.0163726806640625, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8745659589767456, + "num_tokens": 563760897.0, + "step": 14781 + }, + { + "epoch": 1.8804223381249205, + "ewc_loss": 0.008055411279201508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055411308305338e-05, + "grad_norm": 3.9356322288513184, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8801339268684387, + "num_tokens": 563800158.0, + "step": 14782 + }, + { + "epoch": 1.8805495484035109, + "ewc_loss": 0.008007007651031017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007008000276983e-05, + "grad_norm": 4.016303539276123, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8818252086639404, + "num_tokens": 563833288.0, + "step": 14783 + }, + { + "epoch": 1.8806767586821014, + "ewc_loss": 0.00808666367083788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08666372904554e-05, + "grad_norm": 4.0490827560424805, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8862008452415466, + "num_tokens": 563874507.0, + "step": 14784 + }, + { + "epoch": 
1.880803968960692, + "ewc_loss": 0.008064422756433487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.064422581810504e-05, + "grad_norm": 3.9758687019348145, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8759564161300659, + "num_tokens": 563912452.0, + "step": 14785 + }, + { + "epoch": 1.8809311792392824, + "ewc_loss": 0.008018947206437588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018947119126096e-05, + "grad_norm": 4.012364864349365, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8607038259506226, + "num_tokens": 563950565.0, + "step": 14786 + }, + { + "epoch": 1.881058389517873, + "ewc_loss": 0.008081144653260708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.081144915195182e-05, + "grad_norm": 4.0366034507751465, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.865424633026123, + "num_tokens": 563984924.0, + "step": 14787 + }, + { + "epoch": 1.8811855997964635, + "ewc_loss": 0.008085362613201141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.085362787824124e-05, + "grad_norm": 3.9561851024627686, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8827606439590454, + "num_tokens": 564022887.0, + "step": 14788 + }, + { + "epoch": 1.881312810075054, + "ewc_loss": 0.008035475388169289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035475184442475e-05, + "grad_norm": 3.958181858062744, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8754255175590515, + "num_tokens": 564064829.0, + "step": 14789 + }, + { + "epoch": 1.8814400203536445, + "ewc_loss": 0.00805866252630949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.058662933763117e-05, + "grad_norm": 4.039797782897949, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8543727993965149, + "num_tokens": 564100858.0, + "step": 14790 + }, + { + "epoch": 1.881567230632235, + "ewc_loss": 0.008123405277729034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.123405132209882e-05, + "grad_norm": 3.996152400970459, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8701430559158325, + "num_tokens": 564141477.0, + "step": 14791 + }, + { + "epoch": 1.8816944409108256, + "ewc_loss": 0.008055683225393295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055683429120108e-05, + "grad_norm": 3.9910669326782227, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8697282671928406, + "num_tokens": 564179271.0, + "step": 14792 + }, + { + "epoch": 1.8818216511894161, + "ewc_loss": 0.008083627559244633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.083627471933141e-05, + "grad_norm": 3.932896852493286, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8813456296920776, + "num_tokens": 564224336.0, + "step": 14793 + }, + { + "epoch": 1.8819488614680067, + "ewc_loss": 0.00803497713059187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.034976781345904e-05, + "grad_norm": 3.966050148010254, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8722457885742188, + "num_tokens": 564265157.0, + "step": 14794 + }, + { + "epoch": 1.8820760717465972, + "ewc_loss": 0.008059668354690075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059668471105397e-05, + "grad_norm": 4.047635078430176, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8825414180755615, + "num_tokens": 564295023.0, + "step": 14795 + }, + { + "epoch": 1.8822032820251877, + "ewc_loss": 0.008116809651255608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.116809476632625e-05, + "grad_norm": 3.953331470489502, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8775482177734375, + "num_tokens": 564333700.0, + "step": 14796 + }, + { + "epoch": 1.8823304923037782, + "ewc_loss": 0.008020248264074326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.020248060347512e-05, + "grad_norm": 4.047048091888428, + "learning_rate": 1e-06, + "loss": 0.4103, + 
"mean_token_accuracy": 0.8615552186965942, + "num_tokens": 564374342.0, + "step": 14797 + }, + { + "epoch": 1.8824577025823688, + "ewc_loss": 0.008101304993033409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.101305138552561e-05, + "grad_norm": 3.9068570137023926, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8783280253410339, + "num_tokens": 564422877.0, + "step": 14798 + }, + { + "epoch": 1.8825849128609593, + "ewc_loss": 0.007992229424417019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992229802766815e-05, + "grad_norm": 3.9887962341308594, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8751834630966187, + "num_tokens": 564464713.0, + "step": 14799 + }, + { + "epoch": 1.8827121231395498, + "ewc_loss": 0.008090287446975708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.090287883533165e-05, + "grad_norm": 4.044164657592773, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8659050464630127, + "num_tokens": 564500667.0, + "step": 14800 + }, + { + "epoch": 1.8828393334181401, + "ewc_loss": 0.008073479868471622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.07347969384864e-05, + "grad_norm": 3.931887149810791, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8849528431892395, + "num_tokens": 564541460.0, + "step": 14801 + }, + { + "epoch": 1.8829665436967307, + "ewc_loss": 0.008005714043974876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00571360741742e-05, + "grad_norm": 4.029818058013916, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8683900237083435, + "num_tokens": 564580983.0, + "step": 14802 + }, + { + "epoch": 1.8830937539753212, + "ewc_loss": 0.008093885146081448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093885116977617e-05, + "grad_norm": 3.9887280464172363, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8837865591049194, + "num_tokens": 564615967.0, + "step": 14803 + }, + { + "epoch": 
1.8832209642539117, + "ewc_loss": 0.008023268543183804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02326831035316e-05, + "grad_norm": 4.024001121520996, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8693970441818237, + "num_tokens": 564653255.0, + "step": 14804 + }, + { + "epoch": 1.8833481745325023, + "ewc_loss": 0.008065185509622097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.06518510216847e-05, + "grad_norm": 4.024641990661621, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8753527998924255, + "num_tokens": 564688420.0, + "step": 14805 + }, + { + "epoch": 1.8834753848110928, + "ewc_loss": 0.008057324215769768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.057324157562107e-05, + "grad_norm": 3.935703754425049, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8709033727645874, + "num_tokens": 564733123.0, + "step": 14806 + }, + { + "epoch": 1.883602595089683, + "ewc_loss": 0.008011441677808762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.011441968847066e-05, + "grad_norm": 3.976217031478882, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8818666934967041, + "num_tokens": 564767712.0, + "step": 14807 + }, + { + "epoch": 1.8837298053682736, + "ewc_loss": 0.008067463524639606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067463204497471e-05, + "grad_norm": 3.970036268234253, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.873460590839386, + "num_tokens": 564810663.0, + "step": 14808 + }, + { + "epoch": 1.8838570156468641, + "ewc_loss": 0.008040670305490494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.040670218179002e-05, + "grad_norm": 4.012508392333984, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8756105899810791, + "num_tokens": 564848689.0, + "step": 14809 + }, + { + "epoch": 1.8839842259254547, + "ewc_loss": 0.00806775875389576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.067758608376607e-05, + "grad_norm": 3.9935808181762695, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8647513389587402, + "num_tokens": 564887426.0, + "step": 14810 + }, + { + "epoch": 1.8841114362040452, + "ewc_loss": 0.008048499934375286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.048499876167625e-05, + "grad_norm": 3.9880874156951904, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8798319101333618, + "num_tokens": 564924852.0, + "step": 14811 + }, + { + "epoch": 1.8842386464826357, + "ewc_loss": 0.008067766204476357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067766611929983e-05, + "grad_norm": 3.9609384536743164, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8722962737083435, + "num_tokens": 564964201.0, + "step": 14812 + }, + { + "epoch": 1.8843658567612263, + "ewc_loss": 0.00802849605679512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.028496085898951e-05, + "grad_norm": 3.9714479446411133, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8869308233261108, + "num_tokens": 565004939.0, + "step": 14813 + }, + { + "epoch": 1.8844930670398168, + "ewc_loss": 0.008070013485848904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070013427641243e-05, + "grad_norm": 3.9865710735321045, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8799324035644531, + "num_tokens": 565044067.0, + "step": 14814 + }, + { + "epoch": 1.8846202773184073, + "ewc_loss": 0.008049769327044487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.0497695307713e-05, + "grad_norm": 4.000823974609375, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8763903379440308, + "num_tokens": 565082114.0, + "step": 14815 + }, + { + "epoch": 1.8847474875969978, + "ewc_loss": 0.008044584654271603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.044584683375433e-05, + "grad_norm": 4.011937141418457, + "learning_rate": 1e-06, + "loss": 0.3584, + 
"mean_token_accuracy": 0.8764792084693909, + "num_tokens": 565118350.0, + "step": 14816 + }, + { + "epoch": 1.8848746978755884, + "ewc_loss": 0.008053051307797432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.053050987655297e-05, + "grad_norm": 4.043050289154053, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8673169016838074, + "num_tokens": 565153313.0, + "step": 14817 + }, + { + "epoch": 1.885001908154179, + "ewc_loss": 0.008049643598496914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.049643656704575e-05, + "grad_norm": 4.002514362335205, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8626658916473389, + "num_tokens": 565191332.0, + "step": 14818 + }, + { + "epoch": 1.8851291184327694, + "ewc_loss": 0.008023965172469616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023965347092599e-05, + "grad_norm": 3.9635114669799805, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8730002045631409, + "num_tokens": 565229937.0, + "step": 14819 + }, + { + "epoch": 1.88525632871136, + "ewc_loss": 0.008041788823902607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.041788532864302e-05, + "grad_norm": 4.010162830352783, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8695083260536194, + "num_tokens": 565266271.0, + "step": 14820 + }, + { + "epoch": 1.8853835389899505, + "ewc_loss": 0.008067600429058075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.06760072009638e-05, + "grad_norm": 4.034253120422363, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.872806191444397, + "num_tokens": 565300593.0, + "step": 14821 + }, + { + "epoch": 1.885510749268541, + "ewc_loss": 0.008067195303738117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.06719544925727e-05, + "grad_norm": 3.9435019493103027, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.875668466091156, + "num_tokens": 565341801.0, + "step": 14822 + }, + { + "epoch": 
1.8856379595471315, + "ewc_loss": 0.008011800236999989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.011800673557445e-05, + "grad_norm": 4.005122184753418, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8936635851860046, + "num_tokens": 565374380.0, + "step": 14823 + }, + { + "epoch": 1.885765169825722, + "ewc_loss": 0.008099361322820187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.099361002678052e-05, + "grad_norm": 3.9823737144470215, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8679478168487549, + "num_tokens": 565413096.0, + "step": 14824 + }, + { + "epoch": 1.8858923801043126, + "ewc_loss": 0.00806600321084261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066002919804305e-05, + "grad_norm": 3.9645307064056396, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8854706883430481, + "num_tokens": 565453292.0, + "step": 14825 + }, + { + "epoch": 1.886019590382903, + "ewc_loss": 0.008070078678429127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.07007891125977e-05, + "grad_norm": 4.019972801208496, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8718732595443726, + "num_tokens": 565489311.0, + "step": 14826 + }, + { + "epoch": 1.8861468006614934, + "ewc_loss": 0.008115454576909542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.115454693324864e-05, + "grad_norm": 4.00591516494751, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8594394326210022, + "num_tokens": 565526293.0, + "step": 14827 + }, + { + "epoch": 1.886274010940084, + "ewc_loss": 0.008085343986749649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.085343870334327e-05, + "grad_norm": 3.9520576000213623, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8691123723983765, + "num_tokens": 565569215.0, + "step": 14828 + }, + { + "epoch": 1.8864012212186745, + "ewc_loss": 0.008076944388449192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.076944504864514e-05, + "grad_norm": 3.96502423286438, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8852101564407349, + "num_tokens": 565610755.0, + "step": 14829 + }, + { + "epoch": 1.886528431497265, + "ewc_loss": 0.008089013397693634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.089013863354921e-05, + "grad_norm": 4.053582191467285, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8795071840286255, + "num_tokens": 565641964.0, + "step": 14830 + }, + { + "epoch": 1.8866556417758555, + "ewc_loss": 0.008142033591866493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.142033766489476e-05, + "grad_norm": 4.046426296234131, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8429797887802124, + "num_tokens": 565682852.0, + "step": 14831 + }, + { + "epoch": 1.8867828520544458, + "ewc_loss": 0.008090524934232235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.090525079751387e-05, + "grad_norm": 3.94756817817688, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8915377855300903, + "num_tokens": 565721352.0, + "step": 14832 + }, + { + "epoch": 1.8869100623330364, + "ewc_loss": 0.00806946586817503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069466275628656e-05, + "grad_norm": 3.9664018154144287, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8798855543136597, + "num_tokens": 565764045.0, + "step": 14833 + }, + { + "epoch": 1.887037272611627, + "ewc_loss": 0.008090483024716377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.090482879197225e-05, + "grad_norm": 4.004837512969971, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8828200101852417, + "num_tokens": 565797617.0, + "step": 14834 + }, + { + "epoch": 1.8871644828902174, + "ewc_loss": 0.008119040168821812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.119040285237134e-05, + "grad_norm": 3.9866390228271484, + "learning_rate": 1e-06, + "loss": 0.3851, + 
"mean_token_accuracy": 0.8750550746917725, + "num_tokens": 565839254.0, + "step": 14835 + }, + { + "epoch": 1.887291693168808, + "ewc_loss": 0.008062436245381832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.062436245381832e-05, + "grad_norm": 3.987471580505371, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8756924867630005, + "num_tokens": 565874742.0, + "step": 14836 + }, + { + "epoch": 1.8874189034473985, + "ewc_loss": 0.008092250674962997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.092250936897472e-05, + "grad_norm": 4.057947158813477, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8770946264266968, + "num_tokens": 565911043.0, + "step": 14837 + }, + { + "epoch": 1.887546113725989, + "ewc_loss": 0.008108004927635193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.108004840323702e-05, + "grad_norm": 3.9755871295928955, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8717597723007202, + "num_tokens": 565950228.0, + "step": 14838 + }, + { + "epoch": 1.8876733240045795, + "ewc_loss": 0.00804479606449604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.044796413742006e-05, + "grad_norm": 3.982839822769165, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8878157734870911, + "num_tokens": 565982891.0, + "step": 14839 + }, + { + "epoch": 1.88780053428317, + "ewc_loss": 0.00806654617190361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066546433838084e-05, + "grad_norm": 3.9423179626464844, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8737965822219849, + "num_tokens": 566022601.0, + "step": 14840 + }, + { + "epoch": 1.8879277445617606, + "ewc_loss": 0.00805621687322855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.056216756813228e-05, + "grad_norm": 4.046483039855957, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8763176202774048, + "num_tokens": 566054928.0, + "step": 14841 + }, + { + "epoch": 
1.8880549548403511, + "ewc_loss": 0.00810651108622551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.10651108622551e-05, + "grad_norm": 3.972399950027466, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8697987794876099, + "num_tokens": 566094900.0, + "step": 14842 + }, + { + "epoch": 1.8881821651189417, + "ewc_loss": 0.008041862398386002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.041862020036206e-05, + "grad_norm": 4.09287691116333, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8740537762641907, + "num_tokens": 566125230.0, + "step": 14843 + }, + { + "epoch": 1.8883093753975322, + "ewc_loss": 0.008128199726343155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.128199988277629e-05, + "grad_norm": 3.9502336978912354, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8910104632377625, + "num_tokens": 566165320.0, + "step": 14844 + }, + { + "epoch": 1.8884365856761227, + "ewc_loss": 0.008005740121006966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.005740528460592e-05, + "grad_norm": 4.059966087341309, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8669867515563965, + "num_tokens": 566202769.0, + "step": 14845 + }, + { + "epoch": 1.8885637959547132, + "ewc_loss": 0.008132505230605602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.132505172397941e-05, + "grad_norm": 3.9428112506866455, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8768171072006226, + "num_tokens": 566245874.0, + "step": 14846 + }, + { + "epoch": 1.8886910062333038, + "ewc_loss": 0.008005582727491856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.005582640180364e-05, + "grad_norm": 3.9957778453826904, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8583779335021973, + "num_tokens": 566284855.0, + "step": 14847 + }, + { + "epoch": 1.8888182165118943, + "ewc_loss": 0.00808286014944315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.082860586000606e-05, + "grad_norm": 4.027398586273193, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8668535351753235, + "num_tokens": 566323422.0, + "step": 14848 + }, + { + "epoch": 1.8889454267904848, + "ewc_loss": 0.008084709756076336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08470940683037e-05, + "grad_norm": 3.983027219772339, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8842663764953613, + "num_tokens": 566364357.0, + "step": 14849 + }, + { + "epoch": 1.8890726370690751, + "ewc_loss": 0.008050750009715557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050750329857692e-05, + "grad_norm": 4.024125099182129, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8726288080215454, + "num_tokens": 566404498.0, + "step": 14850 + }, + { + "epoch": 1.8891998473476657, + "ewc_loss": 0.00807679258286953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.076792437350377e-05, + "grad_norm": 3.9882729053497314, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8713368773460388, + "num_tokens": 566442960.0, + "step": 14851 + }, + { + "epoch": 1.8893270576262562, + "ewc_loss": 0.00804681796580553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.046817674767226e-05, + "grad_norm": 4.017602443695068, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8839622139930725, + "num_tokens": 566475087.0, + "step": 14852 + }, + { + "epoch": 1.8894542679048467, + "ewc_loss": 0.008079578168690205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.079578401520848e-05, + "grad_norm": 4.010751724243164, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8666896224021912, + "num_tokens": 566514337.0, + "step": 14853 + }, + { + "epoch": 1.8895814781834372, + "ewc_loss": 0.008059486746788025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059486572165042e-05, + "grad_norm": 4.0339674949646, + "learning_rate": 1e-06, + "loss": 0.3919, + 
"mean_token_accuracy": 0.8654197454452515, + "num_tokens": 566553204.0, + "step": 14854 + }, + { + "epoch": 1.8897086884620278, + "ewc_loss": 0.008081270381808281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.081270789261907e-05, + "grad_norm": 4.03262996673584, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8740965723991394, + "num_tokens": 566589697.0, + "step": 14855 + }, + { + "epoch": 1.889835898740618, + "ewc_loss": 0.00808054581284523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080545376287773e-05, + "grad_norm": 4.0291619300842285, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8612985610961914, + "num_tokens": 566629680.0, + "step": 14856 + }, + { + "epoch": 1.8899631090192086, + "ewc_loss": 0.008081060834228992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.081060514086857e-05, + "grad_norm": 3.9537479877471924, + "learning_rate": 1e-06, + "loss": 0.2883, + "mean_token_accuracy": 0.8987360000610352, + "num_tokens": 566670931.0, + "step": 14857 + }, + { + "epoch": 1.8900903192977991, + "ewc_loss": 0.0080372653901577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.037265797611326e-05, + "grad_norm": 4.022486209869385, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8707790374755859, + "num_tokens": 566706673.0, + "step": 14858 + }, + { + "epoch": 1.8902175295763897, + "ewc_loss": 0.008103566244244576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.10356650617905e-05, + "grad_norm": 3.970381259918213, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8705753087997437, + "num_tokens": 566749503.0, + "step": 14859 + }, + { + "epoch": 1.8903447398549802, + "ewc_loss": 0.008054613135755062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.054613135755062e-05, + "grad_norm": 3.9821360111236572, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8896327018737793, + "num_tokens": 566790700.0, + "step": 14860 + }, + { + "epoch": 
1.8904719501335707, + "ewc_loss": 0.00807663332670927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.076633093878627e-05, + "grad_norm": 3.9753317832946777, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8738043308258057, + "num_tokens": 566832995.0, + "step": 14861 + }, + { + "epoch": 1.8905991604121613, + "ewc_loss": 0.008047842420637608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.047842129599303e-05, + "grad_norm": 4.065321922302246, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8648369312286377, + "num_tokens": 566868190.0, + "step": 14862 + }, + { + "epoch": 1.8907263706907518, + "ewc_loss": 0.008100918494164944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.100918785203248e-05, + "grad_norm": 4.0374250411987305, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8486357927322388, + "num_tokens": 566906637.0, + "step": 14863 + }, + { + "epoch": 1.8908535809693423, + "ewc_loss": 0.008055496960878372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055497164605185e-05, + "grad_norm": 3.982675790786743, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8763732314109802, + "num_tokens": 566946006.0, + "step": 14864 + }, + { + "epoch": 1.8909807912479328, + "ewc_loss": 0.008050145581364632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050145697779953e-05, + "grad_norm": 4.021720886230469, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8639032244682312, + "num_tokens": 566987726.0, + "step": 14865 + }, + { + "epoch": 1.8911080015265234, + "ewc_loss": 0.008083779364824295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.083779539447278e-05, + "grad_norm": 3.9485909938812256, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8899299502372742, + "num_tokens": 567027432.0, + "step": 14866 + }, + { + "epoch": 1.891235211805114, + "ewc_loss": 0.008035535924136639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.035536302486435e-05, + "grad_norm": 4.02901029586792, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8818946480751038, + "num_tokens": 567060607.0, + "step": 14867 + }, + { + "epoch": 1.8913624220837044, + "ewc_loss": 0.008079892955720425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.079892722889781e-05, + "grad_norm": 3.984780788421631, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8701462745666504, + "num_tokens": 567100691.0, + "step": 14868 + }, + { + "epoch": 1.891489632362295, + "ewc_loss": 0.008050240576267242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050241012824699e-05, + "grad_norm": 3.934954881668091, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8934546709060669, + "num_tokens": 567139460.0, + "step": 14869 + }, + { + "epoch": 1.8916168426408855, + "ewc_loss": 0.008027187548577785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02718786871992e-05, + "grad_norm": 4.0071516036987305, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8632113337516785, + "num_tokens": 567185384.0, + "step": 14870 + }, + { + "epoch": 1.891744052919476, + "ewc_loss": 0.008088389411568642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.088389586191624e-05, + "grad_norm": 4.039084434509277, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8747593760490417, + "num_tokens": 567219898.0, + "step": 14871 + }, + { + "epoch": 1.8918712631980665, + "ewc_loss": 0.008055209182202816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055209036683664e-05, + "grad_norm": 3.9603092670440674, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8839762210845947, + "num_tokens": 567260322.0, + "step": 14872 + }, + { + "epoch": 1.891998473476657, + "ewc_loss": 0.007994938641786575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.994938641786575e-05, + "grad_norm": 4.05262565612793, + "learning_rate": 1e-06, + "loss": 0.375, + 
"mean_token_accuracy": 0.8708078861236572, + "num_tokens": 567293739.0, + "step": 14873 + }, + { + "epoch": 1.8921256837552476, + "ewc_loss": 0.008101092651486397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.101092680590227e-05, + "grad_norm": 4.047353744506836, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8764007091522217, + "num_tokens": 567327282.0, + "step": 14874 + }, + { + "epoch": 1.892252894033838, + "ewc_loss": 0.00803549773991108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03549773991108e-05, + "grad_norm": 4.013700008392334, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8688750267028809, + "num_tokens": 567366402.0, + "step": 14875 + }, + { + "epoch": 1.8923801043124284, + "ewc_loss": 0.008040708489716053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.040708053158596e-05, + "grad_norm": 4.011956691741943, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.869050145149231, + "num_tokens": 567409924.0, + "step": 14876 + }, + { + "epoch": 1.892507314591019, + "ewc_loss": 0.00803486630320549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.034866186790168e-05, + "grad_norm": 3.9571373462677, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8786097764968872, + "num_tokens": 567450355.0, + "step": 14877 + }, + { + "epoch": 1.8926345248696095, + "ewc_loss": 0.008029268123209476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.029268065001816e-05, + "grad_norm": 4.023082256317139, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.889162003993988, + "num_tokens": 567487701.0, + "step": 14878 + }, + { + "epoch": 1.8927617351482, + "ewc_loss": 0.008055539801716805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055539365159348e-05, + "grad_norm": 3.9786479473114014, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8757487535476685, + "num_tokens": 567527619.0, + "step": 14879 + }, + { + "epoch": 
1.8928889454267905, + "ewc_loss": 0.008016545325517654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.016545325517654e-05, + "grad_norm": 4.01113748550415, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8743135333061218, + "num_tokens": 567567016.0, + "step": 14880 + }, + { + "epoch": 1.8930161557053808, + "ewc_loss": 0.008037999272346497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.037999214138836e-05, + "grad_norm": 4.019594192504883, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8618649244308472, + "num_tokens": 567605465.0, + "step": 14881 + }, + { + "epoch": 1.8931433659839714, + "ewc_loss": 0.00801842287182808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.018422522582114e-05, + "grad_norm": 3.9761338233947754, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8782219290733337, + "num_tokens": 567646680.0, + "step": 14882 + }, + { + "epoch": 1.893270576262562, + "ewc_loss": 0.007994083687663078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.994083716766909e-05, + "grad_norm": 3.9569664001464844, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8767861127853394, + "num_tokens": 567685685.0, + "step": 14883 + }, + { + "epoch": 1.8933977865411524, + "ewc_loss": 0.008011721074581146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01172063802369e-05, + "grad_norm": 4.093320369720459, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8649373054504395, + "num_tokens": 567717222.0, + "step": 14884 + }, + { + "epoch": 1.893524996819743, + "ewc_loss": 0.008096640929579735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.096640522126108e-05, + "grad_norm": 3.985924482345581, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8667596578598022, + "num_tokens": 567761050.0, + "step": 14885 + }, + { + "epoch": 1.8936522070983335, + "ewc_loss": 0.007974587380886078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.974587060743943e-05, + "grad_norm": 3.98022723197937, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.873063325881958, + "num_tokens": 567801423.0, + "step": 14886 + }, + { + "epoch": 1.893779417376924, + "ewc_loss": 0.008012481965124607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.012481703190133e-05, + "grad_norm": 3.9799699783325195, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8670904040336609, + "num_tokens": 567844614.0, + "step": 14887 + }, + { + "epoch": 1.8939066276555145, + "ewc_loss": 0.008037278428673744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.03727816673927e-05, + "grad_norm": 4.044219493865967, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8787735104560852, + "num_tokens": 567878803.0, + "step": 14888 + }, + { + "epoch": 1.894033837934105, + "ewc_loss": 0.008054771460592747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.054771751631051e-05, + "grad_norm": 4.033442497253418, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8856927752494812, + "num_tokens": 567909419.0, + "step": 14889 + }, + { + "epoch": 1.8941610482126956, + "ewc_loss": 0.008022686466574669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.022686233744025e-05, + "grad_norm": 3.999569892883301, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8767550587654114, + "num_tokens": 567949221.0, + "step": 14890 + }, + { + "epoch": 1.8942882584912861, + "ewc_loss": 0.00803303625434637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.033036283450201e-05, + "grad_norm": 4.0038628578186035, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8909590840339661, + "num_tokens": 567985101.0, + "step": 14891 + }, + { + "epoch": 1.8944154687698767, + "ewc_loss": 0.008033165708184242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.033165795495734e-05, + "grad_norm": 3.9680697917938232, + "learning_rate": 1e-06, + "loss": 0.4058, + 
"mean_token_accuracy": 0.8588602542877197, + "num_tokens": 568030542.0, + "step": 14892 + }, + { + "epoch": 1.8945426790484672, + "ewc_loss": 0.008016298525035381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.016298670554534e-05, + "grad_norm": 3.975720167160034, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.877673864364624, + "num_tokens": 568070723.0, + "step": 14893 + }, + { + "epoch": 1.8946698893270577, + "ewc_loss": 0.008029726333916187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.029726450331509e-05, + "grad_norm": 3.9836854934692383, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8698452711105347, + "num_tokens": 568113444.0, + "step": 14894 + }, + { + "epoch": 1.8947970996056482, + "ewc_loss": 0.008027863688766956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.027863805182278e-05, + "grad_norm": 3.9872639179229736, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8794491291046143, + "num_tokens": 568152646.0, + "step": 14895 + }, + { + "epoch": 1.8949243098842388, + "ewc_loss": 0.008025603368878365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025603165151551e-05, + "grad_norm": 4.067127704620361, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8751126527786255, + "num_tokens": 568187046.0, + "step": 14896 + }, + { + "epoch": 1.8950515201628293, + "ewc_loss": 0.00807275902479887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.072759374044836e-05, + "grad_norm": 4.036661624908447, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8720577359199524, + "num_tokens": 568225052.0, + "step": 14897 + }, + { + "epoch": 1.8951787304414198, + "ewc_loss": 0.008010201156139374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.010201418073848e-05, + "grad_norm": 4.032167434692383, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8671583533287048, + "num_tokens": 568258299.0, + "step": 14898 + }, + { + "epoch": 
1.8953059407200101, + "ewc_loss": 0.008061722852289677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061722473939881e-05, + "grad_norm": 3.9900200366973877, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8817062973976135, + "num_tokens": 568297137.0, + "step": 14899 + }, + { + "epoch": 1.8954331509986007, + "ewc_loss": 0.008026596158742905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026596333365887e-05, + "grad_norm": 3.9760468006134033, + "learning_rate": 1e-06, + "loss": 0.2936, + "mean_token_accuracy": 0.8964082598686218, + "num_tokens": 568331102.0, + "step": 14900 + }, + { + "epoch": 1.8955603612771912, + "ewc_loss": 0.008043967187404633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.043966954573989e-05, + "grad_norm": 4.016546249389648, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8725254535675049, + "num_tokens": 568373926.0, + "step": 14901 + }, + { + "epoch": 1.8956875715557817, + "ewc_loss": 0.008056270889937878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.056270598899573e-05, + "grad_norm": 4.0619797706604, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8857271671295166, + "num_tokens": 568406470.0, + "step": 14902 + }, + { + "epoch": 1.8958147818343722, + "ewc_loss": 0.008063149638473988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.063149289228022e-05, + "grad_norm": 3.9511303901672363, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8811964392662048, + "num_tokens": 568443452.0, + "step": 14903 + }, + { + "epoch": 1.8959419921129628, + "ewc_loss": 0.007991723716259003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.991724123712629e-05, + "grad_norm": 4.015552043914795, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8710225820541382, + "num_tokens": 568480085.0, + "step": 14904 + }, + { + "epoch": 1.896069202391553, + "ewc_loss": 0.008083965629339218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.083965803962201e-05, + "grad_norm": 3.988774538040161, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.872468113899231, + "num_tokens": 568519191.0, + "step": 14905 + }, + { + "epoch": 1.8961964126701436, + "ewc_loss": 0.008043629117310047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.043629350140691e-05, + "grad_norm": 4.021523475646973, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8806337118148804, + "num_tokens": 568554224.0, + "step": 14906 + }, + { + "epoch": 1.8963236229487341, + "ewc_loss": 0.008075675927102566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.0756755778566e-05, + "grad_norm": 3.9948642253875732, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8920466899871826, + "num_tokens": 568586102.0, + "step": 14907 + }, + { + "epoch": 1.8964508332273247, + "ewc_loss": 0.00807097740471363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070977492025122e-05, + "grad_norm": 4.0100202560424805, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8693952560424805, + "num_tokens": 568625224.0, + "step": 14908 + }, + { + "epoch": 1.8965780435059152, + "ewc_loss": 0.008082822896540165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.082822751021013e-05, + "grad_norm": 4.01890230178833, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8536462783813477, + "num_tokens": 568665588.0, + "step": 14909 + }, + { + "epoch": 1.8967052537845057, + "ewc_loss": 0.008079594001173973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.0795944086276e-05, + "grad_norm": 4.018561363220215, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8913450241088867, + "num_tokens": 568698408.0, + "step": 14910 + }, + { + "epoch": 1.8968324640630962, + "ewc_loss": 0.00808879267424345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08879267424345e-05, + "grad_norm": 4.056004524230957, + "learning_rate": 1e-06, + "loss": 0.3956, + 
"mean_token_accuracy": 0.8647858500480652, + "num_tokens": 568731982.0, + "step": 14911 + }, + { + "epoch": 1.8969596743416868, + "ewc_loss": 0.008124139159917831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.124139276333153e-05, + "grad_norm": 3.9994657039642334, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8752953410148621, + "num_tokens": 568769388.0, + "step": 14912 + }, + { + "epoch": 1.8970868846202773, + "ewc_loss": 0.00808898825198412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.088988397503272e-05, + "grad_norm": 3.9723143577575684, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8814661502838135, + "num_tokens": 568810329.0, + "step": 14913 + }, + { + "epoch": 1.8972140948988678, + "ewc_loss": 0.00810286495834589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.102865103865042e-05, + "grad_norm": 3.9857797622680664, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8805882930755615, + "num_tokens": 568852241.0, + "step": 14914 + }, + { + "epoch": 1.8973413051774584, + "ewc_loss": 0.008108202368021011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.108202746370807e-05, + "grad_norm": 4.011680603027344, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8726541996002197, + "num_tokens": 568889660.0, + "step": 14915 + }, + { + "epoch": 1.8974685154560489, + "ewc_loss": 0.008116730488836765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.116730896290392e-05, + "grad_norm": 3.9765684604644775, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8732649087905884, + "num_tokens": 568934489.0, + "step": 14916 + }, + { + "epoch": 1.8975957257346394, + "ewc_loss": 0.008098749443888664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.098749094642699e-05, + "grad_norm": 3.9708445072174072, + "learning_rate": 1e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.8964148759841919, + "num_tokens": 568971904.0, + "step": 14917 + }, + { + "epoch": 
1.89772293601323, + "ewc_loss": 0.00808618776500225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.086187881417572e-05, + "grad_norm": 4.005080223083496, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8592402935028076, + "num_tokens": 569011863.0, + "step": 14918 + }, + { + "epoch": 1.8978501462918205, + "ewc_loss": 0.008125342428684235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.125341992126778e-05, + "grad_norm": 3.960298538208008, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8961085081100464, + "num_tokens": 569050295.0, + "step": 14919 + }, + { + "epoch": 1.897977356570411, + "ewc_loss": 0.008070610463619232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070610783761367e-05, + "grad_norm": 4.031596660614014, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8705085515975952, + "num_tokens": 569087013.0, + "step": 14920 + }, + { + "epoch": 1.8981045668490015, + "ewc_loss": 0.008127626962959766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.127626642817631e-05, + "grad_norm": 4.024203777313232, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8693874478340149, + "num_tokens": 569125226.0, + "step": 14921 + }, + { + "epoch": 1.898231777127592, + "ewc_loss": 0.008099204860627651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.099205297185108e-05, + "grad_norm": 3.97238826751709, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8721350431442261, + "num_tokens": 569161910.0, + "step": 14922 + }, + { + "epoch": 1.8983589874061826, + "ewc_loss": 0.008066252805292606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066253212746233e-05, + "grad_norm": 4.053884506225586, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8792707920074463, + "num_tokens": 569196519.0, + "step": 14923 + }, + { + "epoch": 1.898486197684773, + "ewc_loss": 0.008114012889564037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.114012598525733e-05, + "grad_norm": 4.00584077835083, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8703972697257996, + "num_tokens": 569234404.0, + "step": 14924 + }, + { + "epoch": 1.8986134079633634, + "ewc_loss": 0.008066179230809212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066178997978568e-05, + "grad_norm": 3.998183488845825, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8670702576637268, + "num_tokens": 569272728.0, + "step": 14925 + }, + { + "epoch": 1.898740618241954, + "ewc_loss": 0.008086136542260647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.086136949714273e-05, + "grad_norm": 3.983825206756592, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8683268427848816, + "num_tokens": 569315434.0, + "step": 14926 + }, + { + "epoch": 1.8988678285205445, + "ewc_loss": 0.008066235110163689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066235022852197e-05, + "grad_norm": 4.003250598907471, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.86097651720047, + "num_tokens": 569357473.0, + "step": 14927 + }, + { + "epoch": 1.898995038799135, + "ewc_loss": 0.008090137504041195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.090137271210551e-05, + "grad_norm": 3.9645583629608154, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8643960356712341, + "num_tokens": 569398747.0, + "step": 14928 + }, + { + "epoch": 1.8991222490777255, + "ewc_loss": 0.008060201071202755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060201071202755e-05, + "grad_norm": 4.0058417320251465, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8829308152198792, + "num_tokens": 569434855.0, + "step": 14929 + }, + { + "epoch": 1.8992494593563158, + "ewc_loss": 0.008113452233374119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.11345234978944e-05, + "grad_norm": 3.982372283935547, + "learning_rate": 1e-06, + "loss": 0.3707, + 
"mean_token_accuracy": 0.8753262758255005, + "num_tokens": 569475236.0, + "step": 14930 + }, + { + "epoch": 1.8993766696349064, + "ewc_loss": 0.008062002249062061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.062001870712265e-05, + "grad_norm": 3.965001344680786, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8751884698867798, + "num_tokens": 569513599.0, + "step": 14931 + }, + { + "epoch": 1.899503879913497, + "ewc_loss": 0.008059735409915447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059735409915447e-05, + "grad_norm": 3.964948892593384, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8814346790313721, + "num_tokens": 569559059.0, + "step": 14932 + }, + { + "epoch": 1.8996310901920874, + "ewc_loss": 0.008062208071351051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.062208507908508e-05, + "grad_norm": 4.005975723266602, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8912273645401001, + "num_tokens": 569595240.0, + "step": 14933 + }, + { + "epoch": 1.899758300470678, + "ewc_loss": 0.00809497945010662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09497942100279e-05, + "grad_norm": 4.03383207321167, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8676437139511108, + "num_tokens": 569629336.0, + "step": 14934 + }, + { + "epoch": 1.8998855107492685, + "ewc_loss": 0.00809786468744278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.097864338196814e-05, + "grad_norm": 3.9766335487365723, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8860108852386475, + "num_tokens": 569663613.0, + "step": 14935 + }, + { + "epoch": 1.900012721027859, + "ewc_loss": 0.008057957515120506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.057957165874541e-05, + "grad_norm": 4.026859283447266, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8583951592445374, + "num_tokens": 569702884.0, + "step": 14936 + }, + { + "epoch": 
1.9001399313064495, + "ewc_loss": 0.008116889744997025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.116889512166381e-05, + "grad_norm": 3.9373624324798584, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8899331092834473, + "num_tokens": 569744618.0, + "step": 14937 + }, + { + "epoch": 1.90026714158504, + "ewc_loss": 0.008050124160945415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050124597502872e-05, + "grad_norm": 4.026086807250977, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8842331171035767, + "num_tokens": 569777083.0, + "step": 14938 + }, + { + "epoch": 1.9003943518636306, + "ewc_loss": 0.0081138014793396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.113801595754921e-05, + "grad_norm": 4.027617454528809, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8828663229942322, + "num_tokens": 569809652.0, + "step": 14939 + }, + { + "epoch": 1.9005215621422211, + "ewc_loss": 0.008062029257416725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.062029519351199e-05, + "grad_norm": 3.983252763748169, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8798174858093262, + "num_tokens": 569846465.0, + "step": 14940 + }, + { + "epoch": 1.9006487724208116, + "ewc_loss": 0.008044330403208733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.044330024858937e-05, + "grad_norm": 3.995598316192627, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8688995242118835, + "num_tokens": 569883655.0, + "step": 14941 + }, + { + "epoch": 1.9007759826994022, + "ewc_loss": 0.008066785521805286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066785812843591e-05, + "grad_norm": 3.963859796524048, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8728410005569458, + "num_tokens": 569923358.0, + "step": 14942 + }, + { + "epoch": 1.9009031929779927, + "ewc_loss": 0.008051687851548195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.051687473198399e-05, + "grad_norm": 4.015495300292969, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8726948499679565, + "num_tokens": 569961393.0, + "step": 14943 + }, + { + "epoch": 1.9010304032565832, + "ewc_loss": 0.00809524580836296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09524572105147e-05, + "grad_norm": 4.025018215179443, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8485140204429626, + "num_tokens": 570002092.0, + "step": 14944 + }, + { + "epoch": 1.9011576135351738, + "ewc_loss": 0.008072223514318466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.07222313596867e-05, + "grad_norm": 4.0404953956604, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8604221940040588, + "num_tokens": 570037699.0, + "step": 14945 + }, + { + "epoch": 1.9012848238137643, + "ewc_loss": 0.008100651204586029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.100651029963046e-05, + "grad_norm": 4.0057196617126465, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8543457984924316, + "num_tokens": 570077693.0, + "step": 14946 + }, + { + "epoch": 1.9014120340923548, + "ewc_loss": 0.008083023130893707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.083023567451164e-05, + "grad_norm": 3.960522174835205, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8935221433639526, + "num_tokens": 570115354.0, + "step": 14947 + }, + { + "epoch": 1.9015392443709451, + "ewc_loss": 0.008074251934885979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074251672951505e-05, + "grad_norm": 3.9768998622894287, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8838717937469482, + "num_tokens": 570154387.0, + "step": 14948 + }, + { + "epoch": 1.9016664546495357, + "ewc_loss": 0.008104872889816761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.104873268166557e-05, + "grad_norm": 4.029759883880615, + "learning_rate": 1e-06, + "loss": 0.3728, + 
"mean_token_accuracy": 0.8705106377601624, + "num_tokens": 570189743.0, + "step": 14949 + }, + { + "epoch": 1.9017936649281262, + "ewc_loss": 0.008115909062325954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.115908713079989e-05, + "grad_norm": 4.068043231964111, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8868386745452881, + "num_tokens": 570224009.0, + "step": 14950 + }, + { + "epoch": 1.9019208752067167, + "ewc_loss": 0.008107280358672142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.107280154945329e-05, + "grad_norm": 3.9313769340515137, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8803572654724121, + "num_tokens": 570266703.0, + "step": 14951 + }, + { + "epoch": 1.9020480854853072, + "ewc_loss": 0.008036065846681595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.036065992200747e-05, + "grad_norm": 4.032934665679932, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8739025592803955, + "num_tokens": 570308241.0, + "step": 14952 + }, + { + "epoch": 1.9021752957638978, + "ewc_loss": 0.008141817525029182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.141817670548335e-05, + "grad_norm": 3.982804298400879, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8750051259994507, + "num_tokens": 570346931.0, + "step": 14953 + }, + { + "epoch": 1.902302506042488, + "ewc_loss": 0.008057069033384323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.05706949904561e-05, + "grad_norm": 4.007923126220703, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.888744056224823, + "num_tokens": 570380373.0, + "step": 14954 + }, + { + "epoch": 1.9024297163210786, + "ewc_loss": 0.008083632215857506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.083632565103471e-05, + "grad_norm": 3.9586970806121826, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8866869211196899, + "num_tokens": 570420106.0, + "step": 14955 + }, + { + "epoch": 
1.9025569265996691, + "ewc_loss": 0.008055991493165493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055991202127188e-05, + "grad_norm": 3.9606306552886963, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8966459631919861, + "num_tokens": 570464534.0, + "step": 14956 + }, + { + "epoch": 1.9026841368782597, + "ewc_loss": 0.008058645762503147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.058646199060604e-05, + "grad_norm": 4.000327110290527, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8656578660011292, + "num_tokens": 570505734.0, + "step": 14957 + }, + { + "epoch": 1.9028113471568502, + "ewc_loss": 0.008088940754532814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08894110377878e-05, + "grad_norm": 3.974151849746704, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8853488564491272, + "num_tokens": 570545913.0, + "step": 14958 + }, + { + "epoch": 1.9029385574354407, + "ewc_loss": 0.008039225824177265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039225940592587e-05, + "grad_norm": 3.9746317863464355, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8825550079345703, + "num_tokens": 570584397.0, + "step": 14959 + }, + { + "epoch": 1.9030657677140312, + "ewc_loss": 0.008031054399907589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031054312596098e-05, + "grad_norm": 3.986084222793579, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8823727965354919, + "num_tokens": 570622285.0, + "step": 14960 + }, + { + "epoch": 1.9031929779926218, + "ewc_loss": 0.008039860054850578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039859676500782e-05, + "grad_norm": 4.063502311706543, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8600111603736877, + "num_tokens": 570656331.0, + "step": 14961 + }, + { + "epoch": 1.9033201882712123, + "ewc_loss": 0.00807221420109272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.072214404819533e-05, + "grad_norm": 3.9556431770324707, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8793782591819763, + "num_tokens": 570699944.0, + "step": 14962 + }, + { + "epoch": 1.9034473985498028, + "ewc_loss": 0.007979410700500011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.979411020642146e-05, + "grad_norm": 4.045633316040039, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8689923286437988, + "num_tokens": 570737649.0, + "step": 14963 + }, + { + "epoch": 1.9035746088283934, + "ewc_loss": 0.00807696022093296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.076959784375504e-05, + "grad_norm": 4.025792121887207, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8719714879989624, + "num_tokens": 570773193.0, + "step": 14964 + }, + { + "epoch": 1.9037018191069839, + "ewc_loss": 0.008025786839425564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.02578724687919e-05, + "grad_norm": 3.99920654296875, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8693324327468872, + "num_tokens": 570809117.0, + "step": 14965 + }, + { + "epoch": 1.9038290293855744, + "ewc_loss": 0.008022150956094265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.022150723263621e-05, + "grad_norm": 3.981419324874878, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8867475986480713, + "num_tokens": 570844268.0, + "step": 14966 + }, + { + "epoch": 1.903956239664165, + "ewc_loss": 0.00803607888519764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.036079088924453e-05, + "grad_norm": 3.982391119003296, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8688838481903076, + "num_tokens": 570890166.0, + "step": 14967 + }, + { + "epoch": 1.9040834499427555, + "ewc_loss": 0.008055062033236027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055062062339857e-05, + "grad_norm": 4.089971542358398, + "learning_rate": 1e-06, + "loss": 0.3791, + 
"mean_token_accuracy": 0.8710949420928955, + "num_tokens": 570921419.0, + "step": 14968 + }, + { + "epoch": 1.904210660221346, + "ewc_loss": 0.008107436820864677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.107436588034034e-05, + "grad_norm": 3.970613479614258, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.866109311580658, + "num_tokens": 570966444.0, + "step": 14969 + }, + { + "epoch": 1.9043378704999365, + "ewc_loss": 0.0080082006752491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.008200529729947e-05, + "grad_norm": 3.941226005554199, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8842002153396606, + "num_tokens": 571011656.0, + "step": 14970 + }, + { + "epoch": 1.904465080778527, + "ewc_loss": 0.008050015196204185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050014730542898e-05, + "grad_norm": 3.9962263107299805, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.879901111125946, + "num_tokens": 571047828.0, + "step": 14971 + }, + { + "epoch": 1.9045922910571176, + "ewc_loss": 0.008085687644779682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.085688023129478e-05, + "grad_norm": 3.9941320419311523, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8736311793327332, + "num_tokens": 571084541.0, + "step": 14972 + }, + { + "epoch": 1.9047195013357079, + "ewc_loss": 0.008066314272582531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066314330790192e-05, + "grad_norm": 4.021974563598633, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8698529601097107, + "num_tokens": 571120978.0, + "step": 14973 + }, + { + "epoch": 1.9048467116142984, + "ewc_loss": 0.008080762811005116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080762927420437e-05, + "grad_norm": 3.973362445831299, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8882547616958618, + "num_tokens": 571160181.0, + "step": 14974 + }, + { + "epoch": 
1.904973921892889, + "ewc_loss": 0.008035946637392044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035946666495875e-05, + "grad_norm": 4.01295804977417, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8809876441955566, + "num_tokens": 571194648.0, + "step": 14975 + }, + { + "epoch": 1.9051011321714795, + "ewc_loss": 0.00809631496667862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.096314559224993e-05, + "grad_norm": 3.9777822494506836, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8721401691436768, + "num_tokens": 571238449.0, + "step": 14976 + }, + { + "epoch": 1.90522834245007, + "ewc_loss": 0.008052665740251541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.052666089497507e-05, + "grad_norm": 3.9926984310150146, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8765528202056885, + "num_tokens": 571277266.0, + "step": 14977 + }, + { + "epoch": 1.9053555527286605, + "ewc_loss": 0.008076848462224007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.076848462224007e-05, + "grad_norm": 3.988680362701416, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8857043981552124, + "num_tokens": 571316891.0, + "step": 14978 + }, + { + "epoch": 1.9054827630072508, + "ewc_loss": 0.008068159222602844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068159513641149e-05, + "grad_norm": 4.036318302154541, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8772289156913757, + "num_tokens": 571353097.0, + "step": 14979 + }, + { + "epoch": 1.9056099732858414, + "ewc_loss": 0.008075779303908348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.075778896454722e-05, + "grad_norm": 3.9698421955108643, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.890650749206543, + "num_tokens": 571392217.0, + "step": 14980 + }, + { + "epoch": 1.905737183564432, + "ewc_loss": 0.008037172257900238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.037171937758103e-05, + "grad_norm": 3.992769718170166, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8837279081344604, + "num_tokens": 571430440.0, + "step": 14981 + }, + { + "epoch": 1.9058643938430224, + "ewc_loss": 0.008060031570494175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060031541390345e-05, + "grad_norm": 3.9689223766326904, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8721246719360352, + "num_tokens": 571472002.0, + "step": 14982 + }, + { + "epoch": 1.905991604121613, + "ewc_loss": 0.008023692294955254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023692498682067e-05, + "grad_norm": 4.075906276702881, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8564308881759644, + "num_tokens": 571509368.0, + "step": 14983 + }, + { + "epoch": 1.9061188144002035, + "ewc_loss": 0.00808387529104948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.083875582087785e-05, + "grad_norm": 4.042388439178467, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8613481521606445, + "num_tokens": 571543898.0, + "step": 14984 + }, + { + "epoch": 1.906246024678794, + "ewc_loss": 0.008043565787374973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.043566049309447e-05, + "grad_norm": 3.999423027038574, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8697967529296875, + "num_tokens": 571584395.0, + "step": 14985 + }, + { + "epoch": 1.9063732349573845, + "ewc_loss": 0.008028846234083176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.028846059460193e-05, + "grad_norm": 3.997434616088867, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.87510746717453, + "num_tokens": 571624792.0, + "step": 14986 + }, + { + "epoch": 1.906500445235975, + "ewc_loss": 0.008054807782173157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.054807403823361e-05, + "grad_norm": 3.953793525695801, + "learning_rate": 1e-06, + "loss": 0.3562, + 
"mean_token_accuracy": 0.876996636390686, + "num_tokens": 571667197.0, + "step": 14987 + }, + { + "epoch": 1.9066276555145656, + "ewc_loss": 0.008025864139199257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.0258643720299e-05, + "grad_norm": 4.011839389801025, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8706637620925903, + "num_tokens": 571703648.0, + "step": 14988 + }, + { + "epoch": 1.9067548657931561, + "ewc_loss": 0.008078133687376976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.078133396338671e-05, + "grad_norm": 4.107056140899658, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8678410053253174, + "num_tokens": 571738980.0, + "step": 14989 + }, + { + "epoch": 1.9068820760717466, + "ewc_loss": 0.008097532205283642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.097532554529607e-05, + "grad_norm": 3.9928369522094727, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8793357610702515, + "num_tokens": 571774540.0, + "step": 14990 + }, + { + "epoch": 1.9070092863503372, + "ewc_loss": 0.008020970970392227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.020971290534362e-05, + "grad_norm": 4.060522556304932, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8805469274520874, + "num_tokens": 571805286.0, + "step": 14991 + }, + { + "epoch": 1.9071364966289277, + "ewc_loss": 0.008096873760223389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.096873352769762e-05, + "grad_norm": 3.937288522720337, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8687600493431091, + "num_tokens": 571847936.0, + "step": 14992 + }, + { + "epoch": 1.9072637069075182, + "ewc_loss": 0.008011255413293839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.011254976736382e-05, + "grad_norm": 4.063010215759277, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8722384572029114, + "num_tokens": 571885454.0, + "step": 14993 + }, + { + "epoch": 
1.9073909171861088, + "ewc_loss": 0.008123677223920822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.123677253024653e-05, + "grad_norm": 3.9641664028167725, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8748800754547119, + "num_tokens": 571924870.0, + "step": 14994 + }, + { + "epoch": 1.9075181274646993, + "ewc_loss": 0.00802933145314455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.029331365833059e-05, + "grad_norm": 4.08643102645874, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8553031086921692, + "num_tokens": 571957870.0, + "step": 14995 + }, + { + "epoch": 1.9076453377432898, + "ewc_loss": 0.008139684796333313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.139684359775856e-05, + "grad_norm": 4.0507378578186035, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8681485056877136, + "num_tokens": 571993645.0, + "step": 14996 + }, + { + "epoch": 1.9077725480218801, + "ewc_loss": 0.008062555454671383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.062555571086705e-05, + "grad_norm": 4.035172462463379, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8840540647506714, + "num_tokens": 572028594.0, + "step": 14997 + }, + { + "epoch": 1.9078997583004706, + "ewc_loss": 0.008094867691397667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.094868098851293e-05, + "grad_norm": 4.026482582092285, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8500543832778931, + "num_tokens": 572066971.0, + "step": 14998 + }, + { + "epoch": 1.9080269685790612, + "ewc_loss": 0.008102914318442345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.102914580376819e-05, + "grad_norm": 3.9466755390167236, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8795130252838135, + "num_tokens": 572107609.0, + "step": 14999 + }, + { + "epoch": 1.9081541788576517, + "ewc_loss": 0.008057771250605583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.057770901359618e-05, + "grad_norm": 4.032569885253906, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8844794034957886, + "num_tokens": 572146940.0, + "step": 15000 + }, + { + "epoch": 1.9082813891362422, + "ewc_loss": 0.008136061020195484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.136060932883993e-05, + "grad_norm": 3.941437005996704, + "learning_rate": 1e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.9000470042228699, + "num_tokens": 572184010.0, + "step": 15001 + }, + { + "epoch": 1.9084085994148328, + "ewc_loss": 0.00806758739054203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067587623372674e-05, + "grad_norm": 4.055777549743652, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8595727682113647, + "num_tokens": 572223184.0, + "step": 15002 + }, + { + "epoch": 1.908535809693423, + "ewc_loss": 0.008169448003172874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.169448119588196e-05, + "grad_norm": 3.9793031215667725, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8766610622406006, + "num_tokens": 572261770.0, + "step": 15003 + }, + { + "epoch": 1.9086630199720136, + "ewc_loss": 0.008059586398303509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059586252784356e-05, + "grad_norm": 3.9794178009033203, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8774677515029907, + "num_tokens": 572303229.0, + "step": 15004 + }, + { + "epoch": 1.9087902302506041, + "ewc_loss": 0.008105971850454807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.105971937766299e-05, + "grad_norm": 4.080143928527832, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8636479377746582, + "num_tokens": 572337839.0, + "step": 15005 + }, + { + "epoch": 1.9089174405291947, + "ewc_loss": 0.008172732777893543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.172733214451e-05, + "grad_norm": 4.028981685638428, + "learning_rate": 1e-06, + "loss": 0.3566, + 
"mean_token_accuracy": 0.8755593299865723, + "num_tokens": 572373604.0, + "step": 15006 + }, + { + "epoch": 1.9090446508077852, + "ewc_loss": 0.008079875260591507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.079875260591507e-05, + "grad_norm": 3.9742579460144043, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8839985132217407, + "num_tokens": 572410425.0, + "step": 15007 + }, + { + "epoch": 1.9091718610863757, + "ewc_loss": 0.008089710026979446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.089710172498599e-05, + "grad_norm": 4.00824499130249, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8840433359146118, + "num_tokens": 572443998.0, + "step": 15008 + }, + { + "epoch": 1.9092990713649662, + "ewc_loss": 0.008109179325401783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.109179179882631e-05, + "grad_norm": 3.9981207847595215, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8624510169029236, + "num_tokens": 572485337.0, + "step": 15009 + }, + { + "epoch": 1.9094262816435568, + "ewc_loss": 0.008099083788692951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.099083788692951e-05, + "grad_norm": 4.032410144805908, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8653639554977417, + "num_tokens": 572528273.0, + "step": 15010 + }, + { + "epoch": 1.9095534919221473, + "ewc_loss": 0.008112486451864243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.112486102618277e-05, + "grad_norm": 3.9928605556488037, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8781458139419556, + "num_tokens": 572569467.0, + "step": 15011 + }, + { + "epoch": 1.9096807022007378, + "ewc_loss": 0.008080816827714443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080816769506782e-05, + "grad_norm": 3.96679425239563, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8826773166656494, + "num_tokens": 572609626.0, + "step": 15012 + }, + { + "epoch": 
1.9098079124793284, + "ewc_loss": 0.008060027845203876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060027903411537e-05, + "grad_norm": 3.996659994125366, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8596857786178589, + "num_tokens": 572652907.0, + "step": 15013 + }, + { + "epoch": 1.9099351227579189, + "ewc_loss": 0.008110185153782368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.110185444820672e-05, + "grad_norm": 4.072879314422607, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8794902563095093, + "num_tokens": 572687460.0, + "step": 15014 + }, + { + "epoch": 1.9100623330365094, + "ewc_loss": 0.008135664276778698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.13566439319402e-05, + "grad_norm": 3.9844741821289062, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8772796392440796, + "num_tokens": 572723937.0, + "step": 15015 + }, + { + "epoch": 1.9101895433151, + "ewc_loss": 0.00806236919015646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.062369306571782e-05, + "grad_norm": 3.9513518810272217, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.872087836265564, + "num_tokens": 572764929.0, + "step": 15016 + }, + { + "epoch": 1.9103167535936905, + "ewc_loss": 0.008076303638517857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.076303492998704e-05, + "grad_norm": 3.9964823722839355, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8740852475166321, + "num_tokens": 572802545.0, + "step": 15017 + }, + { + "epoch": 1.910443963872281, + "ewc_loss": 0.008101426064968109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.101425919448957e-05, + "grad_norm": 3.924062967300415, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8707098960876465, + "num_tokens": 572850008.0, + "step": 15018 + }, + { + "epoch": 1.9105711741508715, + "ewc_loss": 0.008052486926317215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.052487100940198e-05, + "grad_norm": 4.007594585418701, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8839856386184692, + "num_tokens": 572888968.0, + "step": 15019 + }, + { + "epoch": 1.910698384429462, + "ewc_loss": 0.008121111430227757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.121111750369892e-05, + "grad_norm": 4.0080060958862305, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8739680647850037, + "num_tokens": 572926314.0, + "step": 15020 + }, + { + "epoch": 1.9108255947080524, + "ewc_loss": 0.008070996031165123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070996409514919e-05, + "grad_norm": 4.028360366821289, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8793361186981201, + "num_tokens": 572961789.0, + "step": 15021 + }, + { + "epoch": 1.9109528049866429, + "ewc_loss": 0.008106299676001072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.106300083454698e-05, + "grad_norm": 4.0240864753723145, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8771957159042358, + "num_tokens": 572993680.0, + "step": 15022 + }, + { + "epoch": 1.9110800152652334, + "ewc_loss": 0.008093347772955894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093347423709929e-05, + "grad_norm": 4.048290252685547, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8777056932449341, + "num_tokens": 573024003.0, + "step": 15023 + }, + { + "epoch": 1.911207225543824, + "ewc_loss": 0.008098411373794079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.0984114902094e-05, + "grad_norm": 4.017544269561768, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8836807012557983, + "num_tokens": 573055138.0, + "step": 15024 + }, + { + "epoch": 1.9113344358224145, + "ewc_loss": 0.008106394670903683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.106394670903683e-05, + "grad_norm": 3.9601027965545654, + "learning_rate": 1e-06, + "loss": 0.3562, + 
"mean_token_accuracy": 0.8789546489715576, + "num_tokens": 573092373.0, + "step": 15025 + }, + { + "epoch": 1.911461646101005, + "ewc_loss": 0.008082215674221516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08221593615599e-05, + "grad_norm": 4.0138678550720215, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8705542087554932, + "num_tokens": 573131486.0, + "step": 15026 + }, + { + "epoch": 1.9115888563795955, + "ewc_loss": 0.008139925077557564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.139925193972886e-05, + "grad_norm": 3.95697283744812, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8851258754730225, + "num_tokens": 573170114.0, + "step": 15027 + }, + { + "epoch": 1.9117160666581858, + "ewc_loss": 0.008101079612970352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.101079583866522e-05, + "grad_norm": 4.096465110778809, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8793179988861084, + "num_tokens": 573200906.0, + "step": 15028 + }, + { + "epoch": 1.9118432769367764, + "ewc_loss": 0.008194652386009693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.194652036763728e-05, + "grad_norm": 3.960655927658081, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.864467203617096, + "num_tokens": 573243731.0, + "step": 15029 + }, + { + "epoch": 1.9119704872153669, + "ewc_loss": 0.00806933268904686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069332398008555e-05, + "grad_norm": 3.972165584564209, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8807140588760376, + "num_tokens": 573283335.0, + "step": 15030 + }, + { + "epoch": 1.9120976974939574, + "ewc_loss": 0.008135476149618626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.135475945891812e-05, + "grad_norm": 3.998101234436035, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8619576692581177, + "num_tokens": 573326628.0, + "step": 15031 + }, + { + "epoch": 
1.912224907772548, + "ewc_loss": 0.00812438502907753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.124385203700513e-05, + "grad_norm": 3.973344564437866, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8801725506782532, + "num_tokens": 573369025.0, + "step": 15032 + }, + { + "epoch": 1.9123521180511385, + "ewc_loss": 0.008097270503640175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.097270620055497e-05, + "grad_norm": 3.984731435775757, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.869361162185669, + "num_tokens": 573411923.0, + "step": 15033 + }, + { + "epoch": 1.912479328329729, + "ewc_loss": 0.008091917261481285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.091917698038742e-05, + "grad_norm": 4.0318498611450195, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8707486391067505, + "num_tokens": 573450455.0, + "step": 15034 + }, + { + "epoch": 1.9126065386083195, + "ewc_loss": 0.00811633188277483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.116332173813134e-05, + "grad_norm": 3.9868078231811523, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8679676055908203, + "num_tokens": 573492886.0, + "step": 15035 + }, + { + "epoch": 1.91273374888691, + "ewc_loss": 0.008062457665801048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.062458073254675e-05, + "grad_norm": 3.9794299602508545, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8814911842346191, + "num_tokens": 573532073.0, + "step": 15036 + }, + { + "epoch": 1.9128609591655006, + "ewc_loss": 0.008073166012763977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.073166100075468e-05, + "grad_norm": 3.9628241062164307, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8901612758636475, + "num_tokens": 573573632.0, + "step": 15037 + }, + { + "epoch": 1.9129881694440911, + "ewc_loss": 0.008064107038080692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.064106805250049e-05, + "grad_norm": 3.984222888946533, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8847075700759888, + "num_tokens": 573610194.0, + "step": 15038 + }, + { + "epoch": 1.9131153797226816, + "ewc_loss": 0.008056438528001308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.056438673520461e-05, + "grad_norm": 4.01422119140625, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8687797784805298, + "num_tokens": 573650237.0, + "step": 15039 + }, + { + "epoch": 1.9132425900012722, + "ewc_loss": 0.008059944026172161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059944229898974e-05, + "grad_norm": 4.0119500160217285, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8795774579048157, + "num_tokens": 573686859.0, + "step": 15040 + }, + { + "epoch": 1.9133698002798627, + "ewc_loss": 0.008053493686020374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.053493365878239e-05, + "grad_norm": 3.99875545501709, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8772950172424316, + "num_tokens": 573726770.0, + "step": 15041 + }, + { + "epoch": 1.9134970105584532, + "ewc_loss": 0.008028959855437279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.028960291994736e-05, + "grad_norm": 4.006836414337158, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8713632225990295, + "num_tokens": 573768668.0, + "step": 15042 + }, + { + "epoch": 1.9136242208370438, + "ewc_loss": 0.008036824874579906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.036824874579906e-05, + "grad_norm": 4.050967216491699, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8666380643844604, + "num_tokens": 573807880.0, + "step": 15043 + }, + { + "epoch": 1.9137514311156343, + "ewc_loss": 0.008054961450397968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.054961654124781e-05, + "grad_norm": 3.9887003898620605, + "learning_rate": 1e-06, + "loss": 0.367, + 
"mean_token_accuracy": 0.87327641248703, + "num_tokens": 573847018.0, + "step": 15044 + }, + { + "epoch": 1.9138786413942248, + "ewc_loss": 0.00799255445599556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.992554310476407e-05, + "grad_norm": 4.021614074707031, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8906158208847046, + "num_tokens": 573881193.0, + "step": 15045 + }, + { + "epoch": 1.9140058516728151, + "ewc_loss": 0.008046552538871765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.046552102314308e-05, + "grad_norm": 3.9868106842041016, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8840073943138123, + "num_tokens": 573918457.0, + "step": 15046 + }, + { + "epoch": 1.9141330619514056, + "ewc_loss": 0.008010176941752434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01017667981796e-05, + "grad_norm": 3.9798614978790283, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.894451379776001, + "num_tokens": 573958818.0, + "step": 15047 + }, + { + "epoch": 1.9142602722299962, + "ewc_loss": 0.008023413829505444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.023413829505444e-05, + "grad_norm": 4.015057563781738, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8708921074867249, + "num_tokens": 573996735.0, + "step": 15048 + }, + { + "epoch": 1.9143874825085867, + "ewc_loss": 0.008035845123231411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035844803089276e-05, + "grad_norm": 3.993999481201172, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8796994090080261, + "num_tokens": 574035005.0, + "step": 15049 + }, + { + "epoch": 1.9145146927871772, + "ewc_loss": 0.008011569269001484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.011569298105314e-05, + "grad_norm": 3.9657790660858154, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8742874264717102, + "num_tokens": 574077817.0, + "step": 15050 + }, + { + "epoch": 
1.9146419030657678, + "ewc_loss": 0.008020046167075634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.020045788725838e-05, + "grad_norm": 4.05834436416626, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.883460283279419, + "num_tokens": 574113083.0, + "step": 15051 + }, + { + "epoch": 1.914769113344358, + "ewc_loss": 0.008085383102297783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.085383160505444e-05, + "grad_norm": 4.020249843597412, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8804764747619629, + "num_tokens": 574147324.0, + "step": 15052 + }, + { + "epoch": 1.9148963236229486, + "ewc_loss": 0.00802628230303526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026282011996955e-05, + "grad_norm": 3.9861481189727783, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8910573720932007, + "num_tokens": 574186108.0, + "step": 15053 + }, + { + "epoch": 1.9150235339015391, + "ewc_loss": 0.008022347465157509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.022347174119204e-05, + "grad_norm": 4.031778812408447, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.888698160648346, + "num_tokens": 574218076.0, + "step": 15054 + }, + { + "epoch": 1.9151507441801296, + "ewc_loss": 0.00807502493262291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.075025107245892e-05, + "grad_norm": 3.985631227493286, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8855827450752258, + "num_tokens": 574256416.0, + "step": 15055 + }, + { + "epoch": 1.9152779544587202, + "ewc_loss": 0.008012818172574043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.01281858002767e-05, + "grad_norm": 3.9879846572875977, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8893464803695679, + "num_tokens": 574295262.0, + "step": 15056 + }, + { + "epoch": 1.9154051647373107, + "ewc_loss": 0.008037419989705086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.037420047912747e-05, + "grad_norm": 4.028441429138184, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8650203943252563, + "num_tokens": 574329836.0, + "step": 15057 + }, + { + "epoch": 1.9155323750159012, + "ewc_loss": 0.008067999966442585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068000170169398e-05, + "grad_norm": 4.06876802444458, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.879181981086731, + "num_tokens": 574363189.0, + "step": 15058 + }, + { + "epoch": 1.9156595852944918, + "ewc_loss": 0.00808516051620245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08516051620245e-05, + "grad_norm": 4.023873805999756, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.884570837020874, + "num_tokens": 574396374.0, + "step": 15059 + }, + { + "epoch": 1.9157867955730823, + "ewc_loss": 0.008037461899220943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.037461520871148e-05, + "grad_norm": 4.012520790100098, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8785549402236938, + "num_tokens": 574435488.0, + "step": 15060 + }, + { + "epoch": 1.9159140058516728, + "ewc_loss": 0.00806085579097271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060855907388031e-05, + "grad_norm": 3.996220588684082, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8772994875907898, + "num_tokens": 574475261.0, + "step": 15061 + }, + { + "epoch": 1.9160412161302633, + "ewc_loss": 0.008030303753912449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.030303433770314e-05, + "grad_norm": 3.977571725845337, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8944761753082275, + "num_tokens": 574513531.0, + "step": 15062 + }, + { + "epoch": 1.9161684264088539, + "ewc_loss": 0.008030969649553299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.030969911487773e-05, + "grad_norm": 4.0283379554748535, + "learning_rate": 1e-06, + "loss": 0.3993, + 
"mean_token_accuracy": 0.8622418642044067, + "num_tokens": 574551736.0, + "step": 15063 + }, + { + "epoch": 1.9162956366874444, + "ewc_loss": 0.008057153783738613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.057153900153935e-05, + "grad_norm": 4.004949569702148, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8716504573822021, + "num_tokens": 574589978.0, + "step": 15064 + }, + { + "epoch": 1.916422846966035, + "ewc_loss": 0.008036612533032894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.036612416617572e-05, + "grad_norm": 4.0245747566223145, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8564242720603943, + "num_tokens": 574628845.0, + "step": 15065 + }, + { + "epoch": 1.9165500572446255, + "ewc_loss": 0.008058055303990841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.058055391302332e-05, + "grad_norm": 4.0876383781433105, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8722716569900513, + "num_tokens": 574660700.0, + "step": 15066 + }, + { + "epoch": 1.916677267523216, + "ewc_loss": 0.008081928826868534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08192853583023e-05, + "grad_norm": 4.075425624847412, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8684598803520203, + "num_tokens": 574692195.0, + "step": 15067 + }, + { + "epoch": 1.9168044778018065, + "ewc_loss": 0.008068297058343887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068297029240057e-05, + "grad_norm": 3.9515600204467773, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8650733232498169, + "num_tokens": 574732462.0, + "step": 15068 + }, + { + "epoch": 1.916931688080397, + "ewc_loss": 0.008031661622226238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031661855056882e-05, + "grad_norm": 4.008913516998291, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8834600448608398, + "num_tokens": 574769353.0, + "step": 15069 + }, + { + "epoch": 
1.9170588983589874, + "ewc_loss": 0.008126193657517433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.126193279167637e-05, + "grad_norm": 4.108941555023193, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8761518597602844, + "num_tokens": 574804958.0, + "step": 15070 + }, + { + "epoch": 1.9171861086375779, + "ewc_loss": 0.00813809409737587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.138094563037157e-05, + "grad_norm": 3.979311943054199, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8754478096961975, + "num_tokens": 574843966.0, + "step": 15071 + }, + { + "epoch": 1.9173133189161684, + "ewc_loss": 0.008051013574004173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.051013719523326e-05, + "grad_norm": 3.9901466369628906, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8790379166603088, + "num_tokens": 574882916.0, + "step": 15072 + }, + { + "epoch": 1.917440529194759, + "ewc_loss": 0.00810404121875763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.104040898615494e-05, + "grad_norm": 3.986781597137451, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8839132189750671, + "num_tokens": 574922154.0, + "step": 15073 + }, + { + "epoch": 1.9175677394733495, + "ewc_loss": 0.00810133758932352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.101337152766064e-05, + "grad_norm": 4.040963649749756, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8724359273910522, + "num_tokens": 574956894.0, + "step": 15074 + }, + { + "epoch": 1.91769494975194, + "ewc_loss": 0.008126907050609589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.126907050609589e-05, + "grad_norm": 3.9801604747772217, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.890308141708374, + "num_tokens": 574998343.0, + "step": 15075 + }, + { + "epoch": 1.9178221600305305, + "ewc_loss": 0.008073213510215282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.07321339379996e-05, + "grad_norm": 4.009346008300781, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8846023082733154, + "num_tokens": 575036689.0, + "step": 15076 + }, + { + "epoch": 1.9179493703091208, + "ewc_loss": 0.008110569790005684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.110569615382701e-05, + "grad_norm": 4.001636505126953, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8719766139984131, + "num_tokens": 575075510.0, + "step": 15077 + }, + { + "epoch": 1.9180765805877114, + "ewc_loss": 0.008091743104159832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.091743075056002e-05, + "grad_norm": 4.007948875427246, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.87359619140625, + "num_tokens": 575115797.0, + "step": 15078 + }, + { + "epoch": 1.9182037908663019, + "ewc_loss": 0.008086572401225567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.086572051979601e-05, + "grad_norm": 4.034952163696289, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8628212213516235, + "num_tokens": 575155240.0, + "step": 15079 + }, + { + "epoch": 1.9183310011448924, + "ewc_loss": 0.008117783814668655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.117783727357164e-05, + "grad_norm": 4.057742595672607, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8633265495300293, + "num_tokens": 575191048.0, + "step": 15080 + }, + { + "epoch": 1.918458211423483, + "ewc_loss": 0.008105389773845673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.105389861157164e-05, + "grad_norm": 4.014498233795166, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8640984296798706, + "num_tokens": 575227778.0, + "step": 15081 + }, + { + "epoch": 1.9185854217020735, + "ewc_loss": 0.008095287717878819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.095287921605632e-05, + "grad_norm": 3.9637155532836914, + "learning_rate": 1e-06, + "loss": 0.3618, + 
"mean_token_accuracy": 0.8770492672920227, + "num_tokens": 575269774.0, + "step": 15082 + }, + { + "epoch": 1.918712631980664, + "ewc_loss": 0.008062051609158516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.062052074819803e-05, + "grad_norm": 3.9935903549194336, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8685425519943237, + "num_tokens": 575310710.0, + "step": 15083 + }, + { + "epoch": 1.9188398422592545, + "ewc_loss": 0.008123951964080334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.123952284222469e-05, + "grad_norm": 4.023397445678711, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8822894096374512, + "num_tokens": 575344073.0, + "step": 15084 + }, + { + "epoch": 1.918967052537845, + "ewc_loss": 0.008107813075184822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.107812755042687e-05, + "grad_norm": 3.9499247074127197, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8689764738082886, + "num_tokens": 575386737.0, + "step": 15085 + }, + { + "epoch": 1.9190942628164356, + "ewc_loss": 0.008080576546490192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080576662905514e-05, + "grad_norm": 3.977922201156616, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8804370760917664, + "num_tokens": 575424287.0, + "step": 15086 + }, + { + "epoch": 1.919221473095026, + "ewc_loss": 0.008134099654853344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.13410006230697e-05, + "grad_norm": 3.997490644454956, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8782612085342407, + "num_tokens": 575463859.0, + "step": 15087 + }, + { + "epoch": 1.9193486833736166, + "ewc_loss": 0.0081172501668334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.117250399664044e-05, + "grad_norm": 4.055230617523193, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8707619905471802, + "num_tokens": 575500838.0, + "step": 15088 + }, + { + "epoch": 
1.9194758936522072, + "ewc_loss": 0.008142612874507904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.142612932715565e-05, + "grad_norm": 4.042006015777588, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8680174946784973, + "num_tokens": 575535164.0, + "step": 15089 + }, + { + "epoch": 1.9196031039307977, + "ewc_loss": 0.008128282614052296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.128282934194431e-05, + "grad_norm": 3.9947264194488525, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8830735683441162, + "num_tokens": 575569493.0, + "step": 15090 + }, + { + "epoch": 1.9197303142093882, + "ewc_loss": 0.008112668991088867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.112669456750154e-05, + "grad_norm": 4.083927154541016, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8623261451721191, + "num_tokens": 575603396.0, + "step": 15091 + }, + { + "epoch": 1.9198575244879788, + "ewc_loss": 0.008190960623323917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.190960943466052e-05, + "grad_norm": 4.044175148010254, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.882468581199646, + "num_tokens": 575638255.0, + "step": 15092 + }, + { + "epoch": 1.9199847347665693, + "ewc_loss": 0.008130610920488834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.13061124063097e-05, + "grad_norm": 3.9547810554504395, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8657232522964478, + "num_tokens": 575685281.0, + "step": 15093 + }, + { + "epoch": 1.9201119450451598, + "ewc_loss": 0.00808519497513771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.085195440798998e-05, + "grad_norm": 3.9946272373199463, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8835670948028564, + "num_tokens": 575725030.0, + "step": 15094 + }, + { + "epoch": 1.9202391553237501, + "ewc_loss": 0.008149759843945503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.149760105879977e-05, + "grad_norm": 4.010900497436523, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8719938397407532, + "num_tokens": 575764237.0, + "step": 15095 + }, + { + "epoch": 1.9203663656023406, + "ewc_loss": 0.00812719389796257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.127193723339587e-05, + "grad_norm": 4.001648902893066, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.874062180519104, + "num_tokens": 575796732.0, + "step": 15096 + }, + { + "epoch": 1.9204935758809312, + "ewc_loss": 0.008134787902235985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.134788367897272e-05, + "grad_norm": 4.023660182952881, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.880190372467041, + "num_tokens": 575833133.0, + "step": 15097 + }, + { + "epoch": 1.9206207861595217, + "ewc_loss": 0.008149709552526474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.149709174176678e-05, + "grad_norm": 3.998295783996582, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8719110488891602, + "num_tokens": 575870920.0, + "step": 15098 + }, + { + "epoch": 1.9207479964381122, + "ewc_loss": 0.008120285347104073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.12028520158492e-05, + "grad_norm": 4.026546001434326, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8797420859336853, + "num_tokens": 575909132.0, + "step": 15099 + }, + { + "epoch": 1.9208752067167028, + "ewc_loss": 0.008144613355398178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.144613821059465e-05, + "grad_norm": 3.992215633392334, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8765454888343811, + "num_tokens": 575947979.0, + "step": 15100 + }, + { + "epoch": 1.921002416995293, + "ewc_loss": 0.008116931654512882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.116931712720543e-05, + "grad_norm": 4.067226886749268, + "learning_rate": 1e-06, + "loss": 0.392, + 
"mean_token_accuracy": 0.8649014234542847, + "num_tokens": 575988888.0, + "step": 15101 + }, + { + "epoch": 1.9211296272738836, + "ewc_loss": 0.008150354959070683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.150354551617056e-05, + "grad_norm": 3.9748387336730957, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.877836287021637, + "num_tokens": 576028614.0, + "step": 15102 + }, + { + "epoch": 1.9212568375524741, + "ewc_loss": 0.00808400847017765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.084008732112125e-05, + "grad_norm": 3.9888129234313965, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.9030344486236572, + "num_tokens": 576066648.0, + "step": 15103 + }, + { + "epoch": 1.9213840478310646, + "ewc_loss": 0.008093750104308128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093750511761755e-05, + "grad_norm": 3.9980530738830566, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8734314441680908, + "num_tokens": 576104045.0, + "step": 15104 + }, + { + "epoch": 1.9215112581096552, + "ewc_loss": 0.008106742985546589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.106743189273402e-05, + "grad_norm": 3.985183000564575, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8679684996604919, + "num_tokens": 576145419.0, + "step": 15105 + }, + { + "epoch": 1.9216384683882457, + "ewc_loss": 0.00807336624711752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.073366188909858e-05, + "grad_norm": 3.9538278579711914, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8725395202636719, + "num_tokens": 576191410.0, + "step": 15106 + }, + { + "epoch": 1.9217656786668362, + "ewc_loss": 0.008075966499745846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.075966616161168e-05, + "grad_norm": 4.023268699645996, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8697881698608398, + "num_tokens": 576226383.0, + "step": 15107 + }, + { + "epoch": 
1.9218928889454268, + "ewc_loss": 0.00811039935797453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.110399357974529e-05, + "grad_norm": 4.051797389984131, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8682548403739929, + "num_tokens": 576259958.0, + "step": 15108 + }, + { + "epoch": 1.9220200992240173, + "ewc_loss": 0.008120039477944374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.120039274217561e-05, + "grad_norm": 3.9996445178985596, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.878210186958313, + "num_tokens": 576299467.0, + "step": 15109 + }, + { + "epoch": 1.9221473095026078, + "ewc_loss": 0.008077532052993774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.07753240223974e-05, + "grad_norm": 4.060958385467529, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8687642812728882, + "num_tokens": 576334137.0, + "step": 15110 + }, + { + "epoch": 1.9222745197811983, + "ewc_loss": 0.00813552737236023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.135527605190873e-05, + "grad_norm": 3.956118106842041, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8732247352600098, + "num_tokens": 576379582.0, + "step": 15111 + }, + { + "epoch": 1.9224017300597889, + "ewc_loss": 0.008049805648624897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.049805910559371e-05, + "grad_norm": 4.02644157409668, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8903970718383789, + "num_tokens": 576414590.0, + "step": 15112 + }, + { + "epoch": 1.9225289403383794, + "ewc_loss": 0.008141164667904377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.141164289554581e-05, + "grad_norm": 4.08010196685791, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8706352114677429, + "num_tokens": 576445023.0, + "step": 15113 + }, + { + "epoch": 1.92265615061697, + "ewc_loss": 0.00813916977494955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.139169949572533e-05, + "grad_norm": 4.053765296936035, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8728477358818054, + "num_tokens": 576477861.0, + "step": 15114 + }, + { + "epoch": 1.9227833608955605, + "ewc_loss": 0.00810422282665968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.104222797555849e-05, + "grad_norm": 4.010008335113525, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.872665286064148, + "num_tokens": 576513121.0, + "step": 15115 + }, + { + "epoch": 1.922910571174151, + "ewc_loss": 0.008113977499306202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.113977673929185e-05, + "grad_norm": 4.002084255218506, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8812868595123291, + "num_tokens": 576551039.0, + "step": 15116 + }, + { + "epoch": 1.9230377814527415, + "ewc_loss": 0.008122803643345833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.122803410515189e-05, + "grad_norm": 4.04105281829834, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8675077557563782, + "num_tokens": 576590651.0, + "step": 15117 + }, + { + "epoch": 1.923164991731332, + "ewc_loss": 0.008139564655721188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.139564306475222e-05, + "grad_norm": 4.031003475189209, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8676932454109192, + "num_tokens": 576629119.0, + "step": 15118 + }, + { + "epoch": 1.9232922020099223, + "ewc_loss": 0.008116212673485279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.116212848108262e-05, + "grad_norm": 3.981731414794922, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8844718933105469, + "num_tokens": 576664240.0, + "step": 15119 + }, + { + "epoch": 1.9234194122885129, + "ewc_loss": 0.008111310191452503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.111310307867825e-05, + "grad_norm": 4.015328407287598, + "learning_rate": 1e-06, + "loss": 0.3749, + 
"mean_token_accuracy": 0.8685775995254517, + "num_tokens": 576701946.0, + "step": 15120 + }, + { + "epoch": 1.9235466225671034, + "ewc_loss": 0.008131284266710281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.131284266710281e-05, + "grad_norm": 4.018126010894775, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8870276808738708, + "num_tokens": 576733541.0, + "step": 15121 + }, + { + "epoch": 1.923673832845694, + "ewc_loss": 0.008135698735713959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.135698590194806e-05, + "grad_norm": 4.07399320602417, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8634335994720459, + "num_tokens": 576770930.0, + "step": 15122 + }, + { + "epoch": 1.9238010431242845, + "ewc_loss": 0.008162075653672218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.162075391737744e-05, + "grad_norm": 3.985326051712036, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.874966561794281, + "num_tokens": 576807455.0, + "step": 15123 + }, + { + "epoch": 1.923928253402875, + "ewc_loss": 0.008092242293059826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.092242205748335e-05, + "grad_norm": 3.9355781078338623, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8820157051086426, + "num_tokens": 576849577.0, + "step": 15124 + }, + { + "epoch": 1.9240554636814655, + "ewc_loss": 0.008106832392513752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.106832683552057e-05, + "grad_norm": 4.060823440551758, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8686598539352417, + "num_tokens": 576883506.0, + "step": 15125 + }, + { + "epoch": 1.9241826739600558, + "ewc_loss": 0.008200323209166527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.200323645723984e-05, + "grad_norm": 3.989954710006714, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8598403334617615, + "num_tokens": 576923892.0, + "step": 15126 + }, + { + "epoch": 
1.9243098842386464, + "ewc_loss": 0.008103306405246258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.103306026896462e-05, + "grad_norm": 3.9128005504608154, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8909763693809509, + "num_tokens": 576970832.0, + "step": 15127 + }, + { + "epoch": 1.9244370945172369, + "ewc_loss": 0.00810038112103939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.10038109193556e-05, + "grad_norm": 4.050110816955566, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8600287437438965, + "num_tokens": 577006990.0, + "step": 15128 + }, + { + "epoch": 1.9245643047958274, + "ewc_loss": 0.008198278024792671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.198278374038637e-05, + "grad_norm": 4.00437068939209, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8802860379219055, + "num_tokens": 577044932.0, + "step": 15129 + }, + { + "epoch": 1.924691515074418, + "ewc_loss": 0.008084478788077831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.084478758974001e-05, + "grad_norm": 4.099480152130127, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8602616786956787, + "num_tokens": 577076217.0, + "step": 15130 + }, + { + "epoch": 1.9248187253530085, + "ewc_loss": 0.008191462606191635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.19146225694567e-05, + "grad_norm": 3.9743614196777344, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.890926718711853, + "num_tokens": 577112696.0, + "step": 15131 + }, + { + "epoch": 1.924945935631599, + "ewc_loss": 0.008073407225310802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.073406934272498e-05, + "grad_norm": 3.954246997833252, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8754057884216309, + "num_tokens": 577156397.0, + "step": 15132 + }, + { + "epoch": 1.9250731459101895, + "ewc_loss": 0.008091758005321026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.091758354566991e-05, + "grad_norm": 3.970046043395996, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8789787292480469, + "num_tokens": 577194192.0, + "step": 15133 + }, + { + "epoch": 1.92520035618878, + "ewc_loss": 0.008118458092212677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.118458208627999e-05, + "grad_norm": 4.016040325164795, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8709787130355835, + "num_tokens": 577232639.0, + "step": 15134 + }, + { + "epoch": 1.9253275664673706, + "ewc_loss": 0.008141308091580868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.141308353515342e-05, + "grad_norm": 4.046806812286377, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8850272297859192, + "num_tokens": 577267349.0, + "step": 15135 + }, + { + "epoch": 1.925454776745961, + "ewc_loss": 0.008128644898533821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.128645276883617e-05, + "grad_norm": 4.027501583099365, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8707311153411865, + "num_tokens": 577308329.0, + "step": 15136 + }, + { + "epoch": 1.9255819870245516, + "ewc_loss": 0.008101125247776508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.101125422399491e-05, + "grad_norm": 3.9799225330352783, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8768793940544128, + "num_tokens": 577347825.0, + "step": 15137 + }, + { + "epoch": 1.9257091973031422, + "ewc_loss": 0.008077798411250114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.077797974692658e-05, + "grad_norm": 4.056157112121582, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.875791072845459, + "num_tokens": 577379981.0, + "step": 15138 + }, + { + "epoch": 1.9258364075817327, + "ewc_loss": 0.008142873644828796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.142873411998153e-05, + "grad_norm": 4.055145263671875, + "learning_rate": 1e-06, + "loss": 0.3515, + 
"mean_token_accuracy": 0.8795288801193237, + "num_tokens": 577414134.0, + "step": 15139 + }, + { + "epoch": 1.9259636178603232, + "ewc_loss": 0.008119424805045128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.119424455799162e-05, + "grad_norm": 4.028868198394775, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8746123909950256, + "num_tokens": 577451170.0, + "step": 15140 + }, + { + "epoch": 1.9260908281389137, + "ewc_loss": 0.0080875838175416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.087584137683734e-05, + "grad_norm": 4.0400848388671875, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8639194965362549, + "num_tokens": 577487447.0, + "step": 15141 + }, + { + "epoch": 1.9262180384175043, + "ewc_loss": 0.008119146339595318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.1191465142183e-05, + "grad_norm": 4.041764259338379, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8794121146202087, + "num_tokens": 577525732.0, + "step": 15142 + }, + { + "epoch": 1.9263452486960948, + "ewc_loss": 0.008110367693006992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.110367343761027e-05, + "grad_norm": 3.9902281761169434, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8736575841903687, + "num_tokens": 577563303.0, + "step": 15143 + }, + { + "epoch": 1.926472458974685, + "ewc_loss": 0.008074463345110416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074463403318077e-05, + "grad_norm": 3.9659321308135986, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8881553411483765, + "num_tokens": 577607592.0, + "step": 15144 + }, + { + "epoch": 1.9265996692532756, + "ewc_loss": 0.008087866939604282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.087867172434926e-05, + "grad_norm": 3.99811053276062, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8662413358688354, + "num_tokens": 577651646.0, + "step": 15145 + }, + { + "epoch": 
1.9267268795318662, + "ewc_loss": 0.00810533668845892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.10533674666658e-05, + "grad_norm": 4.032652854919434, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8826002478599548, + "num_tokens": 577689530.0, + "step": 15146 + }, + { + "epoch": 1.9268540898104567, + "ewc_loss": 0.008114761672914028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.114762022159994e-05, + "grad_norm": 4.008020401000977, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8713923692703247, + "num_tokens": 577732524.0, + "step": 15147 + }, + { + "epoch": 1.9269813000890472, + "ewc_loss": 0.008074850775301456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074850484263152e-05, + "grad_norm": 4.023494720458984, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8800630569458008, + "num_tokens": 577767164.0, + "step": 15148 + }, + { + "epoch": 1.9271085103676378, + "ewc_loss": 0.008069385774433613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069385512499139e-05, + "grad_norm": 4.031002044677734, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8821345567703247, + "num_tokens": 577802347.0, + "step": 15149 + }, + { + "epoch": 1.927235720646228, + "ewc_loss": 0.00806084182113409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060842083068565e-05, + "grad_norm": 4.10984992980957, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8681553602218628, + "num_tokens": 577842899.0, + "step": 15150 + }, + { + "epoch": 1.9273629309248186, + "ewc_loss": 0.008109282702207565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.109282498480752e-05, + "grad_norm": 3.978227138519287, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8750996589660645, + "num_tokens": 577885137.0, + "step": 15151 + }, + { + "epoch": 1.9274901412034091, + "ewc_loss": 0.008007712662220001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.007712312974036e-05, + "grad_norm": 3.962385416030884, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8884375691413879, + "num_tokens": 577925727.0, + "step": 15152 + }, + { + "epoch": 1.9276173514819996, + "ewc_loss": 0.00804004818201065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.04004812380299e-05, + "grad_norm": 3.9892942905426025, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.878086507320404, + "num_tokens": 577965399.0, + "step": 15153 + }, + { + "epoch": 1.9277445617605902, + "ewc_loss": 0.008068297058343887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068297029240057e-05, + "grad_norm": 4.013083457946777, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8749159574508667, + "num_tokens": 578007608.0, + "step": 15154 + }, + { + "epoch": 1.9278717720391807, + "ewc_loss": 0.008039404638111591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039404201554134e-05, + "grad_norm": 4.004523754119873, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8870474696159363, + "num_tokens": 578043425.0, + "step": 15155 + }, + { + "epoch": 1.9279989823177712, + "ewc_loss": 0.00803443044424057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.034430356929079e-05, + "grad_norm": 4.065657615661621, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8734223246574402, + "num_tokens": 578075838.0, + "step": 15156 + }, + { + "epoch": 1.9281261925963618, + "ewc_loss": 0.008069823496043682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069823525147513e-05, + "grad_norm": 4.052340030670166, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8703212738037109, + "num_tokens": 578110113.0, + "step": 15157 + }, + { + "epoch": 1.9282534028749523, + "ewc_loss": 0.00804365798830986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.043658453971148e-05, + "grad_norm": 4.029432773590088, + "learning_rate": 1e-06, + "loss": 0.3733, + 
"mean_token_accuracy": 0.8756510615348816, + "num_tokens": 578145743.0, + "step": 15158 + }, + { + "epoch": 1.9283806131535428, + "ewc_loss": 0.008054310455918312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.054310455918312e-05, + "grad_norm": 4.061917304992676, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8839691877365112, + "num_tokens": 578181588.0, + "step": 15159 + }, + { + "epoch": 1.9285078234321333, + "ewc_loss": 0.008090141229331493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09014163678512e-05, + "grad_norm": 3.9801743030548096, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8739446401596069, + "num_tokens": 578222048.0, + "step": 15160 + }, + { + "epoch": 1.9286350337107239, + "ewc_loss": 0.008040744811296463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.040745160542428e-05, + "grad_norm": 4.003859996795654, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8774656057357788, + "num_tokens": 578262664.0, + "step": 15161 + }, + { + "epoch": 1.9287622439893144, + "ewc_loss": 0.008082491345703602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.082490967353806e-05, + "grad_norm": 3.9966187477111816, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8657033443450928, + "num_tokens": 578303431.0, + "step": 15162 + }, + { + "epoch": 1.928889454267905, + "ewc_loss": 0.008072475902736187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.072475611697882e-05, + "grad_norm": 4.060159683227539, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8653216361999512, + "num_tokens": 578338978.0, + "step": 15163 + }, + { + "epoch": 1.9290166645464955, + "ewc_loss": 0.00811238307505846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.112382784020156e-05, + "grad_norm": 4.013764381408691, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8715914487838745, + "num_tokens": 578376340.0, + "step": 15164 + }, + { + "epoch": 
1.929143874825086, + "ewc_loss": 0.008068266324698925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068266470218077e-05, + "grad_norm": 4.012792110443115, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8768160939216614, + "num_tokens": 578414714.0, + "step": 15165 + }, + { + "epoch": 1.9292710851036765, + "ewc_loss": 0.008090656250715256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.090656046988443e-05, + "grad_norm": 4.004225730895996, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8627644181251526, + "num_tokens": 578453668.0, + "step": 15166 + }, + { + "epoch": 1.929398295382267, + "ewc_loss": 0.00807896163314581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.078961400315166e-05, + "grad_norm": 4.00062370300293, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8684558868408203, + "num_tokens": 578494148.0, + "step": 15167 + }, + { + "epoch": 1.9295255056608573, + "ewc_loss": 0.008086771704256535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.086772140813991e-05, + "grad_norm": 4.0823211669921875, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8552680015563965, + "num_tokens": 578530743.0, + "step": 15168 + }, + { + "epoch": 1.9296527159394479, + "ewc_loss": 0.008129344321787357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.12934449641034e-05, + "grad_norm": 3.949584722518921, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8972505927085876, + "num_tokens": 578567613.0, + "step": 15169 + }, + { + "epoch": 1.9297799262180384, + "ewc_loss": 0.008037597872316837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.037597581278533e-05, + "grad_norm": 4.095315456390381, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8735339641571045, + "num_tokens": 578602981.0, + "step": 15170 + }, + { + "epoch": 1.929907136496629, + "ewc_loss": 0.008148928172886372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.148927736328915e-05, + "grad_norm": 4.021417140960693, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8699716329574585, + "num_tokens": 578641027.0, + "step": 15171 + }, + { + "epoch": 1.9300343467752195, + "ewc_loss": 0.008068874478340149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068874740274623e-05, + "grad_norm": 3.9727160930633545, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8869475722312927, + "num_tokens": 578683006.0, + "step": 15172 + }, + { + "epoch": 1.93016155705381, + "ewc_loss": 0.008079337887465954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.079337567323819e-05, + "grad_norm": 4.057238578796387, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8791122436523438, + "num_tokens": 578716722.0, + "step": 15173 + }, + { + "epoch": 1.9302887673324005, + "ewc_loss": 0.00814064871519804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.140648424159735e-05, + "grad_norm": 4.0780134201049805, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8571712374687195, + "num_tokens": 578750740.0, + "step": 15174 + }, + { + "epoch": 1.9304159776109908, + "ewc_loss": 0.008122514933347702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.122515282593668e-05, + "grad_norm": 3.9967236518859863, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8785051107406616, + "num_tokens": 578791358.0, + "step": 15175 + }, + { + "epoch": 1.9305431878895813, + "ewc_loss": 0.0080938171595335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093817450571805e-05, + "grad_norm": 3.99429988861084, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8603845834732056, + "num_tokens": 578834514.0, + "step": 15176 + }, + { + "epoch": 1.9306703981681719, + "ewc_loss": 0.008118168450891972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.118168625514954e-05, + "grad_norm": 4.036716938018799, + "learning_rate": 1e-06, + "loss": 0.3869, + 
"mean_token_accuracy": 0.8665744662284851, + "num_tokens": 578875869.0, + "step": 15177 + }, + { + "epoch": 1.9307976084467624, + "ewc_loss": 0.008135084062814713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.13508449937217e-05, + "grad_norm": 3.980916738510132, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8799057006835938, + "num_tokens": 578921079.0, + "step": 15178 + }, + { + "epoch": 1.930924818725353, + "ewc_loss": 0.00808765273541212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.087652531685308e-05, + "grad_norm": 4.059803485870361, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8738453388214111, + "num_tokens": 578959837.0, + "step": 15179 + }, + { + "epoch": 1.9310520290039435, + "ewc_loss": 0.00814798567444086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.147985499817878e-05, + "grad_norm": 4.06024169921875, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8489347696304321, + "num_tokens": 579001629.0, + "step": 15180 + }, + { + "epoch": 1.931179239282534, + "ewc_loss": 0.008095302619040012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09530247352086e-05, + "grad_norm": 3.9692890644073486, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8977564573287964, + "num_tokens": 579036762.0, + "step": 15181 + }, + { + "epoch": 1.9313064495611245, + "ewc_loss": 0.008050446398556232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050446194829419e-05, + "grad_norm": 4.0290045738220215, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8538672924041748, + "num_tokens": 579078455.0, + "step": 15182 + }, + { + "epoch": 1.931433659839715, + "ewc_loss": 0.008103668689727783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.103668369585648e-05, + "grad_norm": 4.005358695983887, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8775879144668579, + "num_tokens": 579116942.0, + "step": 15183 + }, + { + "epoch": 
1.9315608701183056, + "ewc_loss": 0.008064652793109417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.064653229666874e-05, + "grad_norm": 3.9860329627990723, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8869500160217285, + "num_tokens": 579155180.0, + "step": 15184 + }, + { + "epoch": 1.931688080396896, + "ewc_loss": 0.008073022589087486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.073022763710469e-05, + "grad_norm": 4.035287380218506, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8677729368209839, + "num_tokens": 579194336.0, + "step": 15185 + }, + { + "epoch": 1.9318152906754866, + "ewc_loss": 0.008093447424471378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093447831925005e-05, + "grad_norm": 4.033008098602295, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8730579614639282, + "num_tokens": 579226609.0, + "step": 15186 + }, + { + "epoch": 1.9319425009540772, + "ewc_loss": 0.008082408457994461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.082408749032766e-05, + "grad_norm": 4.0479254722595215, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8791242837905884, + "num_tokens": 579259288.0, + "step": 15187 + }, + { + "epoch": 1.9320697112326677, + "ewc_loss": 0.008090383373200893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09038319857791e-05, + "grad_norm": 4.004146575927734, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8830801248550415, + "num_tokens": 579294797.0, + "step": 15188 + }, + { + "epoch": 1.9321969215112582, + "ewc_loss": 0.008064155466854572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.064155554166064e-05, + "grad_norm": 4.006458759307861, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8838406205177307, + "num_tokens": 579334352.0, + "step": 15189 + }, + { + "epoch": 1.9323241317898487, + "ewc_loss": 0.008076783269643784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.07678370620124e-05, + "grad_norm": 4.025951385498047, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.864759087562561, + "num_tokens": 579379618.0, + "step": 15190 + }, + { + "epoch": 1.9324513420684393, + "ewc_loss": 0.008081065490841866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.081065607257187e-05, + "grad_norm": 4.000516891479492, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8753591775894165, + "num_tokens": 579420230.0, + "step": 15191 + }, + { + "epoch": 1.9325785523470298, + "ewc_loss": 0.008067375048995018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.06737516541034e-05, + "grad_norm": 4.021918296813965, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8767986297607422, + "num_tokens": 579458236.0, + "step": 15192 + }, + { + "epoch": 1.93270576262562, + "ewc_loss": 0.008084882982075214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.084882574621588e-05, + "grad_norm": 4.01919412612915, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.875662088394165, + "num_tokens": 579495561.0, + "step": 15193 + }, + { + "epoch": 1.9328329729042106, + "ewc_loss": 0.008085524663329124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08552504167892e-05, + "grad_norm": 4.060615539550781, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8753001689910889, + "num_tokens": 579530225.0, + "step": 15194 + }, + { + "epoch": 1.9329601831828012, + "ewc_loss": 0.008128772489726543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.128772606141865e-05, + "grad_norm": 4.000635147094727, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8675979375839233, + "num_tokens": 579570143.0, + "step": 15195 + }, + { + "epoch": 1.9330873934613917, + "ewc_loss": 0.008089062757790089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.089062612270936e-05, + "grad_norm": 4.005886554718018, + "learning_rate": 1e-06, + "loss": 0.3245, + 
"mean_token_accuracy": 0.8878265619277954, + "num_tokens": 579607105.0, + "step": 15196 + }, + { + "epoch": 1.9332146037399822, + "ewc_loss": 0.008104581385850906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.104581502266228e-05, + "grad_norm": 4.017081260681152, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.890758752822876, + "num_tokens": 579640018.0, + "step": 15197 + }, + { + "epoch": 1.9333418140185727, + "ewc_loss": 0.00811749417334795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.11749414424412e-05, + "grad_norm": 4.021749973297119, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8702371120452881, + "num_tokens": 579681250.0, + "step": 15198 + }, + { + "epoch": 1.933469024297163, + "ewc_loss": 0.00811625923961401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.116259414236993e-05, + "grad_norm": 4.096177101135254, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8629748821258545, + "num_tokens": 579718535.0, + "step": 15199 + }, + { + "epoch": 1.9335962345757536, + "ewc_loss": 0.00815829262137413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.15829262137413e-05, + "grad_norm": 4.1022419929504395, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.865705668926239, + "num_tokens": 579756959.0, + "step": 15200 + }, + { + "epoch": 1.933723444854344, + "ewc_loss": 0.008117509074509144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.117509423755109e-05, + "grad_norm": 3.997427463531494, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8654583692550659, + "num_tokens": 579798266.0, + "step": 15201 + }, + { + "epoch": 1.9338506551329346, + "ewc_loss": 0.008091341704130173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09134216979146e-05, + "grad_norm": 4.024541854858398, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8938798904418945, + "num_tokens": 579831869.0, + "step": 15202 + }, + { + "epoch": 
1.9339778654115252, + "ewc_loss": 0.00813350174576044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.133501978591084e-05, + "grad_norm": 3.9622745513916016, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8768237829208374, + "num_tokens": 579874168.0, + "step": 15203 + }, + { + "epoch": 1.9341050756901157, + "ewc_loss": 0.008085491135716438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.085491572273895e-05, + "grad_norm": 3.9629712104797363, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8815122842788696, + "num_tokens": 579916313.0, + "step": 15204 + }, + { + "epoch": 1.9342322859687062, + "ewc_loss": 0.008102218620479107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.102218998828903e-05, + "grad_norm": 3.948568820953369, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.886185884475708, + "num_tokens": 579959022.0, + "step": 15205 + }, + { + "epoch": 1.9343594962472968, + "ewc_loss": 0.008108187466859818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.108187466859818e-05, + "grad_norm": 4.00881814956665, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8615108132362366, + "num_tokens": 580003675.0, + "step": 15206 + }, + { + "epoch": 1.9344867065258873, + "ewc_loss": 0.008149313740432262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.149314089678228e-05, + "grad_norm": 3.98575496673584, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8836690187454224, + "num_tokens": 580044384.0, + "step": 15207 + }, + { + "epoch": 1.9346139168044778, + "ewc_loss": 0.008096395060420036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09639532235451e-05, + "grad_norm": 4.078014373779297, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8660744428634644, + "num_tokens": 580078955.0, + "step": 15208 + }, + { + "epoch": 1.9347411270830683, + "ewc_loss": 0.008144606836140156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.144606545101851e-05, + "grad_norm": 4.014894962310791, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8721040487289429, + "num_tokens": 580116793.0, + "step": 15209 + }, + { + "epoch": 1.9348683373616589, + "ewc_loss": 0.008067209273576736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067209273576736e-05, + "grad_norm": 3.9956207275390625, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8734152317047119, + "num_tokens": 580153622.0, + "step": 15210 + }, + { + "epoch": 1.9349955476402494, + "ewc_loss": 0.00808756984770298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.087570313364267e-05, + "grad_norm": 3.964672803878784, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8742115497589111, + "num_tokens": 580195287.0, + "step": 15211 + }, + { + "epoch": 1.93512275791884, + "ewc_loss": 0.008061039261519909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061039261519909e-05, + "grad_norm": 4.0666046142578125, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8643811345100403, + "num_tokens": 580229601.0, + "step": 15212 + }, + { + "epoch": 1.9352499681974304, + "ewc_loss": 0.008144271560013294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.144271123455837e-05, + "grad_norm": 4.0058183670043945, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8771445155143738, + "num_tokens": 580268079.0, + "step": 15213 + }, + { + "epoch": 1.935377178476021, + "ewc_loss": 0.008057666011154652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.057666127569973e-05, + "grad_norm": 4.05715799331665, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8573000431060791, + "num_tokens": 580305379.0, + "step": 15214 + }, + { + "epoch": 1.9355043887546115, + "ewc_loss": 0.008115801028907299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.115801028907299e-05, + "grad_norm": 4.02235221862793, + "learning_rate": 1e-06, + "loss": 0.3661, + 
"mean_token_accuracy": 0.8743115663528442, + "num_tokens": 580338438.0, + "step": 15215 + }, + { + "epoch": 1.935631599033202, + "ewc_loss": 0.008093231357634068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093231735983863e-05, + "grad_norm": 4.05247163772583, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8759427070617676, + "num_tokens": 580370017.0, + "step": 15216 + }, + { + "epoch": 1.9357588093117923, + "ewc_loss": 0.008127965033054352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.127964974846691e-05, + "grad_norm": 3.9310712814331055, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8843060731887817, + "num_tokens": 580409572.0, + "step": 15217 + }, + { + "epoch": 1.9358860195903829, + "ewc_loss": 0.008070344105362892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070344483712688e-05, + "grad_norm": 3.9712700843811035, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8806004524230957, + "num_tokens": 580448481.0, + "step": 15218 + }, + { + "epoch": 1.9360132298689734, + "ewc_loss": 0.008121409453451633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.121409337036312e-05, + "grad_norm": 3.9881303310394287, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8874635100364685, + "num_tokens": 580481919.0, + "step": 15219 + }, + { + "epoch": 1.936140440147564, + "ewc_loss": 0.008110512979328632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.11051286291331e-05, + "grad_norm": 3.9851629734039307, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8698166608810425, + "num_tokens": 580520758.0, + "step": 15220 + }, + { + "epoch": 1.9362676504261545, + "ewc_loss": 0.008111711591482162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.111711213132367e-05, + "grad_norm": 4.032822608947754, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8645873069763184, + "num_tokens": 580559839.0, + "step": 15221 + }, + { + "epoch": 
1.936394860704745, + "ewc_loss": 0.008155286312103271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.15528619568795e-05, + "grad_norm": 3.983677625656128, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8994818925857544, + "num_tokens": 580591767.0, + "step": 15222 + }, + { + "epoch": 1.9365220709833355, + "ewc_loss": 0.008109449408948421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.109449845505878e-05, + "grad_norm": 3.998039722442627, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8839124441146851, + "num_tokens": 580633180.0, + "step": 15223 + }, + { + "epoch": 1.9366492812619258, + "ewc_loss": 0.008125543594360352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.12554353615269e-05, + "grad_norm": 3.9264729022979736, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8781152963638306, + "num_tokens": 580677045.0, + "step": 15224 + }, + { + "epoch": 1.9367764915405163, + "ewc_loss": 0.008077637292444706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.077637176029384e-05, + "grad_norm": 3.995980739593506, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8884403705596924, + "num_tokens": 580717423.0, + "step": 15225 + }, + { + "epoch": 1.9369037018191069, + "ewc_loss": 0.008134623989462852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.134623931255192e-05, + "grad_norm": 4.017410755157471, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8804609775543213, + "num_tokens": 580756677.0, + "step": 15226 + }, + { + "epoch": 1.9370309120976974, + "ewc_loss": 0.008103504776954651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.103504660539329e-05, + "grad_norm": 4.00546407699585, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8796243071556091, + "num_tokens": 580796993.0, + "step": 15227 + }, + { + "epoch": 1.937158122376288, + "ewc_loss": 0.008089044131338596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.08904369478114e-05, + "grad_norm": 4.013179302215576, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8787444829940796, + "num_tokens": 580834616.0, + "step": 15228 + }, + { + "epoch": 1.9372853326548785, + "ewc_loss": 0.008096030913293362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09603079687804e-05, + "grad_norm": 4.05338716506958, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8775630593299866, + "num_tokens": 580873718.0, + "step": 15229 + }, + { + "epoch": 1.937412542933469, + "ewc_loss": 0.008082267828285694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08226759545505e-05, + "grad_norm": 4.029803276062012, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8707219362258911, + "num_tokens": 580911678.0, + "step": 15230 + }, + { + "epoch": 1.9375397532120595, + "ewc_loss": 0.008063501678407192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.063501445576549e-05, + "grad_norm": 4.011507987976074, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8806558847427368, + "num_tokens": 580953282.0, + "step": 15231 + }, + { + "epoch": 1.93766696349065, + "ewc_loss": 0.008043787442147732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.043787238420919e-05, + "grad_norm": 3.959766387939453, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8811041712760925, + "num_tokens": 580997954.0, + "step": 15232 + }, + { + "epoch": 1.9377941737692406, + "ewc_loss": 0.008001415058970451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00141497165896e-05, + "grad_norm": 3.9981155395507812, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8764276504516602, + "num_tokens": 581035623.0, + "step": 15233 + }, + { + "epoch": 1.937921384047831, + "ewc_loss": 0.00805556308478117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055562648223713e-05, + "grad_norm": 4.051143169403076, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 
0.8831702470779419, + "num_tokens": 581068727.0, + "step": 15234 + }, + { + "epoch": 1.9380485943264216, + "ewc_loss": 0.008049379102885723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.04937953944318e-05, + "grad_norm": 3.96789288520813, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8742417097091675, + "num_tokens": 581111180.0, + "step": 15235 + }, + { + "epoch": 1.9381758046050122, + "ewc_loss": 0.007997232489287853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.997232751222327e-05, + "grad_norm": 4.0306806564331055, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8553293943405151, + "num_tokens": 581152373.0, + "step": 15236 + }, + { + "epoch": 1.9383030148836027, + "ewc_loss": 0.008077908307313919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.077908569248393e-05, + "grad_norm": 3.9923224449157715, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8732523322105408, + "num_tokens": 581195854.0, + "step": 15237 + }, + { + "epoch": 1.9384302251621932, + "ewc_loss": 0.00801183097064495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.011830504983664e-05, + "grad_norm": 4.0299882888793945, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8504742383956909, + "num_tokens": 581236414.0, + "step": 15238 + }, + { + "epoch": 1.9385574354407837, + "ewc_loss": 0.008059360086917877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059359970502555e-05, + "grad_norm": 3.9915659427642822, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8613649606704712, + "num_tokens": 581275349.0, + "step": 15239 + }, + { + "epoch": 1.9386846457193743, + "ewc_loss": 0.008037043735384941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.037043880904093e-05, + "grad_norm": 4.024114608764648, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8824396729469299, + "num_tokens": 581313231.0, + "step": 15240 + }, + { + "epoch": 1.9388118559979648, + 
"ewc_loss": 0.008060039952397346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060040272539482e-05, + "grad_norm": 3.9834280014038086, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8779600858688354, + "num_tokens": 581355295.0, + "step": 15241 + }, + { + "epoch": 1.938939066276555, + "ewc_loss": 0.008044114336371422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.044113928917795e-05, + "grad_norm": 3.993236780166626, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8787664771080017, + "num_tokens": 581392633.0, + "step": 15242 + }, + { + "epoch": 1.9390662765551456, + "ewc_loss": 0.008065514266490936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.065514703048393e-05, + "grad_norm": 4.08109712600708, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8883930444717407, + "num_tokens": 581425552.0, + "step": 15243 + }, + { + "epoch": 1.9391934868337362, + "ewc_loss": 0.008093577809631824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093578071566299e-05, + "grad_norm": 3.9993653297424316, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8657354712486267, + "num_tokens": 581467707.0, + "step": 15244 + }, + { + "epoch": 1.9393206971123267, + "ewc_loss": 0.00802843552082777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.028435695450753e-05, + "grad_norm": 3.990597724914551, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8817894458770752, + "num_tokens": 581504888.0, + "step": 15245 + }, + { + "epoch": 1.9394479073909172, + "ewc_loss": 0.00806161854416132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061618427745998e-05, + "grad_norm": 4.046370983123779, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8754684925079346, + "num_tokens": 581537800.0, + "step": 15246 + }, + { + "epoch": 1.9395751176695077, + "ewc_loss": 0.008093421347439289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093420910881832e-05, + "grad_norm": 
4.011652946472168, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8966227769851685, + "num_tokens": 581570594.0, + "step": 15247 + }, + { + "epoch": 1.939702327948098, + "ewc_loss": 0.00804019533097744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.040195098146796e-05, + "grad_norm": 3.967989206314087, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.869667649269104, + "num_tokens": 581610764.0, + "step": 15248 + }, + { + "epoch": 1.9398295382266886, + "ewc_loss": 0.008058647625148296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.058647654252127e-05, + "grad_norm": 4.022522926330566, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8757749199867249, + "num_tokens": 581652341.0, + "step": 15249 + }, + { + "epoch": 1.939956748505279, + "ewc_loss": 0.008102591149508953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.102591527858749e-05, + "grad_norm": 4.056171894073486, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.868274450302124, + "num_tokens": 581689978.0, + "step": 15250 + }, + { + "epoch": 1.9400839587838696, + "ewc_loss": 0.008081586100161076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.081585838226601e-05, + "grad_norm": 3.9857592582702637, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8848578929901123, + "num_tokens": 581728811.0, + "step": 15251 + }, + { + "epoch": 1.9402111690624602, + "ewc_loss": 0.00803097803145647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.030977915041149e-05, + "grad_norm": 3.9712283611297607, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8858985304832458, + "num_tokens": 581768016.0, + "step": 15252 + }, + { + "epoch": 1.9403383793410507, + "ewc_loss": 0.008069577626883984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069577597780153e-05, + "grad_norm": 4.008296489715576, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8790528178215027, + 
"num_tokens": 581805405.0, + "step": 15253 + }, + { + "epoch": 1.9404655896196412, + "ewc_loss": 0.008077481761574745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.077482198132202e-05, + "grad_norm": 4.007117748260498, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8571099638938904, + "num_tokens": 581845302.0, + "step": 15254 + }, + { + "epoch": 1.9405927998982317, + "ewc_loss": 0.008085942827165127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.085942681645975e-05, + "grad_norm": 4.053289413452148, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8853081464767456, + "num_tokens": 581875391.0, + "step": 15255 + }, + { + "epoch": 1.9407200101768223, + "ewc_loss": 0.008110372349619865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.110372436931357e-05, + "grad_norm": 3.9746651649475098, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8751335740089417, + "num_tokens": 581916580.0, + "step": 15256 + }, + { + "epoch": 1.9408472204554128, + "ewc_loss": 0.008051222190260887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.051221811911091e-05, + "grad_norm": 3.9731521606445312, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8773586750030518, + "num_tokens": 581958539.0, + "step": 15257 + }, + { + "epoch": 1.9409744307340033, + "ewc_loss": 0.008092299103736877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.092298958217725e-05, + "grad_norm": 4.007477283477783, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8754727244377136, + "num_tokens": 581996981.0, + "step": 15258 + }, + { + "epoch": 1.9411016410125939, + "ewc_loss": 0.0081001166254282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.100116974674165e-05, + "grad_norm": 4.005817890167236, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8856456279754639, + "num_tokens": 582036068.0, + "step": 15259 + }, + { + "epoch": 1.9412288512911844, + "ewc_loss": 
0.008068305440247059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068305760389194e-05, + "grad_norm": 4.01439094543457, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8778665065765381, + "num_tokens": 582073322.0, + "step": 15260 + }, + { + "epoch": 1.941356061569775, + "ewc_loss": 0.008080067113041878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080067345872521e-05, + "grad_norm": 4.031564235687256, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8786306977272034, + "num_tokens": 582110736.0, + "step": 15261 + }, + { + "epoch": 1.9414832718483654, + "ewc_loss": 0.008091856725513935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.091856579994783e-05, + "grad_norm": 3.9898006916046143, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8604295253753662, + "num_tokens": 582153431.0, + "step": 15262 + }, + { + "epoch": 1.941610482126956, + "ewc_loss": 0.00806165300309658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061652624746785e-05, + "grad_norm": 4.019076347351074, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8580016493797302, + "num_tokens": 582191793.0, + "step": 15263 + }, + { + "epoch": 1.9417376924055465, + "ewc_loss": 0.00810480397194624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.104804146569222e-05, + "grad_norm": 4.032968044281006, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8668465614318848, + "num_tokens": 582227923.0, + "step": 15264 + }, + { + "epoch": 1.941864902684137, + "ewc_loss": 0.008095160126686096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.095160592347383e-05, + "grad_norm": 4.011887073516846, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8631935119628906, + "num_tokens": 582268029.0, + "step": 15265 + }, + { + "epoch": 1.9419921129627273, + "ewc_loss": 0.008098186925053596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.098186663119122e-05, + "grad_norm": 
4.073085308074951, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8711246848106384, + "num_tokens": 582302173.0, + "step": 15266 + }, + { + "epoch": 1.9421193232413179, + "ewc_loss": 0.00811243336647749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.112432988127694e-05, + "grad_norm": 3.9529616832733154, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8901866674423218, + "num_tokens": 582341642.0, + "step": 15267 + }, + { + "epoch": 1.9422465335199084, + "ewc_loss": 0.008035369217395782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.035368955461308e-05, + "grad_norm": 4.043633937835693, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8780415058135986, + "num_tokens": 582378960.0, + "step": 15268 + }, + { + "epoch": 1.942373743798499, + "ewc_loss": 0.008150008507072926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.150008943630382e-05, + "grad_norm": 3.992443561553955, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8822450637817383, + "num_tokens": 582422149.0, + "step": 15269 + }, + { + "epoch": 1.9425009540770894, + "ewc_loss": 0.008071528747677803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.071529009612277e-05, + "grad_norm": 4.023429870605469, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8606795072555542, + "num_tokens": 582462657.0, + "step": 15270 + }, + { + "epoch": 1.94262816435568, + "ewc_loss": 0.008112918585538864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.112919022096321e-05, + "grad_norm": 3.989410161972046, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8880174160003662, + "num_tokens": 582499264.0, + "step": 15271 + }, + { + "epoch": 1.9427553746342705, + "ewc_loss": 0.008080186322331429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080185943981633e-05, + "grad_norm": 4.009607791900635, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8796828985214233, + 
"num_tokens": 582534744.0, + "step": 15272 + }, + { + "epoch": 1.9428825849128608, + "ewc_loss": 0.00809644628316164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09644625405781e-05, + "grad_norm": 3.9831862449645996, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8719212412834167, + "num_tokens": 582577010.0, + "step": 15273 + }, + { + "epoch": 1.9430097951914513, + "ewc_loss": 0.008074725046753883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074725337792188e-05, + "grad_norm": 4.016726970672607, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8740342855453491, + "num_tokens": 582617103.0, + "step": 15274 + }, + { + "epoch": 1.9431370054700419, + "ewc_loss": 0.008086404763162136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.086404704954475e-05, + "grad_norm": 4.0092692375183105, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8706597089767456, + "num_tokens": 582661754.0, + "step": 15275 + }, + { + "epoch": 1.9432642157486324, + "ewc_loss": 0.008075203746557236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.07520336820744e-05, + "grad_norm": 3.993976593017578, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.878746747970581, + "num_tokens": 582701701.0, + "step": 15276 + }, + { + "epoch": 1.943391426027223, + "ewc_loss": 0.008066375739872456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066375448834151e-05, + "grad_norm": 4.00696325302124, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8672735691070557, + "num_tokens": 582741977.0, + "step": 15277 + }, + { + "epoch": 1.9435186363058135, + "ewc_loss": 0.008080226369202137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080226689344272e-05, + "grad_norm": 3.9992334842681885, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8839126825332642, + "num_tokens": 582783420.0, + "step": 15278 + }, + { + "epoch": 1.943645846584404, + "ewc_loss": 0.008051997981965542, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.051998156588525e-05, + "grad_norm": 4.049692630767822, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8807294964790344, + "num_tokens": 582819464.0, + "step": 15279 + }, + { + "epoch": 1.9437730568629945, + "ewc_loss": 0.008076833561062813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.076833910308778e-05, + "grad_norm": 3.9476561546325684, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8815656304359436, + "num_tokens": 582860757.0, + "step": 15280 + }, + { + "epoch": 1.943900267141585, + "ewc_loss": 0.00801331177353859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.013311889953911e-05, + "grad_norm": 3.96359920501709, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8712297677993774, + "num_tokens": 582910300.0, + "step": 15281 + }, + { + "epoch": 1.9440274774201756, + "ewc_loss": 0.008061200380325317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061200060183182e-05, + "grad_norm": 3.9930596351623535, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8685228228569031, + "num_tokens": 582956333.0, + "step": 15282 + }, + { + "epoch": 1.944154687698766, + "ewc_loss": 0.008061571046710014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061571134021506e-05, + "grad_norm": 4.027563571929932, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8834449052810669, + "num_tokens": 582994068.0, + "step": 15283 + }, + { + "epoch": 1.9442818979773566, + "ewc_loss": 0.008059008046984673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059007814154029e-05, + "grad_norm": 4.049444198608398, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8735353946685791, + "num_tokens": 583027829.0, + "step": 15284 + }, + { + "epoch": 1.9444091082559471, + "ewc_loss": 0.008050435222685337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.050435280892998e-05, + "grad_norm": 3.9913671016693115, + 
"learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8731286525726318, + "num_tokens": 583068881.0, + "step": 15285 + }, + { + "epoch": 1.9445363185345377, + "ewc_loss": 0.008006315678358078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.006316056707874e-05, + "grad_norm": 4.019845485687256, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8788025379180908, + "num_tokens": 583100097.0, + "step": 15286 + }, + { + "epoch": 1.9446635288131282, + "ewc_loss": 0.008055235259234905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055235230131075e-05, + "grad_norm": 4.062194347381592, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8736211657524109, + "num_tokens": 583135340.0, + "step": 15287 + }, + { + "epoch": 1.9447907390917187, + "ewc_loss": 0.008067387156188488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067386806942523e-05, + "grad_norm": 3.980928421020508, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8753585815429688, + "num_tokens": 583172032.0, + "step": 15288 + }, + { + "epoch": 1.9449179493703093, + "ewc_loss": 0.008011325262486935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.011325553525239e-05, + "grad_norm": 4.039696216583252, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.881003201007843, + "num_tokens": 583205244.0, + "step": 15289 + }, + { + "epoch": 1.9450451596488998, + "ewc_loss": 0.008071372285485268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.071371848927811e-05, + "grad_norm": 4.029234886169434, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8726925849914551, + "num_tokens": 583247537.0, + "step": 15290 + }, + { + "epoch": 1.94517236992749, + "ewc_loss": 0.008051562123000622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.051562326727435e-05, + "grad_norm": 4.022948741912842, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.868281900882721, + "num_tokens": 583285349.0, + 
"step": 15291 + }, + { + "epoch": 1.9452995802060806, + "ewc_loss": 0.008063127286732197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.06312746135518e-05, + "grad_norm": 4.018543243408203, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8774558901786804, + "num_tokens": 583323810.0, + "step": 15292 + }, + { + "epoch": 1.9454267904846712, + "ewc_loss": 0.008055237121880054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055236685322598e-05, + "grad_norm": 4.025284767150879, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.886468768119812, + "num_tokens": 583354244.0, + "step": 15293 + }, + { + "epoch": 1.9455540007632617, + "ewc_loss": 0.008076252415776253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.076252561295405e-05, + "grad_norm": 3.997384786605835, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8889412879943848, + "num_tokens": 583391764.0, + "step": 15294 + }, + { + "epoch": 1.9456812110418522, + "ewc_loss": 0.008037811145186424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.037810766836628e-05, + "grad_norm": 4.041014671325684, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8626184463500977, + "num_tokens": 583426305.0, + "step": 15295 + }, + { + "epoch": 1.9458084213204427, + "ewc_loss": 0.008101937361061573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.101937419269234e-05, + "grad_norm": 4.036729335784912, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8718146085739136, + "num_tokens": 583465252.0, + "step": 15296 + }, + { + "epoch": 1.945935631599033, + "ewc_loss": 0.00807013176381588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070132025750354e-05, + "grad_norm": 3.966505527496338, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8677701950073242, + "num_tokens": 583510001.0, + "step": 15297 + }, + { + "epoch": 1.9460628418776236, + "ewc_loss": 0.008053300902247429, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.053301280597225e-05, + "grad_norm": 4.017825603485107, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8781285881996155, + "num_tokens": 583547380.0, + "step": 15298 + }, + { + "epoch": 1.946190052156214, + "ewc_loss": 0.008105862885713577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.105862798402086e-05, + "grad_norm": 3.9998152256011963, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8758188486099243, + "num_tokens": 583588852.0, + "step": 15299 + }, + { + "epoch": 1.9463172624348046, + "ewc_loss": 0.008064514957368374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.064514986472204e-05, + "grad_norm": 3.9760255813598633, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8832566738128662, + "num_tokens": 583634006.0, + "step": 15300 + }, + { + "epoch": 1.9464444727133952, + "ewc_loss": 0.008055071346461773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055071521084756e-05, + "grad_norm": 3.983344316482544, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8633585572242737, + "num_tokens": 583679430.0, + "step": 15301 + }, + { + "epoch": 1.9465716829919857, + "ewc_loss": 0.008065405301749706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.06540556368418e-05, + "grad_norm": 4.102492332458496, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8829361200332642, + "num_tokens": 583708856.0, + "step": 15302 + }, + { + "epoch": 1.9466988932705762, + "ewc_loss": 0.008119186386466026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.119186531985179e-05, + "grad_norm": 4.0000176429748535, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8751822710037231, + "num_tokens": 583750737.0, + "step": 15303 + }, + { + "epoch": 1.9468261035491667, + "ewc_loss": 0.008010405115783215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.010405144887045e-05, + "grad_norm": 4.040677547454834, + "learning_rate": 1e-06, + "loss": 
0.3442, + "mean_token_accuracy": 0.8808652758598328, + "num_tokens": 583786342.0, + "step": 15304 + }, + { + "epoch": 1.9469533138277573, + "ewc_loss": 0.0080780740827322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.078074461081997e-05, + "grad_norm": 4.010717868804932, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8823399543762207, + "num_tokens": 583821681.0, + "step": 15305 + }, + { + "epoch": 1.9470805241063478, + "ewc_loss": 0.008053470402956009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.053470082813874e-05, + "grad_norm": 4.1119513511657715, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8604630827903748, + "num_tokens": 583855640.0, + "step": 15306 + }, + { + "epoch": 1.9472077343849383, + "ewc_loss": 0.008105349726974964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.105349843390286e-05, + "grad_norm": 4.083664894104004, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8854830861091614, + "num_tokens": 583888047.0, + "step": 15307 + }, + { + "epoch": 1.9473349446635289, + "ewc_loss": 0.00807422399520874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074224024312571e-05, + "grad_norm": 4.027604579925537, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8754746913909912, + "num_tokens": 583923220.0, + "step": 15308 + }, + { + "epoch": 1.9474621549421194, + "ewc_loss": 0.008062256500124931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.062256529228762e-05, + "grad_norm": 3.9975624084472656, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8716042041778564, + "num_tokens": 583965718.0, + "step": 15309 + }, + { + "epoch": 1.94758936522071, + "ewc_loss": 0.008067063987255096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067063754424453e-05, + "grad_norm": 3.964609146118164, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8808807134628296, + "num_tokens": 584014027.0, + "step": 15310 + }, + { + "epoch": 
1.9477165754993004, + "ewc_loss": 0.00805810745805502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.058107778197154e-05, + "grad_norm": 4.080647945404053, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8791190385818481, + "num_tokens": 584045019.0, + "step": 15311 + }, + { + "epoch": 1.947843785777891, + "ewc_loss": 0.008136522956192493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.136522956192493e-05, + "grad_norm": 4.003912925720215, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8604816794395447, + "num_tokens": 584086839.0, + "step": 15312 + }, + { + "epoch": 1.9479709960564815, + "ewc_loss": 0.00805652141571045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.056521619437262e-05, + "grad_norm": 3.976839303970337, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8892888426780701, + "num_tokens": 584123785.0, + "step": 15313 + }, + { + "epoch": 1.948098206335072, + "ewc_loss": 0.008071676827967167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.071676711551845e-05, + "grad_norm": 3.9762721061706543, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.869035542011261, + "num_tokens": 584163768.0, + "step": 15314 + }, + { + "epoch": 1.9482254166136623, + "ewc_loss": 0.00807070080190897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070701005635783e-05, + "grad_norm": 4.020901679992676, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8719568252563477, + "num_tokens": 584201668.0, + "step": 15315 + }, + { + "epoch": 1.9483526268922529, + "ewc_loss": 0.00812583789229393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.125838212436065e-05, + "grad_norm": 4.025287628173828, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8847969770431519, + "num_tokens": 584236626.0, + "step": 15316 + }, + { + "epoch": 1.9484798371708434, + "ewc_loss": 0.008102753199636936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.102753054117784e-05, + "grad_norm": 4.000294208526611, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8598740696907043, + "num_tokens": 584279811.0, + "step": 15317 + }, + { + "epoch": 1.948607047449434, + "ewc_loss": 0.008103535510599613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.103535219561309e-05, + "grad_norm": 4.02749490737915, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8766616582870483, + "num_tokens": 584317314.0, + "step": 15318 + }, + { + "epoch": 1.9487342577280244, + "ewc_loss": 0.008118519559502602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.118519326671958e-05, + "grad_norm": 3.964505434036255, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8807486891746521, + "num_tokens": 584355768.0, + "step": 15319 + }, + { + "epoch": 1.948861468006615, + "ewc_loss": 0.008084770292043686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08477052487433e-05, + "grad_norm": 4.109836578369141, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8736997246742249, + "num_tokens": 584386205.0, + "step": 15320 + }, + { + "epoch": 1.9489886782852053, + "ewc_loss": 0.008195332251489162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.195332338800654e-05, + "grad_norm": 4.033103942871094, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8891851902008057, + "num_tokens": 584419721.0, + "step": 15321 + }, + { + "epoch": 1.9491158885637958, + "ewc_loss": 0.008076582103967667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.076582162175328e-05, + "grad_norm": 4.078821659088135, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8750066757202148, + "num_tokens": 584451029.0, + "step": 15322 + }, + { + "epoch": 1.9492430988423863, + "ewc_loss": 0.008169928565621376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.169928332790732e-05, + "grad_norm": 4.04913854598999, + "learning_rate": 1e-06, + "loss": 0.3213, + 
"mean_token_accuracy": 0.8898317217826843, + "num_tokens": 584483991.0, + "step": 15323 + }, + { + "epoch": 1.9493703091209769, + "ewc_loss": 0.008137921802699566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.137922122841701e-05, + "grad_norm": 4.07290506362915, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8751102685928345, + "num_tokens": 584517652.0, + "step": 15324 + }, + { + "epoch": 1.9494975193995674, + "ewc_loss": 0.008166048675775528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.16604879219085e-05, + "grad_norm": 3.9660961627960205, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8845434784889221, + "num_tokens": 584561058.0, + "step": 15325 + }, + { + "epoch": 1.949624729678158, + "ewc_loss": 0.008121552877128124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.121552673401311e-05, + "grad_norm": 4.027961730957031, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8797908425331116, + "num_tokens": 584595097.0, + "step": 15326 + }, + { + "epoch": 1.9497519399567484, + "ewc_loss": 0.00817803293466568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.178033021977171e-05, + "grad_norm": 3.962570905685425, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8838350772857666, + "num_tokens": 584639073.0, + "step": 15327 + }, + { + "epoch": 1.949879150235339, + "ewc_loss": 0.008113249205052853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.113249350572005e-05, + "grad_norm": 4.044780731201172, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8678693175315857, + "num_tokens": 584680517.0, + "step": 15328 + }, + { + "epoch": 1.9500063605139295, + "ewc_loss": 0.008187701925635338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.18770204205066e-05, + "grad_norm": 3.999436616897583, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.877021312713623, + "num_tokens": 584717796.0, + "step": 15329 + }, + { + "epoch": 
1.95013357079252, + "ewc_loss": 0.008130330592393875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.130330388667062e-05, + "grad_norm": 4.026583671569824, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8893277645111084, + "num_tokens": 584755595.0, + "step": 15330 + }, + { + "epoch": 1.9502607810711106, + "ewc_loss": 0.008164187893271446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.164187602233142e-05, + "grad_norm": 4.012855529785156, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8666689395904541, + "num_tokens": 584798681.0, + "step": 15331 + }, + { + "epoch": 1.950387991349701, + "ewc_loss": 0.008130747824907303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.130748028634116e-05, + "grad_norm": 4.0087571144104, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8785359859466553, + "num_tokens": 584835424.0, + "step": 15332 + }, + { + "epoch": 1.9505152016282916, + "ewc_loss": 0.008139168843626976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.13916849438101e-05, + "grad_norm": 4.0351738929748535, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8900848627090454, + "num_tokens": 584872063.0, + "step": 15333 + }, + { + "epoch": 1.9506424119068821, + "ewc_loss": 0.008138027973473072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.138027624227107e-05, + "grad_norm": 4.044886112213135, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8912254571914673, + "num_tokens": 584905850.0, + "step": 15334 + }, + { + "epoch": 1.9507696221854727, + "ewc_loss": 0.008144604042172432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.144603634718806e-05, + "grad_norm": 4.03065299987793, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8845452070236206, + "num_tokens": 584946480.0, + "step": 15335 + }, + { + "epoch": 1.9508968324640632, + "ewc_loss": 0.00813472829759121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.134727977449074e-05, + "grad_norm": 4.097091197967529, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8825269341468811, + "num_tokens": 584977382.0, + "step": 15336 + }, + { + "epoch": 1.9510240427426537, + "ewc_loss": 0.008164712227880955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.164712198777124e-05, + "grad_norm": 4.023837089538574, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8779638409614563, + "num_tokens": 585010885.0, + "step": 15337 + }, + { + "epoch": 1.9511512530212443, + "ewc_loss": 0.008103807456791401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.103807340376079e-05, + "grad_norm": 4.06939172744751, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8757981061935425, + "num_tokens": 585048208.0, + "step": 15338 + }, + { + "epoch": 1.9512784632998348, + "ewc_loss": 0.008154126815497875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.154126408044249e-05, + "grad_norm": 3.9787230491638184, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8719861507415771, + "num_tokens": 585087477.0, + "step": 15339 + }, + { + "epoch": 1.951405673578425, + "ewc_loss": 0.008096872828900814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.096872625174001e-05, + "grad_norm": 4.066280364990234, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8757996559143066, + "num_tokens": 585120860.0, + "step": 15340 + }, + { + "epoch": 1.9515328838570156, + "ewc_loss": 0.008188772946596146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.188773063011467e-05, + "grad_norm": 3.999232053756714, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8781854510307312, + "num_tokens": 585161887.0, + "step": 15341 + }, + { + "epoch": 1.9516600941356061, + "ewc_loss": 0.008101475425064564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.101475395960733e-05, + "grad_norm": 3.9648544788360596, + "learning_rate": 1e-06, + "loss": 0.3385, + 
"mean_token_accuracy": 0.8855836391448975, + "num_tokens": 585200496.0, + "step": 15342 + }, + { + "epoch": 1.9517873044141967, + "ewc_loss": 0.008117714896798134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.117714605759829e-05, + "grad_norm": 3.996832847595215, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8763881921768188, + "num_tokens": 585243976.0, + "step": 15343 + }, + { + "epoch": 1.9519145146927872, + "ewc_loss": 0.008150706067681313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.150705980369821e-05, + "grad_norm": 4.027255535125732, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8716609477996826, + "num_tokens": 585282366.0, + "step": 15344 + }, + { + "epoch": 1.9520417249713777, + "ewc_loss": 0.008156606927514076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.156606781994924e-05, + "grad_norm": 3.983372449874878, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8748360872268677, + "num_tokens": 585325224.0, + "step": 15345 + }, + { + "epoch": 1.952168935249968, + "ewc_loss": 0.008096718229353428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09671837487258e-05, + "grad_norm": 4.012133598327637, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8852478265762329, + "num_tokens": 585361954.0, + "step": 15346 + }, + { + "epoch": 1.9522961455285586, + "ewc_loss": 0.008145439438521862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.145439642248675e-05, + "grad_norm": 3.9767518043518066, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8733823299407959, + "num_tokens": 585406033.0, + "step": 15347 + }, + { + "epoch": 1.952423355807149, + "ewc_loss": 0.008111433126032352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.111433271551505e-05, + "grad_norm": 4.060360431671143, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8754210472106934, + "num_tokens": 585440457.0, + "step": 15348 + }, + { + "epoch": 
1.9525505660857396, + "ewc_loss": 0.008155331015586853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.155331306625158e-05, + "grad_norm": 4.007558345794678, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.89734947681427, + "num_tokens": 585473451.0, + "step": 15349 + }, + { + "epoch": 1.9526777763643302, + "ewc_loss": 0.008094126358628273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.094126678770408e-05, + "grad_norm": 4.020200252532959, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8793084621429443, + "num_tokens": 585509652.0, + "step": 15350 + }, + { + "epoch": 1.9528049866429207, + "ewc_loss": 0.008115716278553009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.115715900203213e-05, + "grad_norm": 3.990447521209717, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.880487322807312, + "num_tokens": 585545591.0, + "step": 15351 + }, + { + "epoch": 1.9529321969215112, + "ewc_loss": 0.008089205250144005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.089205221040174e-05, + "grad_norm": 4.037698268890381, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8644490242004395, + "num_tokens": 585583757.0, + "step": 15352 + }, + { + "epoch": 1.9530594072001017, + "ewc_loss": 0.008125382475554943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.125382737489417e-05, + "grad_norm": 4.006461143493652, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8642646074295044, + "num_tokens": 585623897.0, + "step": 15353 + }, + { + "epoch": 1.9531866174786923, + "ewc_loss": 0.008090630173683167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.090630581136793e-05, + "grad_norm": 4.013103008270264, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8804970979690552, + "num_tokens": 585661043.0, + "step": 15354 + }, + { + "epoch": 1.9533138277572828, + "ewc_loss": 0.008105238899588585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.105238521238789e-05, + "grad_norm": 4.042698383331299, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8516548275947571, + "num_tokens": 585699201.0, + "step": 15355 + }, + { + "epoch": 1.9534410380358733, + "ewc_loss": 0.008118174970149994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.118175173876807e-05, + "grad_norm": 4.0177998542785645, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8699743747711182, + "num_tokens": 585735325.0, + "step": 15356 + }, + { + "epoch": 1.9535682483144639, + "ewc_loss": 0.008106157183647156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.106156747089699e-05, + "grad_norm": 4.01090669631958, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.883345365524292, + "num_tokens": 585769909.0, + "step": 15357 + }, + { + "epoch": 1.9536954585930544, + "ewc_loss": 0.008108974434435368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.108973997877911e-05, + "grad_norm": 4.076828479766846, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8646112680435181, + "num_tokens": 585804590.0, + "step": 15358 + }, + { + "epoch": 1.953822668871645, + "ewc_loss": 0.008166993968188763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.166993939084932e-05, + "grad_norm": 4.033346176147461, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8723422288894653, + "num_tokens": 585843349.0, + "step": 15359 + }, + { + "epoch": 1.9539498791502354, + "ewc_loss": 0.008120030164718628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.120030543068424e-05, + "grad_norm": 3.9640095233917236, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8872921466827393, + "num_tokens": 585881846.0, + "step": 15360 + }, + { + "epoch": 1.954077089428826, + "ewc_loss": 0.008084253408014774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.084253204287961e-05, + "grad_norm": 4.0428900718688965, + "learning_rate": 1e-06, + "loss": 0.3676, + 
"mean_token_accuracy": 0.8737007975578308, + "num_tokens": 585917848.0, + "step": 15361 + }, + { + "epoch": 1.9542042997074165, + "ewc_loss": 0.008167735300958157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.167735359165817e-05, + "grad_norm": 3.9766428470611572, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8817341327667236, + "num_tokens": 585961249.0, + "step": 15362 + }, + { + "epoch": 1.954331509986007, + "ewc_loss": 0.008093764074146748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093764336081222e-05, + "grad_norm": 3.9658713340759277, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8831996321678162, + "num_tokens": 586005795.0, + "step": 15363 + }, + { + "epoch": 1.9544587202645973, + "ewc_loss": 0.00812834408134222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.12834405223839e-05, + "grad_norm": 3.990706205368042, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8851112127304077, + "num_tokens": 586041523.0, + "step": 15364 + }, + { + "epoch": 1.9545859305431879, + "ewc_loss": 0.008124223910272121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.124223677441478e-05, + "grad_norm": 4.054286003112793, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8682053089141846, + "num_tokens": 586077800.0, + "step": 15365 + }, + { + "epoch": 1.9547131408217784, + "ewc_loss": 0.008128668181598186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.128668559947982e-05, + "grad_norm": 4.000289440155029, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.872520387172699, + "num_tokens": 586116010.0, + "step": 15366 + }, + { + "epoch": 1.954840351100369, + "ewc_loss": 0.008088896051049232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.088895992841572e-05, + "grad_norm": 3.9689600467681885, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8873937129974365, + "num_tokens": 586155359.0, + "step": 15367 + }, + { + "epoch": 
1.9549675613789594, + "ewc_loss": 0.0080760782584548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.076078665908426e-05, + "grad_norm": 4.051056385040283, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.882377564907074, + "num_tokens": 586188190.0, + "step": 15368 + }, + { + "epoch": 1.95509477165755, + "ewc_loss": 0.008126399479806423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.12639991636388e-05, + "grad_norm": 4.0304460525512695, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8755691051483154, + "num_tokens": 586225971.0, + "step": 15369 + }, + { + "epoch": 1.9552219819361403, + "ewc_loss": 0.008095039054751396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.095039083855227e-05, + "grad_norm": 4.035963535308838, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8755356669425964, + "num_tokens": 586263295.0, + "step": 15370 + }, + { + "epoch": 1.9553491922147308, + "ewc_loss": 0.00810960028320551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.109600457828492e-05, + "grad_norm": 4.056273460388184, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8967738151550293, + "num_tokens": 586293547.0, + "step": 15371 + }, + { + "epoch": 1.9554764024933213, + "ewc_loss": 0.008115623146295547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.115622767945752e-05, + "grad_norm": 3.978179693222046, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8719567656517029, + "num_tokens": 586333180.0, + "step": 15372 + }, + { + "epoch": 1.9556036127719119, + "ewc_loss": 0.008074315264821053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074314973782748e-05, + "grad_norm": 4.013435363769531, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8779585361480713, + "num_tokens": 586368780.0, + "step": 15373 + }, + { + "epoch": 1.9557308230505024, + "ewc_loss": 0.008122413419187069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.122413419187069e-05, + "grad_norm": 4.048000812530518, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8630769848823547, + "num_tokens": 586404177.0, + "step": 15374 + }, + { + "epoch": 1.955858033329093, + "ewc_loss": 0.00813776534050703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.137765689752996e-05, + "grad_norm": 4.011918067932129, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8609955906867981, + "num_tokens": 586446917.0, + "step": 15375 + }, + { + "epoch": 1.9559852436076834, + "ewc_loss": 0.008108019828796387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.10801939223893e-05, + "grad_norm": 4.011225700378418, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.877638578414917, + "num_tokens": 586485832.0, + "step": 15376 + }, + { + "epoch": 1.956112453886274, + "ewc_loss": 0.008124412968754768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.124412852339447e-05, + "grad_norm": 4.034564018249512, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8848180770874023, + "num_tokens": 586524089.0, + "step": 15377 + }, + { + "epoch": 1.9562396641648645, + "ewc_loss": 0.008145563304424286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.145563333528116e-05, + "grad_norm": 3.969590902328491, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8800790309906006, + "num_tokens": 586564097.0, + "step": 15378 + }, + { + "epoch": 1.956366874443455, + "ewc_loss": 0.008093221113085747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093220822047442e-05, + "grad_norm": 4.031748294830322, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8899929523468018, + "num_tokens": 586600653.0, + "step": 15379 + }, + { + "epoch": 1.9564940847220456, + "ewc_loss": 0.008159482851624489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.15948296803981e-05, + "grad_norm": 3.9986822605133057, + "learning_rate": 1e-06, + "loss": 0.341, + 
"mean_token_accuracy": 0.8823381662368774, + "num_tokens": 586639527.0, + "step": 15380 + }, + { + "epoch": 1.956621295000636, + "ewc_loss": 0.008116235956549644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.116236131172627e-05, + "grad_norm": 4.092809677124023, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8806397914886475, + "num_tokens": 586670885.0, + "step": 15381 + }, + { + "epoch": 1.9567485052792266, + "ewc_loss": 0.00816280022263527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.162800077116117e-05, + "grad_norm": 3.986628532409668, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8902361989021301, + "num_tokens": 586709875.0, + "step": 15382 + }, + { + "epoch": 1.9568757155578171, + "ewc_loss": 0.008092622272670269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.092622010735795e-05, + "grad_norm": 3.9983482360839844, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.870403528213501, + "num_tokens": 586753164.0, + "step": 15383 + }, + { + "epoch": 1.9570029258364077, + "ewc_loss": 0.008122045546770096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.122045255731791e-05, + "grad_norm": 3.965284824371338, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8825024366378784, + "num_tokens": 586793737.0, + "step": 15384 + }, + { + "epoch": 1.9571301361149982, + "ewc_loss": 0.00809653103351593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.096531382761896e-05, + "grad_norm": 4.023062229156494, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8714674711227417, + "num_tokens": 586833428.0, + "step": 15385 + }, + { + "epoch": 1.9572573463935887, + "ewc_loss": 0.00812306348234415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.123063889797777e-05, + "grad_norm": 4.08646821975708, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8556243181228638, + "num_tokens": 586871202.0, + "step": 15386 + }, + { + "epoch": 
1.9573845566721793, + "ewc_loss": 0.008124618791043758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.124618761939928e-05, + "grad_norm": 4.019405841827393, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8874369859695435, + "num_tokens": 586905537.0, + "step": 15387 + }, + { + "epoch": 1.9575117669507698, + "ewc_loss": 0.008056407794356346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.05640738690272e-05, + "grad_norm": 4.06261682510376, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8674837350845337, + "num_tokens": 586940378.0, + "step": 15388 + }, + { + "epoch": 1.95763897722936, + "ewc_loss": 0.00812629796564579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.126298052957281e-05, + "grad_norm": 4.025835990905762, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.885600745677948, + "num_tokens": 586978806.0, + "step": 15389 + }, + { + "epoch": 1.9577661875079506, + "ewc_loss": 0.008092200383543968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.092200005194172e-05, + "grad_norm": 4.148813724517822, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8715026378631592, + "num_tokens": 587007367.0, + "step": 15390 + }, + { + "epoch": 1.9578933977865411, + "ewc_loss": 0.008187890984117985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.187891216948628e-05, + "grad_norm": 4.016287326812744, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8974441885948181, + "num_tokens": 587040422.0, + "step": 15391 + }, + { + "epoch": 1.9580206080651317, + "ewc_loss": 0.008080464787781239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080464613158256e-05, + "grad_norm": 4.054299831390381, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8935854434967041, + "num_tokens": 587070238.0, + "step": 15392 + }, + { + "epoch": 1.9581478183437222, + "ewc_loss": 0.008159368298947811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.159368007909507e-05, + "grad_norm": 4.030172348022461, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.88278728723526, + "num_tokens": 587105973.0, + "step": 15393 + }, + { + "epoch": 1.9582750286223127, + "ewc_loss": 0.008125614374876022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.12561484053731e-05, + "grad_norm": 4.039505958557129, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8909900188446045, + "num_tokens": 587137975.0, + "step": 15394 + }, + { + "epoch": 1.958402238900903, + "ewc_loss": 0.008166658692061901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.166658517438918e-05, + "grad_norm": 4.107594966888428, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8734482526779175, + "num_tokens": 587169176.0, + "step": 15395 + }, + { + "epoch": 1.9585294491794936, + "ewc_loss": 0.008198358118534088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.198357681976631e-05, + "grad_norm": 3.97236704826355, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8749005198478699, + "num_tokens": 587205570.0, + "step": 15396 + }, + { + "epoch": 1.958656659458084, + "ewc_loss": 0.00811700988560915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.117010293062776e-05, + "grad_norm": 4.012937545776367, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.869152307510376, + "num_tokens": 587246729.0, + "step": 15397 + }, + { + "epoch": 1.9587838697366746, + "ewc_loss": 0.008191165514290333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.191165397875011e-05, + "grad_norm": 3.9921960830688477, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8810611963272095, + "num_tokens": 587284537.0, + "step": 15398 + }, + { + "epoch": 1.9589110800152651, + "ewc_loss": 0.008151646703481674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.151646761689335e-05, + "grad_norm": 3.9867634773254395, + "learning_rate": 1e-06, + "loss": 0.3374, + 
"mean_token_accuracy": 0.8825608491897583, + "num_tokens": 587324394.0, + "step": 15399 + }, + { + "epoch": 1.9590382902938557, + "ewc_loss": 0.0081590311601758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.15903113107197e-05, + "grad_norm": 4.020034313201904, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8847222328186035, + "num_tokens": 587359935.0, + "step": 15400 + }, + { + "epoch": 1.9591655005724462, + "ewc_loss": 0.008187877014279366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.1878766650334e-05, + "grad_norm": 3.9732515811920166, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8890787363052368, + "num_tokens": 587402081.0, + "step": 15401 + }, + { + "epoch": 1.9592927108510367, + "ewc_loss": 0.008131661452353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.131661161314696e-05, + "grad_norm": 3.968961238861084, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8748102784156799, + "num_tokens": 587442939.0, + "step": 15402 + }, + { + "epoch": 1.9594199211296273, + "ewc_loss": 0.00815526582300663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.15526582300663e-05, + "grad_norm": 4.060189247131348, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8694712519645691, + "num_tokens": 587479155.0, + "step": 15403 + }, + { + "epoch": 1.9595471314082178, + "ewc_loss": 0.008180614560842514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.180614531738684e-05, + "grad_norm": 3.9817750453948975, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8668066263198853, + "num_tokens": 587519285.0, + "step": 15404 + }, + { + "epoch": 1.9596743416868083, + "ewc_loss": 0.008113063871860504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.113063813652843e-05, + "grad_norm": 3.9987473487854004, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8600684404373169, + "num_tokens": 587563978.0, + "step": 15405 + }, + { + "epoch": 
1.9598015519653988, + "ewc_loss": 0.008130693808197975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.13069345895201e-05, + "grad_norm": 4.0396728515625, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8722923398017883, + "num_tokens": 587599052.0, + "step": 15406 + }, + { + "epoch": 1.9599287622439894, + "ewc_loss": 0.008158944547176361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.158944547176361e-05, + "grad_norm": 4.01241397857666, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8815467953681946, + "num_tokens": 587639980.0, + "step": 15407 + }, + { + "epoch": 1.96005597252258, + "ewc_loss": 0.008133784867823124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.133785013342276e-05, + "grad_norm": 4.074095726013184, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8727021217346191, + "num_tokens": 587673836.0, + "step": 15408 + }, + { + "epoch": 1.9601831828011704, + "ewc_loss": 0.00814623013138771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.146230538841337e-05, + "grad_norm": 4.000476360321045, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.878955602645874, + "num_tokens": 587712896.0, + "step": 15409 + }, + { + "epoch": 1.960310393079761, + "ewc_loss": 0.008088083006441593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.088083268376067e-05, + "grad_norm": 4.0484209060668945, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8716213703155518, + "num_tokens": 587747352.0, + "step": 15410 + }, + { + "epoch": 1.9604376033583515, + "ewc_loss": 0.00814389530569315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.143894956447184e-05, + "grad_norm": 3.981001853942871, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8737468719482422, + "num_tokens": 587790234.0, + "step": 15411 + }, + { + "epoch": 1.960564813636942, + "ewc_loss": 0.00808002334088087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080023690126836e-05, 
+ "grad_norm": 3.9679980278015137, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8905638456344604, + "num_tokens": 587830185.0, + "step": 15412 + }, + { + "epoch": 1.9606920239155323, + "ewc_loss": 0.008086804300546646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.086804155027494e-05, + "grad_norm": 4.032270431518555, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8785620927810669, + "num_tokens": 587866265.0, + "step": 15413 + }, + { + "epoch": 1.9608192341941229, + "ewc_loss": 0.00812107790261507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.121077553369105e-05, + "grad_norm": 3.9690334796905518, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8894611597061157, + "num_tokens": 587904956.0, + "step": 15414 + }, + { + "epoch": 1.9609464444727134, + "ewc_loss": 0.008055898360908031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055898069869727e-05, + "grad_norm": 4.063398838043213, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8831846714019775, + "num_tokens": 587942369.0, + "step": 15415 + }, + { + "epoch": 1.961073654751304, + "ewc_loss": 0.008142155595123768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.142155274981633e-05, + "grad_norm": 4.044483661651611, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8821719884872437, + "num_tokens": 587979340.0, + "step": 15416 + }, + { + "epoch": 1.9612008650298944, + "ewc_loss": 0.008100247010588646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.100247214315459e-05, + "grad_norm": 4.0081634521484375, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8728184700012207, + "num_tokens": 588021812.0, + "step": 15417 + }, + { + "epoch": 1.961328075308485, + "ewc_loss": 0.008067144080996513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.06714451755397e-05, + "grad_norm": 3.9902961254119873, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 
0.8736258149147034, + "num_tokens": 588064978.0, + "step": 15418 + }, + { + "epoch": 1.9614552855870753, + "ewc_loss": 0.008077454753220081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.077454549493268e-05, + "grad_norm": 4.072566509246826, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8739814758300781, + "num_tokens": 588098142.0, + "step": 15419 + }, + { + "epoch": 1.9615824958656658, + "ewc_loss": 0.008120783604681492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.120783604681492e-05, + "grad_norm": 3.9567580223083496, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8821476101875305, + "num_tokens": 588141808.0, + "step": 15420 + }, + { + "epoch": 1.9617097061442563, + "ewc_loss": 0.008039623498916626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.039623935474083e-05, + "grad_norm": 3.9911811351776123, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8828689455986023, + "num_tokens": 588178137.0, + "step": 15421 + }, + { + "epoch": 1.9618369164228469, + "ewc_loss": 0.00811991561204195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.11991558293812e-05, + "grad_norm": 4.064836502075195, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8718405365943909, + "num_tokens": 588215646.0, + "step": 15422 + }, + { + "epoch": 1.9619641267014374, + "ewc_loss": 0.008121832273900509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.121832070173696e-05, + "grad_norm": 4.003789901733398, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.868137776851654, + "num_tokens": 588251345.0, + "step": 15423 + }, + { + "epoch": 1.962091336980028, + "ewc_loss": 0.008056502789258957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.056502701947466e-05, + "grad_norm": 4.048079967498779, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.883023738861084, + "num_tokens": 588291222.0, + "step": 15424 + }, + { + "epoch": 1.9622185472586184, + "ewc_loss": 
0.008109601214528084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.109601185424253e-05, + "grad_norm": 4.008450984954834, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8708306550979614, + "num_tokens": 588329480.0, + "step": 15425 + }, + { + "epoch": 1.962345757537209, + "ewc_loss": 0.008069194853305817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069194882409647e-05, + "grad_norm": 4.0195136070251465, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8679136633872986, + "num_tokens": 588367143.0, + "step": 15426 + }, + { + "epoch": 1.9624729678157995, + "ewc_loss": 0.008078141137957573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.078140672296286e-05, + "grad_norm": 3.9712846279144287, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8856738805770874, + "num_tokens": 588408363.0, + "step": 15427 + }, + { + "epoch": 1.96260017809439, + "ewc_loss": 0.008040514774620533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.040514512686059e-05, + "grad_norm": 4.001922130584717, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8837651610374451, + "num_tokens": 588445397.0, + "step": 15428 + }, + { + "epoch": 1.9627273883729806, + "ewc_loss": 0.008104038424789906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.104037988232449e-05, + "grad_norm": 4.030741214752197, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8828784227371216, + "num_tokens": 588480436.0, + "step": 15429 + }, + { + "epoch": 1.962854598651571, + "ewc_loss": 0.008084570057690144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.084569708444178e-05, + "grad_norm": 3.9688241481781006, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8747937679290771, + "num_tokens": 588526918.0, + "step": 15430 + }, + { + "epoch": 1.9629818089301616, + "ewc_loss": 0.008049918338656425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.049918687902391e-05, + "grad_norm": 
4.048264980316162, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8785116672515869, + "num_tokens": 588561300.0, + "step": 15431 + }, + { + "epoch": 1.9631090192087521, + "ewc_loss": 0.008128813467919827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.128813351504505e-05, + "grad_norm": 4.017442226409912, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.867708683013916, + "num_tokens": 588603393.0, + "step": 15432 + }, + { + "epoch": 1.9632362294873427, + "ewc_loss": 0.008070035837590694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070035983109847e-05, + "grad_norm": 3.9959194660186768, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8848441243171692, + "num_tokens": 588643806.0, + "step": 15433 + }, + { + "epoch": 1.9633634397659332, + "ewc_loss": 0.008069198578596115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069198520388454e-05, + "grad_norm": 4.0061421394348145, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8765038847923279, + "num_tokens": 588685090.0, + "step": 15434 + }, + { + "epoch": 1.9634906500445237, + "ewc_loss": 0.008070636540651321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070636249613017e-05, + "grad_norm": 4.027099609375, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8661825060844421, + "num_tokens": 588720983.0, + "step": 15435 + }, + { + "epoch": 1.9636178603231143, + "ewc_loss": 0.008058885112404823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.05888557806611e-05, + "grad_norm": 3.9721033573150635, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8710203766822815, + "num_tokens": 588760143.0, + "step": 15436 + }, + { + "epoch": 1.9637450706017048, + "ewc_loss": 0.008036850020289421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.036849612835795e-05, + "grad_norm": 4.071117877960205, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8794426321983337, + 
"num_tokens": 588796636.0, + "step": 15437 + }, + { + "epoch": 1.963872280880295, + "ewc_loss": 0.008108248002827168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.108247857308015e-05, + "grad_norm": 4.078673362731934, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8732671141624451, + "num_tokens": 588831681.0, + "step": 15438 + }, + { + "epoch": 1.9639994911588856, + "ewc_loss": 0.008061379194259644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061379048740491e-05, + "grad_norm": 3.94642972946167, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8871968984603882, + "num_tokens": 588875408.0, + "step": 15439 + }, + { + "epoch": 1.9641267014374761, + "ewc_loss": 0.008007223717868328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007224096218124e-05, + "grad_norm": 4.013640880584717, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8811500072479248, + "num_tokens": 588913583.0, + "step": 15440 + }, + { + "epoch": 1.9642539117160667, + "ewc_loss": 0.008091420866549015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.091420750133693e-05, + "grad_norm": 4.001432418823242, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8852424621582031, + "num_tokens": 588949833.0, + "step": 15441 + }, + { + "epoch": 1.9643811219946572, + "ewc_loss": 0.008044295944273472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.04429582785815e-05, + "grad_norm": 4.001418590545654, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8825111389160156, + "num_tokens": 588985678.0, + "step": 15442 + }, + { + "epoch": 1.9645083322732477, + "ewc_loss": 0.00804765336215496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.047652954701334e-05, + "grad_norm": 4.057294845581055, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8759956955909729, + "num_tokens": 589018276.0, + "step": 15443 + }, + { + "epoch": 1.964635542551838, + "ewc_loss": 0.00808723084628582, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.087230526143685e-05, + "grad_norm": 3.9574058055877686, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.874937117099762, + "num_tokens": 589061900.0, + "step": 15444 + }, + { + "epoch": 1.9647627528304286, + "ewc_loss": 0.008022306486964226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.022306428756565e-05, + "grad_norm": 4.046219348907471, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8791592717170715, + "num_tokens": 589102740.0, + "step": 15445 + }, + { + "epoch": 1.964889963109019, + "ewc_loss": 0.0081196753308177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.119675476336852e-05, + "grad_norm": 4.037180423736572, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8862011432647705, + "num_tokens": 589135276.0, + "step": 15446 + }, + { + "epoch": 1.9650171733876096, + "ewc_loss": 0.008061087690293789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061087282840163e-05, + "grad_norm": 4.002598285675049, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8774908781051636, + "num_tokens": 589173390.0, + "step": 15447 + }, + { + "epoch": 1.9651443836662001, + "ewc_loss": 0.00805927999317646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.0592799349688e-05, + "grad_norm": 3.992525339126587, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8698234558105469, + "num_tokens": 589214592.0, + "step": 15448 + }, + { + "epoch": 1.9652715939447907, + "ewc_loss": 0.008083558641374111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.083558350335807e-05, + "grad_norm": 4.031525611877441, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8771110773086548, + "num_tokens": 589250682.0, + "step": 15449 + }, + { + "epoch": 1.9653988042233812, + "ewc_loss": 0.008097362704575062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.097362297121435e-05, + "grad_norm": 4.0367584228515625, + "learning_rate": 
1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8707197904586792, + "num_tokens": 589288326.0, + "step": 15450 + }, + { + "epoch": 1.9655260145019717, + "ewc_loss": 0.0080907316878438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09073171694763e-05, + "grad_norm": 4.045054912567139, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8734261393547058, + "num_tokens": 589321632.0, + "step": 15451 + }, + { + "epoch": 1.9656532247805623, + "ewc_loss": 0.008109848946332932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.109849295578897e-05, + "grad_norm": 4.059221267700195, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8691465854644775, + "num_tokens": 589361134.0, + "step": 15452 + }, + { + "epoch": 1.9657804350591528, + "ewc_loss": 0.008126667700707912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.126667671604082e-05, + "grad_norm": 3.9736170768737793, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8791131973266602, + "num_tokens": 589402338.0, + "step": 15453 + }, + { + "epoch": 1.9659076453377433, + "ewc_loss": 0.008070348761975765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.070348849287257e-05, + "grad_norm": 3.994901657104492, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8708887100219727, + "num_tokens": 589441080.0, + "step": 15454 + }, + { + "epoch": 1.9660348556163338, + "ewc_loss": 0.008124181069433689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.124181476887316e-05, + "grad_norm": 3.9840316772460938, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8749383091926575, + "num_tokens": 589484019.0, + "step": 15455 + }, + { + "epoch": 1.9661620658949244, + "ewc_loss": 0.008092168718576431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.092168718576431e-05, + "grad_norm": 3.9927215576171875, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8717220425605774, + "num_tokens": 589525890.0, + "step": 15456 
+ }, + { + "epoch": 1.966289276173515, + "ewc_loss": 0.008114142343401909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.114142110571265e-05, + "grad_norm": 4.029761791229248, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8828310370445251, + "num_tokens": 589560644.0, + "step": 15457 + }, + { + "epoch": 1.9664164864521054, + "ewc_loss": 0.008133006282150745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.133006485877559e-05, + "grad_norm": 3.971607208251953, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8959145545959473, + "num_tokens": 589600863.0, + "step": 15458 + }, + { + "epoch": 1.966543696730696, + "ewc_loss": 0.00807251688092947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.072517084656283e-05, + "grad_norm": 4.022587776184082, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8778316974639893, + "num_tokens": 589637948.0, + "step": 15459 + }, + { + "epoch": 1.9666709070092865, + "ewc_loss": 0.008139045909047127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.139046258293092e-05, + "grad_norm": 3.9881434440612793, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8896963000297546, + "num_tokens": 589678061.0, + "step": 15460 + }, + { + "epoch": 1.966798117287877, + "ewc_loss": 0.008094357326626778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.094357326626778e-05, + "grad_norm": 4.015532970428467, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8646788597106934, + "num_tokens": 589718050.0, + "step": 15461 + }, + { + "epoch": 1.9669253275664673, + "ewc_loss": 0.008121689781546593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.121689461404458e-05, + "grad_norm": 3.9826595783233643, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8783704042434692, + "num_tokens": 589759206.0, + "step": 15462 + }, + { + "epoch": 1.9670525378450578, + "ewc_loss": 0.008081987500190735, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.081987471086904e-05, + "grad_norm": 3.9980318546295166, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8665027618408203, + "num_tokens": 589800218.0, + "step": 15463 + }, + { + "epoch": 1.9671797481236484, + "ewc_loss": 0.008098333142697811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.098332909867167e-05, + "grad_norm": 3.965559482574463, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8777568340301514, + "num_tokens": 589843105.0, + "step": 15464 + }, + { + "epoch": 1.967306958402239, + "ewc_loss": 0.00808698870241642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.086988964350894e-05, + "grad_norm": 4.053981781005859, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8675420880317688, + "num_tokens": 589881560.0, + "step": 15465 + }, + { + "epoch": 1.9674341686808294, + "ewc_loss": 0.008138231001794338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.138231351040304e-05, + "grad_norm": 4.023352146148682, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8657239675521851, + "num_tokens": 589922755.0, + "step": 15466 + }, + { + "epoch": 1.96756137895942, + "ewc_loss": 0.008066367357969284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066367445280775e-05, + "grad_norm": 4.034061431884766, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8602874279022217, + "num_tokens": 589964982.0, + "step": 15467 + }, + { + "epoch": 1.9676885892380103, + "ewc_loss": 0.008082368411123753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.082368731265888e-05, + "grad_norm": 4.045734405517578, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8627969622612, + "num_tokens": 590003371.0, + "step": 15468 + }, + { + "epoch": 1.9678157995166008, + "ewc_loss": 0.008075393736362457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.075393998296931e-05, + "grad_norm": 3.9392340183258057, + "learning_rate": 1e-06, + "loss": 0.357, 
+ "mean_token_accuracy": 0.8739761114120483, + "num_tokens": 590048361.0, + "step": 15469 + }, + { + "epoch": 1.9679430097951913, + "ewc_loss": 0.008033132180571556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.033132326090708e-05, + "grad_norm": 4.044048309326172, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8758538365364075, + "num_tokens": 590083141.0, + "step": 15470 + }, + { + "epoch": 1.9680702200737819, + "ewc_loss": 0.008113597519695759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.113597141345963e-05, + "grad_norm": 3.9700679779052734, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8715112209320068, + "num_tokens": 590128161.0, + "step": 15471 + }, + { + "epoch": 1.9681974303523724, + "ewc_loss": 0.008047600276768208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.047600567806512e-05, + "grad_norm": 4.007287502288818, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8915228843688965, + "num_tokens": 590166101.0, + "step": 15472 + }, + { + "epoch": 1.968324640630963, + "ewc_loss": 0.008080988191068172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080988482106477e-05, + "grad_norm": 3.9521701335906982, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8750916719436646, + "num_tokens": 590207643.0, + "step": 15473 + }, + { + "epoch": 1.9684518509095534, + "ewc_loss": 0.008044124580919743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.044124842854217e-05, + "grad_norm": 4.040711879730225, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8695028424263, + "num_tokens": 590247567.0, + "step": 15474 + }, + { + "epoch": 1.968579061188144, + "ewc_loss": 0.0081032020971179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.103201980702579e-05, + "grad_norm": 3.9987289905548096, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8724371194839478, + "num_tokens": 590287940.0, + "step": 15475 + }, + { + "epoch": 
1.9687062714667345, + "ewc_loss": 0.008040615357458591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.040614920901135e-05, + "grad_norm": 4.064667224884033, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8712881207466125, + "num_tokens": 590322558.0, + "step": 15476 + }, + { + "epoch": 1.968833481745325, + "ewc_loss": 0.008110994473099709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.11099453130737e-05, + "grad_norm": 3.9972972869873047, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8751711249351501, + "num_tokens": 590362913.0, + "step": 15477 + }, + { + "epoch": 1.9689606920239155, + "ewc_loss": 0.008025071583688259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025071292649955e-05, + "grad_norm": 3.9739081859588623, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8680936098098755, + "num_tokens": 590405877.0, + "step": 15478 + }, + { + "epoch": 1.969087902302506, + "ewc_loss": 0.008067192509770393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067192538874224e-05, + "grad_norm": 4.031937122344971, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8683643341064453, + "num_tokens": 590445373.0, + "step": 15479 + }, + { + "epoch": 1.9692151125810966, + "ewc_loss": 0.008084002882242203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.084002911346033e-05, + "grad_norm": 4.015336036682129, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8659359216690063, + "num_tokens": 590483319.0, + "step": 15480 + }, + { + "epoch": 1.9693423228596871, + "ewc_loss": 0.00805908627808094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059086394496262e-05, + "grad_norm": 4.029398441314697, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8625258207321167, + "num_tokens": 590522433.0, + "step": 15481 + }, + { + "epoch": 1.9694695331382777, + "ewc_loss": 0.00808832235634327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.088321919785812e-05, + "grad_norm": 3.975463390350342, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8803006410598755, + "num_tokens": 590564342.0, + "step": 15482 + }, + { + "epoch": 1.9695967434168682, + "ewc_loss": 0.008059391751885414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059391984716058e-05, + "grad_norm": 4.164858341217041, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.879185676574707, + "num_tokens": 590594527.0, + "step": 15483 + }, + { + "epoch": 1.9697239536954587, + "ewc_loss": 0.008181480690836906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.181480370694771e-05, + "grad_norm": 4.000177383422852, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8879827857017517, + "num_tokens": 590630573.0, + "step": 15484 + }, + { + "epoch": 1.9698511639740492, + "ewc_loss": 0.008020899258553982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.020899258553982e-05, + "grad_norm": 3.989415407180786, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8777709007263184, + "num_tokens": 590669333.0, + "step": 15485 + }, + { + "epoch": 1.9699783742526398, + "ewc_loss": 0.008098454214632511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.098454418359324e-05, + "grad_norm": 3.9585049152374268, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8916917443275452, + "num_tokens": 590709884.0, + "step": 15486 + }, + { + "epoch": 1.97010558453123, + "ewc_loss": 0.008093827404081821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093827636912465e-05, + "grad_norm": 4.052295207977295, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8821399211883545, + "num_tokens": 590746109.0, + "step": 15487 + }, + { + "epoch": 1.9702327948098206, + "ewc_loss": 0.008125555701553822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.125555905280635e-05, + "grad_norm": 3.9886014461517334, + "learning_rate": 1e-06, + "loss": 0.3947, + 
"mean_token_accuracy": 0.8611295223236084, + "num_tokens": 590787881.0, + "step": 15488 + }, + { + "epoch": 1.9703600050884111, + "ewc_loss": 0.008075684309005737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.075684309005737e-05, + "grad_norm": 4.026254177093506, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8622862100601196, + "num_tokens": 590826861.0, + "step": 15489 + }, + { + "epoch": 1.9704872153670017, + "ewc_loss": 0.00812908262014389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.12908256193623e-05, + "grad_norm": 4.0414228439331055, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8841211795806885, + "num_tokens": 590863104.0, + "step": 15490 + }, + { + "epoch": 1.9706144256455922, + "ewc_loss": 0.008114222437143326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.114222873700783e-05, + "grad_norm": 3.980686902999878, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8824359774589539, + "num_tokens": 590900582.0, + "step": 15491 + }, + { + "epoch": 1.9707416359241827, + "ewc_loss": 0.008090002462267876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.090002665994689e-05, + "grad_norm": 4.0081915855407715, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8833369016647339, + "num_tokens": 590937735.0, + "step": 15492 + }, + { + "epoch": 1.970868846202773, + "ewc_loss": 0.008129805326461792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.129805064527318e-05, + "grad_norm": 4.018031597137451, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8906385898590088, + "num_tokens": 590973089.0, + "step": 15493 + }, + { + "epoch": 1.9709960564813636, + "ewc_loss": 0.008106761611998081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.106761379167438e-05, + "grad_norm": 4.042091369628906, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8641976714134216, + "num_tokens": 591012117.0, + "step": 15494 + }, + { + "epoch": 
1.971123266759954, + "ewc_loss": 0.008135830983519554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.135831012623385e-05, + "grad_norm": 4.10373592376709, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8571909666061401, + "num_tokens": 591044978.0, + "step": 15495 + }, + { + "epoch": 1.9712504770385446, + "ewc_loss": 0.008163048885762691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.163048914866522e-05, + "grad_norm": 3.967808723449707, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8740474581718445, + "num_tokens": 591085562.0, + "step": 15496 + }, + { + "epoch": 1.9713776873171351, + "ewc_loss": 0.00808399636298418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08399636298418e-05, + "grad_norm": 4.0498809814453125, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8668157458305359, + "num_tokens": 591120140.0, + "step": 15497 + }, + { + "epoch": 1.9715048975957257, + "ewc_loss": 0.008182449266314507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.182448800653219e-05, + "grad_norm": 4.0705060958862305, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8678624629974365, + "num_tokens": 591153952.0, + "step": 15498 + }, + { + "epoch": 1.9716321078743162, + "ewc_loss": 0.008159820921719074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.15982130006887e-05, + "grad_norm": 3.9675066471099854, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8923821449279785, + "num_tokens": 591192983.0, + "step": 15499 + }, + { + "epoch": 1.9717593181529067, + "ewc_loss": 0.008117654360830784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.117654215311632e-05, + "grad_norm": 4.040370464324951, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8767774105072021, + "num_tokens": 591226260.0, + "step": 15500 + }, + { + "epoch": 1.9718865284314973, + "ewc_loss": 0.00818584579974413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.185845945263281e-05, + "grad_norm": 4.117993354797363, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8651784658432007, + "num_tokens": 591259361.0, + "step": 15501 + }, + { + "epoch": 1.9720137387100878, + "ewc_loss": 0.008211837150156498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.211837121052667e-05, + "grad_norm": 3.98477840423584, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8777458667755127, + "num_tokens": 591299753.0, + "step": 15502 + }, + { + "epoch": 1.9721409489886783, + "ewc_loss": 0.008098343387246132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.098343096207827e-05, + "grad_norm": 4.015698432922363, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8790467381477356, + "num_tokens": 591335008.0, + "step": 15503 + }, + { + "epoch": 1.9722681592672688, + "ewc_loss": 0.00818369910120964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.183699537767097e-05, + "grad_norm": 3.9811112880706787, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8833840489387512, + "num_tokens": 591373001.0, + "step": 15504 + }, + { + "epoch": 1.9723953695458594, + "ewc_loss": 0.008140838705003262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.140838326653466e-05, + "grad_norm": 4.044094562530518, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8798493146896362, + "num_tokens": 591408637.0, + "step": 15505 + }, + { + "epoch": 1.97252257982445, + "ewc_loss": 0.008204687386751175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.204687765100971e-05, + "grad_norm": 4.063984394073486, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8739657998085022, + "num_tokens": 591444908.0, + "step": 15506 + }, + { + "epoch": 1.9726497901030404, + "ewc_loss": 0.008190831169486046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.19083143142052e-05, + "grad_norm": 4.018712997436523, + "learning_rate": 1e-06, + "loss": 0.3527, + 
"mean_token_accuracy": 0.8780977725982666, + "num_tokens": 591484641.0, + "step": 15507 + }, + { + "epoch": 1.972777000381631, + "ewc_loss": 0.008159602992236614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.159603021340445e-05, + "grad_norm": 3.998905897140503, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8727344274520874, + "num_tokens": 591521859.0, + "step": 15508 + }, + { + "epoch": 1.9729042106602215, + "ewc_loss": 0.00818056333810091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.180563600035384e-05, + "grad_norm": 4.028749942779541, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8753870725631714, + "num_tokens": 591559124.0, + "step": 15509 + }, + { + "epoch": 1.973031420938812, + "ewc_loss": 0.008177255280315876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.177255222108215e-05, + "grad_norm": 3.992231607437134, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8758757710456848, + "num_tokens": 591597170.0, + "step": 15510 + }, + { + "epoch": 1.9731586312174023, + "ewc_loss": 0.008165336214005947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.165336475940421e-05, + "grad_norm": 4.075249671936035, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8827477097511292, + "num_tokens": 591634784.0, + "step": 15511 + }, + { + "epoch": 1.9732858414959928, + "ewc_loss": 0.008215418085455894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.215418347390369e-05, + "grad_norm": 4.035001754760742, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8798767328262329, + "num_tokens": 591671904.0, + "step": 15512 + }, + { + "epoch": 1.9734130517745834, + "ewc_loss": 0.008156980387866497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.156980038620532e-05, + "grad_norm": 4.012234687805176, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8771772384643555, + "num_tokens": 591712990.0, + "step": 15513 + }, + { + "epoch": 
1.973540262053174, + "ewc_loss": 0.008153196424245834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.153196540661156e-05, + "grad_norm": 4.050216197967529, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8715265393257141, + "num_tokens": 591748579.0, + "step": 15514 + }, + { + "epoch": 1.9736674723317644, + "ewc_loss": 0.008184071630239487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.184072066796944e-05, + "grad_norm": 4.037538528442383, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.869046151638031, + "num_tokens": 591785865.0, + "step": 15515 + }, + { + "epoch": 1.973794682610355, + "ewc_loss": 0.008176423609256744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.176423580152914e-05, + "grad_norm": 4.031225204467773, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8859999179840088, + "num_tokens": 591820087.0, + "step": 15516 + }, + { + "epoch": 1.9739218928889453, + "ewc_loss": 0.008159584365785122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.159584103850648e-05, + "grad_norm": 4.07317590713501, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8751733303070068, + "num_tokens": 591859713.0, + "step": 15517 + }, + { + "epoch": 1.9740491031675358, + "ewc_loss": 0.008170549757778645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.170549699570984e-05, + "grad_norm": 4.042806625366211, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8844354152679443, + "num_tokens": 591893142.0, + "step": 15518 + }, + { + "epoch": 1.9741763134461263, + "ewc_loss": 0.008144537918269634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.144538151100278e-05, + "grad_norm": 4.0688323974609375, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8716393709182739, + "num_tokens": 591931529.0, + "step": 15519 + }, + { + "epoch": 1.9743035237247168, + "ewc_loss": 0.008165398612618446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.165399049175903e-05, + "grad_norm": 3.967299461364746, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8815404176712036, + "num_tokens": 591976248.0, + "step": 15520 + }, + { + "epoch": 1.9744307340033074, + "ewc_loss": 0.008084178902208805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.084178989520296e-05, + "grad_norm": 3.9754598140716553, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8736286163330078, + "num_tokens": 592018008.0, + "step": 15521 + }, + { + "epoch": 1.974557944281898, + "ewc_loss": 0.008130887523293495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.130887727020308e-05, + "grad_norm": 3.993262767791748, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8613402843475342, + "num_tokens": 592062934.0, + "step": 15522 + }, + { + "epoch": 1.9746851545604884, + "ewc_loss": 0.008111060597002506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.111060742521659e-05, + "grad_norm": 4.0164055824279785, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8740781545639038, + "num_tokens": 592103129.0, + "step": 15523 + }, + { + "epoch": 1.974812364839079, + "ewc_loss": 0.008120515383780003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.120515121845528e-05, + "grad_norm": 4.073726654052734, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8720775842666626, + "num_tokens": 592142116.0, + "step": 15524 + }, + { + "epoch": 1.9749395751176695, + "ewc_loss": 0.008156550116837025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.156550029525533e-05, + "grad_norm": 4.022480487823486, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8771053552627563, + "num_tokens": 592182507.0, + "step": 15525 + }, + { + "epoch": 1.97506678539626, + "ewc_loss": 0.008095790632069111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.095790690276772e-05, + "grad_norm": 4.049609184265137, + "learning_rate": 1e-06, + "loss": 0.4214, + 
"mean_token_accuracy": 0.8562377691268921, + "num_tokens": 592225593.0, + "step": 15526 + }, + { + "epoch": 1.9751939956748505, + "ewc_loss": 0.008143991231918335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.143990999087691e-05, + "grad_norm": 3.930244207382202, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8991296291351318, + "num_tokens": 592270676.0, + "step": 15527 + }, + { + "epoch": 1.975321205953441, + "ewc_loss": 0.008038808591663837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.038809028221294e-05, + "grad_norm": 4.051897048950195, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8929944038391113, + "num_tokens": 592305537.0, + "step": 15528 + }, + { + "epoch": 1.9754484162320316, + "ewc_loss": 0.008158919401466846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.158919808920473e-05, + "grad_norm": 4.0408735275268555, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.882935643196106, + "num_tokens": 592338289.0, + "step": 15529 + }, + { + "epoch": 1.9755756265106221, + "ewc_loss": 0.008094302378594875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09430202934891e-05, + "grad_norm": 4.039319038391113, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8799908757209778, + "num_tokens": 592379189.0, + "step": 15530 + }, + { + "epoch": 1.9757028367892127, + "ewc_loss": 0.008102862164378166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.102862193481997e-05, + "grad_norm": 4.006438732147217, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8895378708839417, + "num_tokens": 592421875.0, + "step": 15531 + }, + { + "epoch": 1.9758300470678032, + "ewc_loss": 0.00806998647749424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.06998650659807e-05, + "grad_norm": 4.154022216796875, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8583455085754395, + "num_tokens": 592457438.0, + "step": 15532 + }, + { + "epoch": 
1.9759572573463937, + "ewc_loss": 0.00817115604877472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.171155786840245e-05, + "grad_norm": 4.023000717163086, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8865136504173279, + "num_tokens": 592493230.0, + "step": 15533 + }, + { + "epoch": 1.9760844676249842, + "ewc_loss": 0.008025357499718666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025357237784192e-05, + "grad_norm": 4.0098419189453125, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8606049418449402, + "num_tokens": 592533656.0, + "step": 15534 + }, + { + "epoch": 1.9762116779035748, + "ewc_loss": 0.008082029409706593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.082029671641067e-05, + "grad_norm": 3.976910352706909, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8888306617736816, + "num_tokens": 592571622.0, + "step": 15535 + }, + { + "epoch": 1.976338888182165, + "ewc_loss": 0.008075492456555367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.075492223724723e-05, + "grad_norm": 4.01621150970459, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8707410097122192, + "num_tokens": 592613260.0, + "step": 15536 + }, + { + "epoch": 1.9764660984607556, + "ewc_loss": 0.008105183951556683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.105183951556683e-05, + "grad_norm": 4.065426349639893, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8733944892883301, + "num_tokens": 592648872.0, + "step": 15537 + }, + { + "epoch": 1.9765933087393461, + "ewc_loss": 0.008112304843962193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.112304931273684e-05, + "grad_norm": 4.070448875427246, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.876820981502533, + "num_tokens": 592682336.0, + "step": 15538 + }, + { + "epoch": 1.9767205190179367, + "ewc_loss": 0.008104018867015839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.104019070742652e-05, + "grad_norm": 4.0248122215271, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8909743428230286, + "num_tokens": 592716412.0, + "step": 15539 + }, + { + "epoch": 1.9768477292965272, + "ewc_loss": 0.008090822026133537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.090821938822046e-05, + "grad_norm": 4.048563003540039, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8806700110435486, + "num_tokens": 592753936.0, + "step": 15540 + }, + { + "epoch": 1.9769749395751177, + "ewc_loss": 0.008121578954160213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.121578866848722e-05, + "grad_norm": 3.9937856197357178, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8772735595703125, + "num_tokens": 592792731.0, + "step": 15541 + }, + { + "epoch": 1.977102149853708, + "ewc_loss": 0.0080802571028471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.080257248366252e-05, + "grad_norm": 3.993680715560913, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8799459934234619, + "num_tokens": 592832035.0, + "step": 15542 + }, + { + "epoch": 1.9772293601322986, + "ewc_loss": 0.008102272637188435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.102272840915248e-05, + "grad_norm": 3.9884889125823975, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8934881687164307, + "num_tokens": 592869693.0, + "step": 15543 + }, + { + "epoch": 1.977356570410889, + "ewc_loss": 0.008097940124571323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.097940008156002e-05, + "grad_norm": 3.9873852729797363, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8777018785476685, + "num_tokens": 592909648.0, + "step": 15544 + }, + { + "epoch": 1.9774837806894796, + "ewc_loss": 0.008107560686767101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.107560279313475e-05, + "grad_norm": 4.106512546539307, + "learning_rate": 1e-06, + "loss": 0.3601, + 
"mean_token_accuracy": 0.8802787065505981, + "num_tokens": 592942539.0, + "step": 15545 + }, + { + "epoch": 1.9776109909680701, + "ewc_loss": 0.00816583912819624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.165839244611561e-05, + "grad_norm": 4.002143859863281, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8762115836143494, + "num_tokens": 592983680.0, + "step": 15546 + }, + { + "epoch": 1.9777382012466607, + "ewc_loss": 0.008074981160461903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.074980723904446e-05, + "grad_norm": 4.042721271514893, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8653887510299683, + "num_tokens": 593023161.0, + "step": 15547 + }, + { + "epoch": 1.9778654115252512, + "ewc_loss": 0.008131321519613266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.131321374094114e-05, + "grad_norm": 3.9887728691101074, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8601548671722412, + "num_tokens": 593066612.0, + "step": 15548 + }, + { + "epoch": 1.9779926218038417, + "ewc_loss": 0.008090387098491192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.090386836556718e-05, + "grad_norm": 4.059354782104492, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8669943809509277, + "num_tokens": 593105224.0, + "step": 15549 + }, + { + "epoch": 1.9781198320824323, + "ewc_loss": 0.008140330202877522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.140330464811996e-05, + "grad_norm": 4.000375270843506, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8788483142852783, + "num_tokens": 593143683.0, + "step": 15550 + }, + { + "epoch": 1.9782470423610228, + "ewc_loss": 0.00809088908135891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.090888877632096e-05, + "grad_norm": 4.052769184112549, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.863655686378479, + "num_tokens": 593179827.0, + "step": 15551 + }, + { + "epoch": 
1.9783742526396133, + "ewc_loss": 0.008133010938763618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.133010851452127e-05, + "grad_norm": 3.9849908351898193, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8776357173919678, + "num_tokens": 593220936.0, + "step": 15552 + }, + { + "epoch": 1.9785014629182038, + "ewc_loss": 0.00808656308799982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.086563320830464e-05, + "grad_norm": 4.014939308166504, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8717703223228455, + "num_tokens": 593258503.0, + "step": 15553 + }, + { + "epoch": 1.9786286731967944, + "ewc_loss": 0.008137485943734646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.13748556538485e-05, + "grad_norm": 4.110949516296387, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8567502498626709, + "num_tokens": 593293738.0, + "step": 15554 + }, + { + "epoch": 1.978755883475385, + "ewc_loss": 0.00818654615432024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.186545892385766e-05, + "grad_norm": 3.985039472579956, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8681422472000122, + "num_tokens": 593338037.0, + "step": 15555 + }, + { + "epoch": 1.9788830937539754, + "ewc_loss": 0.008081684820353985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.081684791250154e-05, + "grad_norm": 4.0261406898498535, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8612686395645142, + "num_tokens": 593383509.0, + "step": 15556 + }, + { + "epoch": 1.979010304032566, + "ewc_loss": 0.008159368298947811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.159368007909507e-05, + "grad_norm": 4.029913425445557, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8799875974655151, + "num_tokens": 593417734.0, + "step": 15557 + }, + { + "epoch": 1.9791375143111565, + "ewc_loss": 0.008142611011862755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.142611477524042e-05, + "grad_norm": 4.0160017013549805, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8882980942726135, + "num_tokens": 593454502.0, + "step": 15558 + }, + { + "epoch": 1.979264724589747, + "ewc_loss": 0.008141198195517063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.141198486555368e-05, + "grad_norm": 3.9854812622070312, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8722549676895142, + "num_tokens": 593498864.0, + "step": 15559 + }, + { + "epoch": 1.9793919348683373, + "ewc_loss": 0.008110363967716694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.11036370578222e-05, + "grad_norm": 3.994765043258667, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8723857402801514, + "num_tokens": 593540125.0, + "step": 15560 + }, + { + "epoch": 1.9795191451469278, + "ewc_loss": 0.008127096109092236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.127096225507557e-05, + "grad_norm": 4.115181922912598, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8867008090019226, + "num_tokens": 593566937.0, + "step": 15561 + }, + { + "epoch": 1.9796463554255184, + "ewc_loss": 0.008188718929886818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.188718493329361e-05, + "grad_norm": 4.0580620765686035, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8726435303688049, + "num_tokens": 593600109.0, + "step": 15562 + }, + { + "epoch": 1.979773565704109, + "ewc_loss": 0.00810301024466753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.103010623017326e-05, + "grad_norm": 4.0237321853637695, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8748066425323486, + "num_tokens": 593638982.0, + "step": 15563 + }, + { + "epoch": 1.9799007759826994, + "ewc_loss": 0.008123829029500484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.123829320538789e-05, + "grad_norm": 4.0525360107421875, + "learning_rate": 1e-06, + "loss": 0.3856, + 
"mean_token_accuracy": 0.869081974029541, + "num_tokens": 593677270.0, + "step": 15564 + }, + { + "epoch": 1.98002798626129, + "ewc_loss": 0.00814735982567072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.147359767463058e-05, + "grad_norm": 4.0351667404174805, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8831963539123535, + "num_tokens": 593714056.0, + "step": 15565 + }, + { + "epoch": 1.9801551965398803, + "ewc_loss": 0.008130237460136414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.1302372564096e-05, + "grad_norm": 4.0132551193237305, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8665151596069336, + "num_tokens": 593753516.0, + "step": 15566 + }, + { + "epoch": 1.9802824068184708, + "ewc_loss": 0.008135245181620121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.135245298035443e-05, + "grad_norm": 4.050897121429443, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8785593509674072, + "num_tokens": 593789276.0, + "step": 15567 + }, + { + "epoch": 1.9804096170970613, + "ewc_loss": 0.008163919672369957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.16391984699294e-05, + "grad_norm": 4.005458354949951, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8810187578201294, + "num_tokens": 593828112.0, + "step": 15568 + }, + { + "epoch": 1.9805368273756518, + "ewc_loss": 0.008114861324429512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.114861702779308e-05, + "grad_norm": 4.046785831451416, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8645700216293335, + "num_tokens": 593866370.0, + "step": 15569 + }, + { + "epoch": 1.9806640376542424, + "ewc_loss": 0.008179128170013428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.179128053598106e-05, + "grad_norm": 3.9796934127807617, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8806935548782349, + "num_tokens": 593907863.0, + "step": 15570 + }, + { + "epoch": 
1.980791247932833, + "ewc_loss": 0.008123413659632206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.123413863359019e-05, + "grad_norm": 4.013843059539795, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8857760429382324, + "num_tokens": 593945254.0, + "step": 15571 + }, + { + "epoch": 1.9809184582114234, + "ewc_loss": 0.008172205649316311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.172205707523972e-05, + "grad_norm": 4.003331661224365, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8789639472961426, + "num_tokens": 593987105.0, + "step": 15572 + }, + { + "epoch": 1.981045668490014, + "ewc_loss": 0.008132324554026127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.13232472864911e-05, + "grad_norm": 3.9803006649017334, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8889096975326538, + "num_tokens": 594024091.0, + "step": 15573 + }, + { + "epoch": 1.9811728787686045, + "ewc_loss": 0.008109291084110737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.109291229629889e-05, + "grad_norm": 4.054037094116211, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8864572048187256, + "num_tokens": 594056018.0, + "step": 15574 + }, + { + "epoch": 1.981300089047195, + "ewc_loss": 0.0081862797960639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.186279592337087e-05, + "grad_norm": 3.964615821838379, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8785459995269775, + "num_tokens": 594102297.0, + "step": 15575 + }, + { + "epoch": 1.9814272993257855, + "ewc_loss": 0.00808993075042963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.089930634014308e-05, + "grad_norm": 3.9935104846954346, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8774939775466919, + "num_tokens": 594140278.0, + "step": 15576 + }, + { + "epoch": 1.981554509604376, + "ewc_loss": 0.00814045500010252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.140454883687198e-05, + "grad_norm": 4.044284343719482, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.881462812423706, + "num_tokens": 594175025.0, + "step": 15577 + }, + { + "epoch": 1.9816817198829666, + "ewc_loss": 0.008132302202284336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.132302173180506e-05, + "grad_norm": 3.9903526306152344, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8750571608543396, + "num_tokens": 594217906.0, + "step": 15578 + }, + { + "epoch": 1.9818089301615571, + "ewc_loss": 0.008081783540546894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.081783744273707e-05, + "grad_norm": 4.002678394317627, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.858893632888794, + "num_tokens": 594264331.0, + "step": 15579 + }, + { + "epoch": 1.9819361404401477, + "ewc_loss": 0.008116474375128746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.116474782582372e-05, + "grad_norm": 3.9928805828094482, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8914303779602051, + "num_tokens": 594300220.0, + "step": 15580 + }, + { + "epoch": 1.9820633507187382, + "ewc_loss": 0.008077473379671574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.077473466983065e-05, + "grad_norm": 4.004197120666504, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8834432363510132, + "num_tokens": 594337849.0, + "step": 15581 + }, + { + "epoch": 1.9821905609973287, + "ewc_loss": 0.00810231827199459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.102318679448217e-05, + "grad_norm": 4.026496887207031, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8700327277183533, + "num_tokens": 594379295.0, + "step": 15582 + }, + { + "epoch": 1.9823177712759192, + "ewc_loss": 0.008088774047791958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.088773756753653e-05, + "grad_norm": 4.143396377563477, + "learning_rate": 1e-06, + "loss": 0.3632, + 
"mean_token_accuracy": 0.8729443550109863, + "num_tokens": 594408350.0, + "step": 15583 + }, + { + "epoch": 1.9824449815545098, + "ewc_loss": 0.008138904348015785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.138904377119616e-05, + "grad_norm": 3.9708964824676514, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8661710023880005, + "num_tokens": 594455110.0, + "step": 15584 + }, + { + "epoch": 1.9825721918331, + "ewc_loss": 0.007999171502888203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.999171793926507e-05, + "grad_norm": 3.977546453475952, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8817611932754517, + "num_tokens": 594499231.0, + "step": 15585 + }, + { + "epoch": 1.9826994021116906, + "ewc_loss": 0.00809139758348465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.091397467069328e-05, + "grad_norm": 4.026381015777588, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.857743501663208, + "num_tokens": 594541478.0, + "step": 15586 + }, + { + "epoch": 1.9828266123902811, + "ewc_loss": 0.008092844858765602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.092844655038789e-05, + "grad_norm": 4.05686616897583, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8791497945785522, + "num_tokens": 594579075.0, + "step": 15587 + }, + { + "epoch": 1.9829538226688717, + "ewc_loss": 0.00807128008455038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.071280171861872e-05, + "grad_norm": 3.9679460525512695, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.892897367477417, + "num_tokens": 594616189.0, + "step": 15588 + }, + { + "epoch": 1.9830810329474622, + "ewc_loss": 0.008034304715692997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.034304482862353e-05, + "grad_norm": 4.028297424316406, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8834679126739502, + "num_tokens": 594656633.0, + "step": 15589 + }, + { + "epoch": 
1.9832082432260527, + "ewc_loss": 0.008098442107439041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09844204923138e-05, + "grad_norm": 4.0408830642700195, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8592203855514526, + "num_tokens": 594696833.0, + "step": 15590 + }, + { + "epoch": 1.983335453504643, + "ewc_loss": 0.00806695781648159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066957525443286e-05, + "grad_norm": 4.022832870483398, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8636606931686401, + "num_tokens": 594735138.0, + "step": 15591 + }, + { + "epoch": 1.9834626637832335, + "ewc_loss": 0.008068066090345383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068065653787926e-05, + "grad_norm": 4.086292743682861, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8721466064453125, + "num_tokens": 594769587.0, + "step": 15592 + }, + { + "epoch": 1.983589874061824, + "ewc_loss": 0.008127700537443161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.127700129989535e-05, + "grad_norm": 4.166184902191162, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8816655874252319, + "num_tokens": 594803635.0, + "step": 15593 + }, + { + "epoch": 1.9837170843404146, + "ewc_loss": 0.008121262304484844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.121262362692505e-05, + "grad_norm": 3.988032341003418, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8914965391159058, + "num_tokens": 594839872.0, + "step": 15594 + }, + { + "epoch": 1.9838442946190051, + "ewc_loss": 0.008029824122786522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.029823948163539e-05, + "grad_norm": 4.008552551269531, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8630021810531616, + "num_tokens": 594879208.0, + "step": 15595 + }, + { + "epoch": 1.9839715048975957, + "ewc_loss": 0.008092914707958698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.092914504231885e-05, + "grad_norm": 4.052769184112549, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8826857209205627, + "num_tokens": 594910377.0, + "step": 15596 + }, + { + "epoch": 1.9840987151761862, + "ewc_loss": 0.008113650605082512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.113650983432308e-05, + "grad_norm": 3.970017671585083, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.878209114074707, + "num_tokens": 594956574.0, + "step": 15597 + }, + { + "epoch": 1.9842259254547767, + "ewc_loss": 0.008055771701037884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.05577146820724e-05, + "grad_norm": 3.9998667240142822, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8609068393707275, + "num_tokens": 594998036.0, + "step": 15598 + }, + { + "epoch": 1.9843531357333672, + "ewc_loss": 0.008130628615617752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.130628702929243e-05, + "grad_norm": 4.044131755828857, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8715655207633972, + "num_tokens": 595031404.0, + "step": 15599 + }, + { + "epoch": 1.9844803460119578, + "ewc_loss": 0.00813213363289833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.132133370963857e-05, + "grad_norm": 4.001524925231934, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8851060271263123, + "num_tokens": 595069800.0, + "step": 15600 + }, + { + "epoch": 1.9846075562905483, + "ewc_loss": 0.008090280927717686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09028060757555e-05, + "grad_norm": 4.028933048248291, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8823609352111816, + "num_tokens": 595110987.0, + "step": 15601 + }, + { + "epoch": 1.9847347665691388, + "ewc_loss": 0.008133415132761002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.133415394695476e-05, + "grad_norm": 4.038334369659424, + "learning_rate": 1e-06, + "loss": 0.3428, + 
"mean_token_accuracy": 0.8788647651672363, + "num_tokens": 595149192.0, + "step": 15602 + }, + { + "epoch": 1.9848619768477294, + "ewc_loss": 0.008115087635815144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.115087257465348e-05, + "grad_norm": 4.095970630645752, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.875638484954834, + "num_tokens": 595182993.0, + "step": 15603 + }, + { + "epoch": 1.9849891871263199, + "ewc_loss": 0.00812472216784954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.12472208053805e-05, + "grad_norm": 4.06419563293457, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8739652037620544, + "num_tokens": 595216177.0, + "step": 15604 + }, + { + "epoch": 1.9851163974049104, + "ewc_loss": 0.008100633509457111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.100633567664772e-05, + "grad_norm": 3.9856810569763184, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8779172897338867, + "num_tokens": 595253379.0, + "step": 15605 + }, + { + "epoch": 1.985243607683501, + "ewc_loss": 0.008074172772467136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.07417236501351e-05, + "grad_norm": 4.004200458526611, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8814206123352051, + "num_tokens": 595292878.0, + "step": 15606 + }, + { + "epoch": 1.9853708179620915, + "ewc_loss": 0.008113115094602108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.113115472951904e-05, + "grad_norm": 4.020545482635498, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.878905177116394, + "num_tokens": 595329082.0, + "step": 15607 + }, + { + "epoch": 1.985498028240682, + "ewc_loss": 0.008124603889882565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.124603482428938e-05, + "grad_norm": 4.057833194732666, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8691255450248718, + "num_tokens": 595367246.0, + "step": 15608 + }, + { + "epoch": 
1.9856252385192723, + "ewc_loss": 0.008145879954099655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.145879837684333e-05, + "grad_norm": 4.014946460723877, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8829815983772278, + "num_tokens": 595404450.0, + "step": 15609 + }, + { + "epoch": 1.9857524487978628, + "ewc_loss": 0.008098909631371498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.098909165710211e-05, + "grad_norm": 4.002604961395264, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8889190554618835, + "num_tokens": 595439718.0, + "step": 15610 + }, + { + "epoch": 1.9858796590764534, + "ewc_loss": 0.00811285525560379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.112854993669316e-05, + "grad_norm": 4.052491188049316, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8792369365692139, + "num_tokens": 595473617.0, + "step": 15611 + }, + { + "epoch": 1.986006869355044, + "ewc_loss": 0.008147637359797955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.147636981448159e-05, + "grad_norm": 3.991929292678833, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8733037114143372, + "num_tokens": 595516432.0, + "step": 15612 + }, + { + "epoch": 1.9861340796336344, + "ewc_loss": 0.008087601512670517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.087601599982008e-05, + "grad_norm": 4.011791229248047, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8784614205360413, + "num_tokens": 595556278.0, + "step": 15613 + }, + { + "epoch": 1.986261289912225, + "ewc_loss": 0.008125638589262962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.125638851197436e-05, + "grad_norm": 3.992140293121338, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8817092180252075, + "num_tokens": 595598713.0, + "step": 15614 + }, + { + "epoch": 1.9863885001908153, + "ewc_loss": 0.008096054196357727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.096054079942405e-05, + "grad_norm": 4.146214962005615, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8779316544532776, + "num_tokens": 595630597.0, + "step": 15615 + }, + { + "epoch": 1.9865157104694058, + "ewc_loss": 0.008195234462618828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.195234840968624e-05, + "grad_norm": 4.056722164154053, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8928110599517822, + "num_tokens": 595664163.0, + "step": 15616 + }, + { + "epoch": 1.9866429207479963, + "ewc_loss": 0.00807380024343729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.073799835983664e-05, + "grad_norm": 4.022305965423584, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8846989870071411, + "num_tokens": 595698809.0, + "step": 15617 + }, + { + "epoch": 1.9867701310265868, + "ewc_loss": 0.008109214715659618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.10921483207494e-05, + "grad_norm": 3.9875428676605225, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8722125291824341, + "num_tokens": 595741166.0, + "step": 15618 + }, + { + "epoch": 1.9868973413051774, + "ewc_loss": 0.008099627681076527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.099627302726731e-05, + "grad_norm": 4.003836154937744, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8723697662353516, + "num_tokens": 595787079.0, + "step": 15619 + }, + { + "epoch": 1.987024551583768, + "ewc_loss": 0.008088113740086555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.088113827398047e-05, + "grad_norm": 4.006806373596191, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.852540135383606, + "num_tokens": 595829085.0, + "step": 15620 + }, + { + "epoch": 1.9871517618623584, + "ewc_loss": 0.008115177042782307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.115176751744002e-05, + "grad_norm": 4.019179344177246, + "learning_rate": 1e-06, + "loss": 0.3603, + 
"mean_token_accuracy": 0.8757833242416382, + "num_tokens": 595870646.0, + "step": 15621 + }, + { + "epoch": 1.987278972140949, + "ewc_loss": 0.008106770925223827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.106770837912336e-05, + "grad_norm": 3.9703218936920166, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8671061992645264, + "num_tokens": 595918473.0, + "step": 15622 + }, + { + "epoch": 1.9874061824195395, + "ewc_loss": 0.008061242289841175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.061242260737345e-05, + "grad_norm": 4.048869609832764, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8665546774864197, + "num_tokens": 595954097.0, + "step": 15623 + }, + { + "epoch": 1.98753339269813, + "ewc_loss": 0.008127720095217228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.127719775075093e-05, + "grad_norm": 4.063080787658691, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8604112863540649, + "num_tokens": 595990191.0, + "step": 15624 + }, + { + "epoch": 1.9876606029767205, + "ewc_loss": 0.008110161870718002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.110162161756307e-05, + "grad_norm": 4.045480728149414, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8688259124755859, + "num_tokens": 596027109.0, + "step": 15625 + }, + { + "epoch": 1.987787813255311, + "ewc_loss": 0.008087478578090668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.087478636298329e-05, + "grad_norm": 3.9624693393707275, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8844110369682312, + "num_tokens": 596067930.0, + "step": 15626 + }, + { + "epoch": 1.9879150235339016, + "ewc_loss": 0.008067530579864979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.067530870903283e-05, + "grad_norm": 3.9931907653808594, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8856401443481445, + "num_tokens": 596105809.0, + "step": 15627 + }, + { + "epoch": 
1.9880422338124921, + "ewc_loss": 0.00810573436319828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.105734741548076e-05, + "grad_norm": 4.017123699188232, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8792049884796143, + "num_tokens": 596144220.0, + "step": 15628 + }, + { + "epoch": 1.9881694440910826, + "ewc_loss": 0.008106416091322899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.106415771180764e-05, + "grad_norm": 4.00870418548584, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8552436828613281, + "num_tokens": 596183825.0, + "step": 15629 + }, + { + "epoch": 1.9882966543696732, + "ewc_loss": 0.008087916299700737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.087915921350941e-05, + "grad_norm": 3.9229156970977783, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8844911456108093, + "num_tokens": 596232333.0, + "step": 15630 + }, + { + "epoch": 1.9884238646482637, + "ewc_loss": 0.008060711435973644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.060711115831509e-05, + "grad_norm": 4.019615650177002, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.884060263633728, + "num_tokens": 596272138.0, + "step": 15631 + }, + { + "epoch": 1.9885510749268542, + "ewc_loss": 0.00812376569956541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.123765292111784e-05, + "grad_norm": 4.003044128417969, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8850864171981812, + "num_tokens": 596310254.0, + "step": 15632 + }, + { + "epoch": 1.9886782852054448, + "ewc_loss": 0.008057299070060253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.057299419306219e-05, + "grad_norm": 4.011348247528076, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8760326504707336, + "num_tokens": 596350396.0, + "step": 15633 + }, + { + "epoch": 1.988805495484035, + "ewc_loss": 0.008074048906564713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.074048673734069e-05, + "grad_norm": 4.004018783569336, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8807394504547119, + "num_tokens": 596388809.0, + "step": 15634 + }, + { + "epoch": 1.9889327057626256, + "ewc_loss": 0.008066180162131786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.066180453170091e-05, + "grad_norm": 4.002052307128906, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8842113018035889, + "num_tokens": 596427366.0, + "step": 15635 + }, + { + "epoch": 1.9890599160412161, + "ewc_loss": 0.008055989630520344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.055989746935666e-05, + "grad_norm": 4.022176742553711, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8686122894287109, + "num_tokens": 596469999.0, + "step": 15636 + }, + { + "epoch": 1.9891871263198067, + "ewc_loss": 0.008082791231572628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.082791464403272e-05, + "grad_norm": 4.052153587341309, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.863235592842102, + "num_tokens": 596506773.0, + "step": 15637 + }, + { + "epoch": 1.9893143365983972, + "ewc_loss": 0.008082505315542221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.082505519269034e-05, + "grad_norm": 3.9947264194488525, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8767955899238586, + "num_tokens": 596547440.0, + "step": 15638 + }, + { + "epoch": 1.9894415468769877, + "ewc_loss": 0.008036291226744652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.036291546886787e-05, + "grad_norm": 4.03269624710083, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8853580951690674, + "num_tokens": 596580212.0, + "step": 15639 + }, + { + "epoch": 1.989568757155578, + "ewc_loss": 0.008103203028440475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.103203435894102e-05, + "grad_norm": 4.039086818695068, + "learning_rate": 1e-06, + "loss": 0.3011, + 
"mean_token_accuracy": 0.8978631496429443, + "num_tokens": 596611790.0, + "step": 15640 + }, + { + "epoch": 1.9896959674341685, + "ewc_loss": 0.008081524632871151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.081524720182642e-05, + "grad_norm": 4.0146870613098145, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8916093111038208, + "num_tokens": 596646984.0, + "step": 15641 + }, + { + "epoch": 1.989823177712759, + "ewc_loss": 0.008065606467425823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.065606380114332e-05, + "grad_norm": 4.038196563720703, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8742590546607971, + "num_tokens": 596688362.0, + "step": 15642 + }, + { + "epoch": 1.9899503879913496, + "ewc_loss": 0.008102630265057087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.102630090434104e-05, + "grad_norm": 4.056378364562988, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.879215657711029, + "num_tokens": 596725298.0, + "step": 15643 + }, + { + "epoch": 1.9900775982699401, + "ewc_loss": 0.00809819158166647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.098191756289452e-05, + "grad_norm": 4.041698932647705, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8834481239318848, + "num_tokens": 596757400.0, + "step": 15644 + }, + { + "epoch": 1.9902048085485307, + "ewc_loss": 0.008075115270912647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.075115329120308e-05, + "grad_norm": 4.00293493270874, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8878744840621948, + "num_tokens": 596790533.0, + "step": 15645 + }, + { + "epoch": 1.9903320188271212, + "ewc_loss": 0.008084220811724663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.084221190074459e-05, + "grad_norm": 4.01688814163208, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8712026476860046, + "num_tokens": 596834467.0, + "step": 15646 + }, + { + "epoch": 
1.9904592291057117, + "ewc_loss": 0.008093294687569141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.093295036815107e-05, + "grad_norm": 4.0058207511901855, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8634921908378601, + "num_tokens": 596873262.0, + "step": 15647 + }, + { + "epoch": 1.9905864393843022, + "ewc_loss": 0.008099597878754139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.099597471300513e-05, + "grad_norm": 4.016946792602539, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8769639730453491, + "num_tokens": 596912833.0, + "step": 15648 + }, + { + "epoch": 1.9907136496628928, + "ewc_loss": 0.008112716488540173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.112716750474647e-05, + "grad_norm": 3.9831607341766357, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8841367959976196, + "num_tokens": 596953554.0, + "step": 15649 + }, + { + "epoch": 1.9908408599414833, + "ewc_loss": 0.008093276061117649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09327611932531e-05, + "grad_norm": 3.9912831783294678, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8711603879928589, + "num_tokens": 596991206.0, + "step": 15650 + }, + { + "epoch": 1.9909680702200738, + "ewc_loss": 0.008113575167953968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.11357531347312e-05, + "grad_norm": 3.996849298477173, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8774684071540833, + "num_tokens": 597027222.0, + "step": 15651 + }, + { + "epoch": 1.9910952804986644, + "ewc_loss": 0.008137592114508152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.137592521961778e-05, + "grad_norm": 3.9796879291534424, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8855048418045044, + "num_tokens": 597069040.0, + "step": 15652 + }, + { + "epoch": 1.9912224907772549, + "ewc_loss": 0.008111825212836266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.111825445666909e-05, + "grad_norm": 4.017760753631592, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8696672320365906, + "num_tokens": 597112529.0, + "step": 15653 + }, + { + "epoch": 1.9913497010558454, + "ewc_loss": 0.008137413300573826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.13741353340447e-05, + "grad_norm": 4.106572151184082, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8801759481430054, + "num_tokens": 597148304.0, + "step": 15654 + }, + { + "epoch": 1.991476911334436, + "ewc_loss": 0.008175927214324474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.175927359843627e-05, + "grad_norm": 4.003579616546631, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8848451375961304, + "num_tokens": 597184244.0, + "step": 15655 + }, + { + "epoch": 1.9916041216130265, + "ewc_loss": 0.008090301416814327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.090301707852632e-05, + "grad_norm": 4.040323734283447, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.870939314365387, + "num_tokens": 597218178.0, + "step": 15656 + }, + { + "epoch": 1.991731331891617, + "ewc_loss": 0.008139478974044323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.139479177771136e-05, + "grad_norm": 3.9871721267700195, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8748937845230103, + "num_tokens": 597259256.0, + "step": 15657 + }, + { + "epoch": 1.9918585421702073, + "ewc_loss": 0.008085704408586025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.085704030236229e-05, + "grad_norm": 3.97487211227417, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8881099820137024, + "num_tokens": 597296584.0, + "step": 15658 + }, + { + "epoch": 1.9919857524487978, + "ewc_loss": 0.008125114254653454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.125114254653454e-05, + "grad_norm": 3.98354434967041, + "learning_rate": 1e-06, + "loss": 0.4096, + 
"mean_token_accuracy": 0.8624567985534668, + "num_tokens": 597341116.0, + "step": 15659 + }, + { + "epoch": 1.9921129627273884, + "ewc_loss": 0.008101875893771648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.101875573629513e-05, + "grad_norm": 4.0431904792785645, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8752167224884033, + "num_tokens": 597375595.0, + "step": 15660 + }, + { + "epoch": 1.9922401730059789, + "ewc_loss": 0.008161213248968124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.161213190760463e-05, + "grad_norm": 4.001580238342285, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8636772036552429, + "num_tokens": 597419226.0, + "step": 15661 + }, + { + "epoch": 1.9923673832845694, + "ewc_loss": 0.008108663372695446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.108663314487785e-05, + "grad_norm": 4.026689529418945, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8575340509414673, + "num_tokens": 597461242.0, + "step": 15662 + }, + { + "epoch": 1.99249459356316, + "ewc_loss": 0.008145368658006191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.145369065459818e-05, + "grad_norm": 4.026540279388428, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8759387731552124, + "num_tokens": 597500030.0, + "step": 15663 + }, + { + "epoch": 1.9926218038417502, + "ewc_loss": 0.00812545232474804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.125452586682513e-05, + "grad_norm": 4.050138473510742, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8618175983428955, + "num_tokens": 597540428.0, + "step": 15664 + }, + { + "epoch": 1.9927490141203408, + "ewc_loss": 0.008131083101034164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.131082722684368e-05, + "grad_norm": 4.022406578063965, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8874680995941162, + "num_tokens": 597575070.0, + "step": 15665 + }, + { + "epoch": 
1.9928762243989313, + "ewc_loss": 0.00811841618269682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.118416008073837e-05, + "grad_norm": 4.000086784362793, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.874792218208313, + "num_tokens": 597617425.0, + "step": 15666 + }, + { + "epoch": 1.9930034346775218, + "ewc_loss": 0.008109922520816326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.1099227827508e-05, + "grad_norm": 3.986809253692627, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.882342517375946, + "num_tokens": 597660489.0, + "step": 15667 + }, + { + "epoch": 1.9931306449561124, + "ewc_loss": 0.00809610914438963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.096109377220273e-05, + "grad_norm": 3.9732425212860107, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.882301926612854, + "num_tokens": 597703626.0, + "step": 15668 + }, + { + "epoch": 1.993257855234703, + "ewc_loss": 0.008094850927591324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.094850636553019e-05, + "grad_norm": 4.0156683921813965, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8729860782623291, + "num_tokens": 597741176.0, + "step": 15669 + }, + { + "epoch": 1.9933850655132934, + "ewc_loss": 0.00811165850609541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.111658098641783e-05, + "grad_norm": 4.069921016693115, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8636098504066467, + "num_tokens": 597779763.0, + "step": 15670 + }, + { + "epoch": 1.993512275791884, + "ewc_loss": 0.00813019648194313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.130196511046961e-05, + "grad_norm": 4.033663749694824, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8776556253433228, + "num_tokens": 597813300.0, + "step": 15671 + }, + { + "epoch": 1.9936394860704745, + "ewc_loss": 0.008079727180302143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.079727558651939e-05, + 
"grad_norm": 4.061131477355957, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8671616315841675, + "num_tokens": 597849062.0, + "step": 15672 + }, + { + "epoch": 1.993766696349065, + "ewc_loss": 0.008124989457428455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.12498910818249e-05, + "grad_norm": 4.030757904052734, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8780397176742554, + "num_tokens": 597886565.0, + "step": 15673 + }, + { + "epoch": 1.9938939066276555, + "ewc_loss": 0.00811317004263401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.11317004263401e-05, + "grad_norm": 4.016339302062988, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8785421252250671, + "num_tokens": 597925097.0, + "step": 15674 + }, + { + "epoch": 1.994021116906246, + "ewc_loss": 0.00810999795794487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.109997725114226e-05, + "grad_norm": 4.035791397094727, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8895283937454224, + "num_tokens": 597960786.0, + "step": 15675 + }, + { + "epoch": 1.9941483271848366, + "ewc_loss": 0.008150664158165455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.150663779815659e-05, + "grad_norm": 4.061678409576416, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8576221466064453, + "num_tokens": 598000442.0, + "step": 15676 + }, + { + "epoch": 1.9942755374634271, + "ewc_loss": 0.008141922764480114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.14192317193374e-05, + "grad_norm": 4.003734111785889, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8739774823188782, + "num_tokens": 598042946.0, + "step": 15677 + }, + { + "epoch": 1.9944027477420176, + "ewc_loss": 0.008086173795163631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.086174057098106e-05, + "grad_norm": 4.0645670890808105, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8561379313468933, + 
"num_tokens": 598081659.0, + "step": 15678 + }, + { + "epoch": 1.9945299580206082, + "ewc_loss": 0.008159525692462921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.159525896189734e-05, + "grad_norm": 4.027285099029541, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8720303177833557, + "num_tokens": 598120203.0, + "step": 15679 + }, + { + "epoch": 1.9946571682991987, + "ewc_loss": 0.008112635463476181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.112635259749368e-05, + "grad_norm": 4.053168296813965, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8634281158447266, + "num_tokens": 598154779.0, + "step": 15680 + }, + { + "epoch": 1.9947843785777892, + "ewc_loss": 0.008149564266204834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.149564382620156e-05, + "grad_norm": 3.9778802394866943, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8763407468795776, + "num_tokens": 598197075.0, + "step": 15681 + }, + { + "epoch": 1.9949115888563798, + "ewc_loss": 0.008087299764156342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08729964774102e-05, + "grad_norm": 3.9664289951324463, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8678858876228333, + "num_tokens": 598240539.0, + "step": 15682 + }, + { + "epoch": 1.99503879913497, + "ewc_loss": 0.008122689090669155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.122689177980646e-05, + "grad_norm": 4.006079196929932, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8778550624847412, + "num_tokens": 598281776.0, + "step": 15683 + }, + { + "epoch": 1.9951660094135606, + "ewc_loss": 0.008139904588460922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.139904821291566e-05, + "grad_norm": 3.986729621887207, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8813877105712891, + "num_tokens": 598322715.0, + "step": 15684 + }, + { + "epoch": 1.9952932196921511, + "ewc_loss": 
0.008119449019432068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.11944919405505e-05, + "grad_norm": 4.030317783355713, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8704105019569397, + "num_tokens": 598364495.0, + "step": 15685 + }, + { + "epoch": 1.9954204299707416, + "ewc_loss": 0.008145694620907307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.145695028360933e-05, + "grad_norm": 3.9854979515075684, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8729396462440491, + "num_tokens": 598407102.0, + "step": 15686 + }, + { + "epoch": 1.9955476402493322, + "ewc_loss": 0.008097934536635876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09793418738991e-05, + "grad_norm": 4.014363765716553, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8636372089385986, + "num_tokens": 598448600.0, + "step": 15687 + }, + { + "epoch": 1.9956748505279227, + "ewc_loss": 0.008150547742843628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.150547364493832e-05, + "grad_norm": 4.010433197021484, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8653545379638672, + "num_tokens": 598491140.0, + "step": 15688 + }, + { + "epoch": 1.995802060806513, + "ewc_loss": 0.008108987472951412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.108987094601616e-05, + "grad_norm": 4.007808208465576, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8732789754867554, + "num_tokens": 598527831.0, + "step": 15689 + }, + { + "epoch": 1.9959292710851035, + "ewc_loss": 0.008115340024232864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.115340460790321e-05, + "grad_norm": 4.022085189819336, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8638885617256165, + "num_tokens": 598568680.0, + "step": 15690 + }, + { + "epoch": 1.996056481363694, + "ewc_loss": 0.008114977739751339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.114977390505373e-05, + "grad_norm": 
4.032590389251709, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8655053973197937, + "num_tokens": 598606254.0, + "step": 15691 + }, + { + "epoch": 1.9961836916422846, + "ewc_loss": 0.008122694678604603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.122694998746738e-05, + "grad_norm": 4.031937599182129, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8778492212295532, + "num_tokens": 598642400.0, + "step": 15692 + }, + { + "epoch": 1.9963109019208751, + "ewc_loss": 0.008114763535559177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.114763477351516e-05, + "grad_norm": 4.040436267852783, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8556437492370605, + "num_tokens": 598678379.0, + "step": 15693 + }, + { + "epoch": 1.9964381121994657, + "ewc_loss": 0.008127397857606411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.127397450152785e-05, + "grad_norm": 4.014766216278076, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8745333552360535, + "num_tokens": 598718234.0, + "step": 15694 + }, + { + "epoch": 1.9965653224780562, + "ewc_loss": 0.008108015172183514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.108015026664361e-05, + "grad_norm": 4.04254150390625, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8785334229469299, + "num_tokens": 598754897.0, + "step": 15695 + }, + { + "epoch": 1.9966925327566467, + "ewc_loss": 0.008120995946228504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.120996062643826e-05, + "grad_norm": 4.105470180511475, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8707253336906433, + "num_tokens": 598785268.0, + "step": 15696 + }, + { + "epoch": 1.9968197430352372, + "ewc_loss": 0.00815292913466692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.152928785420954e-05, + "grad_norm": 3.967728614807129, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8812228441238403, + 
"num_tokens": 598827131.0, + "step": 15697 + }, + { + "epoch": 1.9969469533138278, + "ewc_loss": 0.008056028746068478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.056028309511021e-05, + "grad_norm": 4.077056884765625, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.876319169998169, + "num_tokens": 598859507.0, + "step": 15698 + }, + { + "epoch": 1.9970741635924183, + "ewc_loss": 0.008176425471901894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.176425035344437e-05, + "grad_norm": 4.083813667297363, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8671464323997498, + "num_tokens": 598893280.0, + "step": 15699 + }, + { + "epoch": 1.9972013738710088, + "ewc_loss": 0.008133758790791035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.133758819894865e-05, + "grad_norm": 3.976006269454956, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8881964683532715, + "num_tokens": 598932759.0, + "step": 15700 + }, + { + "epoch": 1.9973285841495994, + "ewc_loss": 0.008091552183032036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.09155244496651e-05, + "grad_norm": 4.045294761657715, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8861163854598999, + "num_tokens": 598972899.0, + "step": 15701 + }, + { + "epoch": 1.9974557944281899, + "ewc_loss": 0.008167999796569347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.167999476427212e-05, + "grad_norm": 3.9959564208984375, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8773630857467651, + "num_tokens": 599012811.0, + "step": 15702 + }, + { + "epoch": 1.9975830047067804, + "ewc_loss": 0.008100708946585655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.100708510028198e-05, + "grad_norm": 4.03568172454834, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8668370246887207, + "num_tokens": 599055632.0, + "step": 15703 + }, + { + "epoch": 1.997710214985371, + "ewc_loss": 0.008133815601468086, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.133815572364256e-05, + "grad_norm": 4.039623260498047, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8585392236709595, + "num_tokens": 599090924.0, + "step": 15704 + }, + { + "epoch": 1.9978374252639615, + "ewc_loss": 0.008144230581820011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.144230378093198e-05, + "grad_norm": 4.004735946655273, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8894104957580566, + "num_tokens": 599130434.0, + "step": 15705 + }, + { + "epoch": 1.997964635542552, + "ewc_loss": 0.008120973594486713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.120973507175222e-05, + "grad_norm": 3.989537477493286, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8746906518936157, + "num_tokens": 599171018.0, + "step": 15706 + }, + { + "epoch": 1.9980918458211423, + "ewc_loss": 0.008129466325044632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.129466004902497e-05, + "grad_norm": 4.012617111206055, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8661823272705078, + "num_tokens": 599212445.0, + "step": 15707 + }, + { + "epoch": 1.9982190560997328, + "ewc_loss": 0.008135289885103703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.13528968137689e-05, + "grad_norm": 4.009138107299805, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8805549144744873, + "num_tokens": 599250591.0, + "step": 15708 + }, + { + "epoch": 1.9983462663783234, + "ewc_loss": 0.00811914075165987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.119140693452209e-05, + "grad_norm": 4.0688066482543945, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8710228800773621, + "num_tokens": 599284987.0, + "step": 15709 + }, + { + "epoch": 1.9984734766569139, + "ewc_loss": 0.00815839134156704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.158391574397683e-05, + "grad_norm": 4.0214619636535645, + 
"learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8710544109344482, + "num_tokens": 599324656.0, + "step": 15710 + }, + { + "epoch": 1.9986006869355044, + "ewc_loss": 0.008114631287753582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.114631054922938e-05, + "grad_norm": 4.023226261138916, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8852412700653076, + "num_tokens": 599364014.0, + "step": 15711 + }, + { + "epoch": 1.998727897214095, + "ewc_loss": 0.008133647963404655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.13364822533913e-05, + "grad_norm": 4.071920871734619, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8672268390655518, + "num_tokens": 599396781.0, + "step": 15712 + }, + { + "epoch": 1.9988551074926852, + "ewc_loss": 0.00816999189555645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.169992361217737e-05, + "grad_norm": 4.040054798126221, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8844082355499268, + "num_tokens": 599433240.0, + "step": 15713 + }, + { + "epoch": 1.9989823177712758, + "ewc_loss": 0.008116720244288445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.116720709949732e-05, + "grad_norm": 3.983649492263794, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8851281404495239, + "num_tokens": 599474883.0, + "step": 15714 + }, + { + "epoch": 1.9991095280498663, + "ewc_loss": 0.008118110708892345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.118110417854041e-05, + "grad_norm": 4.089351654052734, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8755587935447693, + "num_tokens": 599508492.0, + "step": 15715 + }, + { + "epoch": 1.9992367383284568, + "ewc_loss": 0.008188272826373577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.188272477127612e-05, + "grad_norm": 4.021439075469971, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.876800537109375, + "num_tokens": 599545946.0, + 
"step": 15716 + }, + { + "epoch": 1.9993639486070474, + "ewc_loss": 0.008111495524644852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.111495117191225e-05, + "grad_norm": 3.9946413040161133, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8889639377593994, + "num_tokens": 599586217.0, + "step": 15717 + }, + { + "epoch": 1.9994911588856379, + "ewc_loss": 0.008107723668217659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.107723988359794e-05, + "grad_norm": 4.043478488922119, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8833742141723633, + "num_tokens": 599620431.0, + "step": 15718 + }, + { + "epoch": 1.9996183691642284, + "ewc_loss": 0.008146127685904503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.146127947838977e-05, + "grad_norm": 4.042858600616455, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8628630042076111, + "num_tokens": 599659959.0, + "step": 15719 + }, + { + "epoch": 1.999745579442819, + "ewc_loss": 0.0081245806068182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.124580199364573e-05, + "grad_norm": 4.04064416885376, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8746841549873352, + "num_tokens": 599696558.0, + "step": 15720 + }, + { + "epoch": 1.9998727897214095, + "ewc_loss": 0.008121422491967678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.121422433760017e-05, + "grad_norm": 4.025732040405273, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8648319244384766, + "num_tokens": 599734925.0, + "step": 15721 + }, + { + "epoch": 2.0, + "ewc_loss": 0.008116318844258785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.116319077089429e-05, + "grad_norm": 4.038788318634033, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.881106436252594, + "num_tokens": 599772613.0, + "step": 15722 + }, + { + "epoch": 2.0001272102785905, + "ewc_loss": 0.008121664635837078, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.12166472314857e-05, + "grad_norm": 3.948145627975464, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8844139575958252, + "num_tokens": 599813892.0, + "step": 15723 + }, + { + "epoch": 2.000254420557181, + "ewc_loss": 0.008078506216406822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.078506652964279e-05, + "grad_norm": 4.022725582122803, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8802395462989807, + "num_tokens": 599852807.0, + "step": 15724 + }, + { + "epoch": 2.0003816308357716, + "ewc_loss": 0.008139305748045444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.139306009979919e-05, + "grad_norm": 4.002503871917725, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8882824182510376, + "num_tokens": 599893712.0, + "step": 15725 + }, + { + "epoch": 2.000508841114362, + "ewc_loss": 0.00811498612165451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.11498612165451e-05, + "grad_norm": 4.046716690063477, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8825841546058655, + "num_tokens": 599929954.0, + "step": 15726 + }, + { + "epoch": 2.0006360513929526, + "ewc_loss": 0.008126572705805302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.126572356559336e-05, + "grad_norm": 3.997861862182617, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8803501129150391, + "num_tokens": 599967763.0, + "step": 15727 + }, + { + "epoch": 2.000763261671543, + "ewc_loss": 0.00809882115572691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.098821126623079e-05, + "grad_norm": 3.9644150733947754, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.888132631778717, + "num_tokens": 600008938.0, + "step": 15728 + }, + { + "epoch": 2.0008904719501337, + "ewc_loss": 0.00809249747544527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.092497591860592e-05, + "grad_norm": 4.025036811828613, + "learning_rate": 1e-06, + "loss": 0.3544, + 
"mean_token_accuracy": 0.8776280879974365, + "num_tokens": 600051849.0, + "step": 15729 + }, + { + "epoch": 2.0010176822287242, + "ewc_loss": 0.008127741515636444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.127741602947935e-05, + "grad_norm": 4.040634632110596, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8867803812026978, + "num_tokens": 600088568.0, + "step": 15730 + }, + { + "epoch": 2.0011448925073148, + "ewc_loss": 0.008106818422675133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.106818131636828e-05, + "grad_norm": 3.9875118732452393, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8852297067642212, + "num_tokens": 600131553.0, + "step": 15731 + }, + { + "epoch": 2.0012721027859053, + "ewc_loss": 0.008076018653810024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.076018275460228e-05, + "grad_norm": 4.020625591278076, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.881270170211792, + "num_tokens": 600173198.0, + "step": 15732 + }, + { + "epoch": 2.001399313064496, + "ewc_loss": 0.00810376089066267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.103760774247348e-05, + "grad_norm": 4.08017110824585, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8864886164665222, + "num_tokens": 600211347.0, + "step": 15733 + }, + { + "epoch": 2.0015265233430863, + "ewc_loss": 0.008123346604406834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.123346924548969e-05, + "grad_norm": 4.044607639312744, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8743172287940979, + "num_tokens": 600249953.0, + "step": 15734 + }, + { + "epoch": 2.0016537336216764, + "ewc_loss": 0.008054777048528194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.054776844801381e-05, + "grad_norm": 4.061720371246338, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8744258880615234, + "num_tokens": 600291211.0, + "step": 15735 + }, + { + "epoch": 
2.001780943900267, + "ewc_loss": 0.008112622424960136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.112622890621424e-05, + "grad_norm": 4.031686305999756, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8865987062454224, + "num_tokens": 600332906.0, + "step": 15736 + }, + { + "epoch": 2.0019081541788575, + "ewc_loss": 0.008075722493231297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.075722871581092e-05, + "grad_norm": 4.075013160705566, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8804527521133423, + "num_tokens": 600370570.0, + "step": 15737 + }, + { + "epoch": 2.002035364457448, + "ewc_loss": 0.008113939315080643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.11393911135383e-05, + "grad_norm": 4.054924488067627, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8734716176986694, + "num_tokens": 600409368.0, + "step": 15738 + }, + { + "epoch": 2.0021625747360385, + "ewc_loss": 0.008088286966085434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.088286995189264e-05, + "grad_norm": 4.088807106018066, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.886033296585083, + "num_tokens": 600442019.0, + "step": 15739 + }, + { + "epoch": 2.002289785014629, + "ewc_loss": 0.008116633631289005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.116633398458362e-05, + "grad_norm": 4.099992275238037, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8890228867530823, + "num_tokens": 600471665.0, + "step": 15740 + }, + { + "epoch": 2.0024169952932196, + "ewc_loss": 0.008116153068840504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.116153185255826e-05, + "grad_norm": 4.039176940917969, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8800742030143738, + "num_tokens": 600508804.0, + "step": 15741 + }, + { + "epoch": 2.00254420557181, + "ewc_loss": 0.008092920295894146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.092920324997976e-05, + "grad_norm": 4.075211048126221, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8710534572601318, + "num_tokens": 600543895.0, + "step": 15742 + }, + { + "epoch": 2.0026714158504006, + "ewc_loss": 0.008163856342434883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.163856546161696e-05, + "grad_norm": 4.00139045715332, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8818244934082031, + "num_tokens": 600585818.0, + "step": 15743 + }, + { + "epoch": 2.002798626128991, + "ewc_loss": 0.008118248544633389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.11824866104871e-05, + "grad_norm": 4.1029462814331055, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.889037013053894, + "num_tokens": 600623106.0, + "step": 15744 + }, + { + "epoch": 2.0029258364075817, + "ewc_loss": 0.008208579383790493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.208578947233036e-05, + "grad_norm": 3.981600284576416, + "learning_rate": 1e-06, + "loss": 0.2785, + "mean_token_accuracy": 0.9014616012573242, + "num_tokens": 600662662.0, + "step": 15745 + }, + { + "epoch": 2.0030530466861722, + "ewc_loss": 0.008104994893074036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.104994776658714e-05, + "grad_norm": 4.072104454040527, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8682121634483337, + "num_tokens": 600699416.0, + "step": 15746 + }, + { + "epoch": 2.0031802569647628, + "ewc_loss": 0.008215859532356262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.215859270421788e-05, + "grad_norm": 4.014281749725342, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8752378225326538, + "num_tokens": 600739351.0, + "step": 15747 + }, + { + "epoch": 2.0033074672433533, + "ewc_loss": 0.008160801604390144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.160801371559501e-05, + "grad_norm": 4.0078935623168945, + "learning_rate": 1e-06, + "loss": 0.3324, + 
"mean_token_accuracy": 0.8843246698379517, + "num_tokens": 600781198.0, + "step": 15748 + }, + { + "epoch": 2.003434677521944, + "ewc_loss": 0.008186249993741512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.186249760910869e-05, + "grad_norm": 4.05330228805542, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8802653551101685, + "num_tokens": 600821743.0, + "step": 15749 + }, + { + "epoch": 2.0035618878005343, + "ewc_loss": 0.008214599452912807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.214599802158773e-05, + "grad_norm": 4.030446529388428, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.894980251789093, + "num_tokens": 600862463.0, + "step": 15750 + }, + { + "epoch": 2.003689098079125, + "ewc_loss": 0.008169703185558319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.169702778104693e-05, + "grad_norm": 4.055654048919678, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8731988668441772, + "num_tokens": 600902629.0, + "step": 15751 + }, + { + "epoch": 2.0038163083577154, + "ewc_loss": 0.008202537894248962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.20253771962598e-05, + "grad_norm": 4.025631904602051, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8804767727851868, + "num_tokens": 600941778.0, + "step": 15752 + }, + { + "epoch": 2.003943518636306, + "ewc_loss": 0.008200548589229584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.200548472814262e-05, + "grad_norm": 4.074955463409424, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8897256851196289, + "num_tokens": 600979425.0, + "step": 15753 + }, + { + "epoch": 2.0040707289148965, + "ewc_loss": 0.008219698444008827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.219698065659031e-05, + "grad_norm": 4.029795169830322, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8795179724693298, + "num_tokens": 601017882.0, + "step": 15754 + }, + { + "epoch": 
2.004197939193487, + "ewc_loss": 0.008178943768143654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.178943971870467e-05, + "grad_norm": 4.035913944244385, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8773942589759827, + "num_tokens": 601053394.0, + "step": 15755 + }, + { + "epoch": 2.0043251494720775, + "ewc_loss": 0.008198685012757778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.19868510006927e-05, + "grad_norm": 3.9680049419403076, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8850866556167603, + "num_tokens": 601094940.0, + "step": 15756 + }, + { + "epoch": 2.004452359750668, + "ewc_loss": 0.008191495202481747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.191494998754933e-05, + "grad_norm": 4.072046756744385, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8836770057678223, + "num_tokens": 601132963.0, + "step": 15757 + }, + { + "epoch": 2.0045795700292586, + "ewc_loss": 0.008244853466749191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.244853233918548e-05, + "grad_norm": 4.046891212463379, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8813462853431702, + "num_tokens": 601168511.0, + "step": 15758 + }, + { + "epoch": 2.0047067803078487, + "ewc_loss": 0.008208435960114002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.208435610868037e-05, + "grad_norm": 4.014898777008057, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8824220299720764, + "num_tokens": 601208922.0, + "step": 15759 + }, + { + "epoch": 2.004833990586439, + "ewc_loss": 0.008180007338523865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.180006989277899e-05, + "grad_norm": 3.970186233520508, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8788241147994995, + "num_tokens": 601253814.0, + "step": 15760 + }, + { + "epoch": 2.0049612008650297, + "ewc_loss": 0.008192901499569416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.192901441361755e-05, + "grad_norm": 4.031655311584473, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8832471370697021, + "num_tokens": 601295584.0, + "step": 15761 + }, + { + "epoch": 2.0050884111436202, + "ewc_loss": 0.008230730891227722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.230731327785179e-05, + "grad_norm": 4.070125579833984, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8730065226554871, + "num_tokens": 601329134.0, + "step": 15762 + }, + { + "epoch": 2.0052156214222108, + "ewc_loss": 0.008231405168771744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.231405081460252e-05, + "grad_norm": 4.007176399230957, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8885757923126221, + "num_tokens": 601369100.0, + "step": 15763 + }, + { + "epoch": 2.0053428317008013, + "ewc_loss": 0.008192374370992184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.192373934434727e-05, + "grad_norm": 4.010291576385498, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.8956121802330017, + "num_tokens": 601407876.0, + "step": 15764 + }, + { + "epoch": 2.005470041979392, + "ewc_loss": 0.0082163717597723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.216372225433588e-05, + "grad_norm": 4.024163246154785, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8948403000831604, + "num_tokens": 601446996.0, + "step": 15765 + }, + { + "epoch": 2.0055972522579824, + "ewc_loss": 0.00820726715028286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.207267092075199e-05, + "grad_norm": 4.0743513107299805, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8759695291519165, + "num_tokens": 601482158.0, + "step": 15766 + }, + { + "epoch": 2.005724462536573, + "ewc_loss": 0.008229768835008144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.229768718592823e-05, + "grad_norm": 4.025162220001221, + "learning_rate": 1e-06, + "loss": 0.3221, + 
"mean_token_accuracy": 0.8873646259307861, + "num_tokens": 601517940.0, + "step": 15767 + }, + { + "epoch": 2.0058516728151634, + "ewc_loss": 0.008194457739591599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.194457768695429e-05, + "grad_norm": 4.057376384735107, + "learning_rate": 1e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.9006625413894653, + "num_tokens": 601553823.0, + "step": 15768 + }, + { + "epoch": 2.005978883093754, + "ewc_loss": 0.008225898258388042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.225898636737838e-05, + "grad_norm": 4.028688430786133, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8705021739006042, + "num_tokens": 601595482.0, + "step": 15769 + }, + { + "epoch": 2.0061060933723445, + "ewc_loss": 0.00821427907794714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.214278932427987e-05, + "grad_norm": 4.052628040313721, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8901935815811157, + "num_tokens": 601632451.0, + "step": 15770 + }, + { + "epoch": 2.006233303650935, + "ewc_loss": 0.008213664405047894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.213664114009589e-05, + "grad_norm": 4.074939727783203, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8813686966896057, + "num_tokens": 601667573.0, + "step": 15771 + }, + { + "epoch": 2.0063605139295255, + "ewc_loss": 0.008246839046478271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.246838842751458e-05, + "grad_norm": 4.074154853820801, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8879470825195312, + "num_tokens": 601704494.0, + "step": 15772 + }, + { + "epoch": 2.006487724208116, + "ewc_loss": 0.008217082358896732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.217082358896732e-05, + "grad_norm": 4.012176990509033, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8722199201583862, + "num_tokens": 601745307.0, + "step": 15773 + }, + { + "epoch": 
2.0066149344867066, + "ewc_loss": 0.00819282978773117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.192830136977136e-05, + "grad_norm": 4.040159702301025, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8966110944747925, + "num_tokens": 601782551.0, + "step": 15774 + }, + { + "epoch": 2.006742144765297, + "ewc_loss": 0.008225338533520699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.225338388001546e-05, + "grad_norm": 4.032977104187012, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8841191530227661, + "num_tokens": 601823378.0, + "step": 15775 + }, + { + "epoch": 2.0068693550438876, + "ewc_loss": 0.008215351961553097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.21535213617608e-05, + "grad_norm": 4.040466785430908, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8822637796401978, + "num_tokens": 601860881.0, + "step": 15776 + }, + { + "epoch": 2.006996565322478, + "ewc_loss": 0.008223799988627434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.223800250561908e-05, + "grad_norm": 4.00878381729126, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8805750608444214, + "num_tokens": 601903502.0, + "step": 15777 + }, + { + "epoch": 2.0071237756010687, + "ewc_loss": 0.008207950741052628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.207950304495171e-05, + "grad_norm": 4.070122241973877, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.899459183216095, + "num_tokens": 601937097.0, + "step": 15778 + }, + { + "epoch": 2.007250985879659, + "ewc_loss": 0.008249297738075256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.249297388829291e-05, + "grad_norm": 4.018561840057373, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8852620124816895, + "num_tokens": 601978050.0, + "step": 15779 + }, + { + "epoch": 2.0073781961582498, + "ewc_loss": 0.008192935958504677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.192935638362542e-05, + "grad_norm": 4.009805202484131, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8801493048667908, + "num_tokens": 602018167.0, + "step": 15780 + }, + { + "epoch": 2.0075054064368403, + "ewc_loss": 0.008207743987441063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.207744394894689e-05, + "grad_norm": 4.05316162109375, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8682239055633545, + "num_tokens": 602054485.0, + "step": 15781 + }, + { + "epoch": 2.007632616715431, + "ewc_loss": 0.008229843340814114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.229842933360487e-05, + "grad_norm": 4.018694877624512, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8881269693374634, + "num_tokens": 602092881.0, + "step": 15782 + }, + { + "epoch": 2.0077598269940213, + "ewc_loss": 0.008205096237361431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.205096673918888e-05, + "grad_norm": 4.033329963684082, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8844011425971985, + "num_tokens": 602127424.0, + "step": 15783 + }, + { + "epoch": 2.0078870372726114, + "ewc_loss": 0.008235598914325237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.235598943429068e-05, + "grad_norm": 4.019562721252441, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8897601366043091, + "num_tokens": 602168340.0, + "step": 15784 + }, + { + "epoch": 2.008014247551202, + "ewc_loss": 0.008201060816645622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.2010607002303e-05, + "grad_norm": 4.068055152893066, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8731029033660889, + "num_tokens": 602207894.0, + "step": 15785 + }, + { + "epoch": 2.0081414578297925, + "ewc_loss": 0.008256528526544571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.256528963102028e-05, + "grad_norm": 4.010340690612793, + "learning_rate": 1e-06, + "loss": 0.3433, + 
"mean_token_accuracy": 0.8843178153038025, + "num_tokens": 602245924.0, + "step": 15786 + }, + { + "epoch": 2.008268668108383, + "ewc_loss": 0.008210458792746067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.210459054680541e-05, + "grad_norm": 4.097200393676758, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8805105090141296, + "num_tokens": 602278417.0, + "step": 15787 + }, + { + "epoch": 2.0083958783869735, + "ewc_loss": 0.008295537903904915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29553755465895e-05, + "grad_norm": 3.9912161827087402, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8863565921783447, + "num_tokens": 602318431.0, + "step": 15788 + }, + { + "epoch": 2.008523088665564, + "ewc_loss": 0.008192768320441246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.192768291337416e-05, + "grad_norm": 4.044595718383789, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8735692501068115, + "num_tokens": 602356315.0, + "step": 15789 + }, + { + "epoch": 2.0086502989441546, + "ewc_loss": 0.008283525705337524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.283525676233694e-05, + "grad_norm": 3.9632835388183594, + "learning_rate": 1e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.8963345885276794, + "num_tokens": 602398865.0, + "step": 15790 + }, + { + "epoch": 2.008777509222745, + "ewc_loss": 0.008214243687689304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.214243280235678e-05, + "grad_norm": 4.034063339233398, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8934971690177917, + "num_tokens": 602435835.0, + "step": 15791 + }, + { + "epoch": 2.0089047195013356, + "ewc_loss": 0.008292148821055889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.292149141198024e-05, + "grad_norm": 4.047163486480713, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8907119631767273, + "num_tokens": 602470359.0, + "step": 15792 + }, + { + "epoch": 
2.009031929779926, + "ewc_loss": 0.008264980278909206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.264980715466663e-05, + "grad_norm": 4.036990165710449, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8870674967765808, + "num_tokens": 602512132.0, + "step": 15793 + }, + { + "epoch": 2.0091591400585167, + "ewc_loss": 0.008231529965996742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.231529500335455e-05, + "grad_norm": 4.049415111541748, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8797524571418762, + "num_tokens": 602552700.0, + "step": 15794 + }, + { + "epoch": 2.0092863503371072, + "ewc_loss": 0.008256793953478336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.256793807959184e-05, + "grad_norm": 4.0570878982543945, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8845762014389038, + "num_tokens": 602587779.0, + "step": 15795 + }, + { + "epoch": 2.0094135606156978, + "ewc_loss": 0.00825837068259716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.258370507974178e-05, + "grad_norm": 4.084775924682617, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.875727653503418, + "num_tokens": 602623834.0, + "step": 15796 + }, + { + "epoch": 2.0095407708942883, + "ewc_loss": 0.008265993557870388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.265993528766558e-05, + "grad_norm": 4.026489734649658, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8927976489067078, + "num_tokens": 602660141.0, + "step": 15797 + }, + { + "epoch": 2.009667981172879, + "ewc_loss": 0.008219030685722828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.21903086034581e-05, + "grad_norm": 4.056520938873291, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8849493265151978, + "num_tokens": 602698990.0, + "step": 15798 + }, + { + "epoch": 2.0097951914514693, + "ewc_loss": 0.008269006386399269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.269006502814591e-05, + "grad_norm": 4.113399028778076, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8652869462966919, + "num_tokens": 602732053.0, + "step": 15799 + }, + { + "epoch": 2.00992240173006, + "ewc_loss": 0.008288883604109287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.288883691420779e-05, + "grad_norm": 4.036823749542236, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8847351670265198, + "num_tokens": 602770975.0, + "step": 15800 + }, + { + "epoch": 2.0100496120086504, + "ewc_loss": 0.008259149268269539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.259149035438895e-05, + "grad_norm": 3.994412899017334, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.874245285987854, + "num_tokens": 602813207.0, + "step": 15801 + }, + { + "epoch": 2.010176822287241, + "ewc_loss": 0.008262830786406994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.262830669991672e-05, + "grad_norm": 4.053269386291504, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8877664804458618, + "num_tokens": 602849951.0, + "step": 15802 + }, + { + "epoch": 2.0103040325658315, + "ewc_loss": 0.008281812071800232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.281812188215554e-05, + "grad_norm": 4.008406639099121, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8758002519607544, + "num_tokens": 602891418.0, + "step": 15803 + }, + { + "epoch": 2.010431242844422, + "ewc_loss": 0.008263016119599342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.263016206910834e-05, + "grad_norm": 4.036680698394775, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8882619142532349, + "num_tokens": 602933039.0, + "step": 15804 + }, + { + "epoch": 2.0105584531230125, + "ewc_loss": 0.008281211368739605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.281211194116622e-05, + "grad_norm": 4.0550360679626465, + "learning_rate": 1e-06, + "loss": 0.2977, + 
"mean_token_accuracy": 0.8978878259658813, + "num_tokens": 602965546.0, + "step": 15805 + }, + { + "epoch": 2.010685663401603, + "ewc_loss": 0.00829089805483818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.290898404084146e-05, + "grad_norm": 4.034308433532715, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.883406937122345, + "num_tokens": 603004473.0, + "step": 15806 + }, + { + "epoch": 2.0108128736801936, + "ewc_loss": 0.008271031081676483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.271030674222857e-05, + "grad_norm": 4.069866180419922, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.875474214553833, + "num_tokens": 603041644.0, + "step": 15807 + }, + { + "epoch": 2.0109400839587837, + "ewc_loss": 0.008286470547318459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286470256280154e-05, + "grad_norm": 3.999236822128296, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.871056079864502, + "num_tokens": 603084698.0, + "step": 15808 + }, + { + "epoch": 2.011067294237374, + "ewc_loss": 0.00826436560600996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.264365897048265e-05, + "grad_norm": 4.0793914794921875, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8685551881790161, + "num_tokens": 603121059.0, + "step": 15809 + }, + { + "epoch": 2.0111945045159647, + "ewc_loss": 0.008323476649820805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323476504301652e-05, + "grad_norm": 4.0912675857543945, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8804499506950378, + "num_tokens": 603156604.0, + "step": 15810 + }, + { + "epoch": 2.0113217147945552, + "ewc_loss": 0.00829553697258234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.295536827063188e-05, + "grad_norm": 4.129166603088379, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8829431533813477, + "num_tokens": 603183743.0, + "step": 15811 + }, + { + "epoch": 
2.0114489250731458, + "ewc_loss": 0.008323042653501034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323042857227847e-05, + "grad_norm": 4.078892230987549, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8592040538787842, + "num_tokens": 603222099.0, + "step": 15812 + }, + { + "epoch": 2.0115761353517363, + "ewc_loss": 0.008305472321808338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.305472147185355e-05, + "grad_norm": 4.04892110824585, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8654658794403076, + "num_tokens": 603263467.0, + "step": 15813 + }, + { + "epoch": 2.011703345630327, + "ewc_loss": 0.00832407083362341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324070950038731e-05, + "grad_norm": 4.105722904205322, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8638455271720886, + "num_tokens": 603296726.0, + "step": 15814 + }, + { + "epoch": 2.0118305559089174, + "ewc_loss": 0.008375848643481731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375848847208545e-05, + "grad_norm": 4.045277118682861, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8883316516876221, + "num_tokens": 603329691.0, + "step": 15815 + }, + { + "epoch": 2.011957766187508, + "ewc_loss": 0.008324838243424892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324838563567027e-05, + "grad_norm": 3.994198799133301, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8886176943778992, + "num_tokens": 603371619.0, + "step": 15816 + }, + { + "epoch": 2.0120849764660984, + "ewc_loss": 0.008307702839374542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.307702955789864e-05, + "grad_norm": 4.183811664581299, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8843808174133301, + "num_tokens": 603400018.0, + "step": 15817 + }, + { + "epoch": 2.012212186744689, + "ewc_loss": 0.00846132542937994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.461325342068449e-05, + "grad_norm": 4.00784969329834, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8854806423187256, + "num_tokens": 603436971.0, + "step": 15818 + }, + { + "epoch": 2.0123393970232795, + "ewc_loss": 0.008310380391776562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.310380508191884e-05, + "grad_norm": 4.081154823303223, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8828821778297424, + "num_tokens": 603472802.0, + "step": 15819 + }, + { + "epoch": 2.01246660730187, + "ewc_loss": 0.008439386263489723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439386147074401e-05, + "grad_norm": 4.092266082763672, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8875491619110107, + "num_tokens": 603506547.0, + "step": 15820 + }, + { + "epoch": 2.0125938175804605, + "ewc_loss": 0.008389238268136978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389238064410165e-05, + "grad_norm": 4.085285663604736, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8658828735351562, + "num_tokens": 603544967.0, + "step": 15821 + }, + { + "epoch": 2.012721027859051, + "ewc_loss": 0.00839855894446373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.398559293709695e-05, + "grad_norm": 4.05934476852417, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8769416809082031, + "num_tokens": 603582522.0, + "step": 15822 + }, + { + "epoch": 2.0128482381376416, + "ewc_loss": 0.008391115814447403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391115989070386e-05, + "grad_norm": 4.05076265335083, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8910648226737976, + "num_tokens": 603619584.0, + "step": 15823 + }, + { + "epoch": 2.012975448416232, + "ewc_loss": 0.008389541879296303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389542199438438e-05, + "grad_norm": 4.058684349060059, + "learning_rate": 1e-06, + "loss": 0.3834, + 
"mean_token_accuracy": 0.8701372146606445, + "num_tokens": 603660542.0, + "step": 15824 + }, + { + "epoch": 2.0131026586948226, + "ewc_loss": 0.008394517004489899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.394516771659255e-05, + "grad_norm": 4.0322065353393555, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8828452825546265, + "num_tokens": 603698900.0, + "step": 15825 + }, + { + "epoch": 2.013229868973413, + "ewc_loss": 0.008369949646294117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369949500774965e-05, + "grad_norm": 4.143672466278076, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8665422797203064, + "num_tokens": 603733019.0, + "step": 15826 + }, + { + "epoch": 2.0133570792520037, + "ewc_loss": 0.008439387194812298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439386874670163e-05, + "grad_norm": 4.016633033752441, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8908745050430298, + "num_tokens": 603772071.0, + "step": 15827 + }, + { + "epoch": 2.013484289530594, + "ewc_loss": 0.008334454149007797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.334454469149932e-05, + "grad_norm": 4.057584285736084, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8762184977531433, + "num_tokens": 603811008.0, + "step": 15828 + }, + { + "epoch": 2.0136114998091847, + "ewc_loss": 0.008391411043703556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391411392949522e-05, + "grad_norm": 4.05219030380249, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8725031018257141, + "num_tokens": 603852647.0, + "step": 15829 + }, + { + "epoch": 2.0137387100877753, + "ewc_loss": 0.008365869522094727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.365869871340692e-05, + "grad_norm": 4.0544514656066895, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8814992904663086, + "num_tokens": 603894198.0, + "step": 15830 + }, + { + "epoch": 
2.013865920366366, + "ewc_loss": 0.008352487348020077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352487202500924e-05, + "grad_norm": 4.073405742645264, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8794354200363159, + "num_tokens": 603930168.0, + "step": 15831 + }, + { + "epoch": 2.0139931306449563, + "ewc_loss": 0.008349601179361343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.349600830115378e-05, + "grad_norm": 4.044231414794922, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.891035795211792, + "num_tokens": 603966189.0, + "step": 15832 + }, + { + "epoch": 2.0141203409235464, + "ewc_loss": 0.0083330562338233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333056030096486e-05, + "grad_norm": 4.019408702850342, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8771775960922241, + "num_tokens": 604011110.0, + "step": 15833 + }, + { + "epoch": 2.014247551202137, + "ewc_loss": 0.008305992931127548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.305993105750531e-05, + "grad_norm": 4.140810966491699, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.867615282535553, + "num_tokens": 604044256.0, + "step": 15834 + }, + { + "epoch": 2.0143747614807275, + "ewc_loss": 0.008390406146645546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390405855607241e-05, + "grad_norm": 4.080545902252197, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.890196681022644, + "num_tokens": 604078652.0, + "step": 15835 + }, + { + "epoch": 2.014501971759318, + "ewc_loss": 0.008321481756865978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321481436723843e-05, + "grad_norm": 4.081544876098633, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8918954133987427, + "num_tokens": 604112302.0, + "step": 15836 + }, + { + "epoch": 2.0146291820379085, + "ewc_loss": 0.008346968330442905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.346968388650566e-05, + "grad_norm": 3.9916725158691406, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8974342346191406, + "num_tokens": 604150011.0, + "step": 15837 + }, + { + "epoch": 2.014756392316499, + "ewc_loss": 0.008319857530295849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.319857442984357e-05, + "grad_norm": 4.039203643798828, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8831751346588135, + "num_tokens": 604188859.0, + "step": 15838 + }, + { + "epoch": 2.0148836025950896, + "ewc_loss": 0.008380495943129063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.380496001336724e-05, + "grad_norm": 4.023797988891602, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8783484101295471, + "num_tokens": 604232273.0, + "step": 15839 + }, + { + "epoch": 2.01501081287368, + "ewc_loss": 0.008332298137247562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.332297875313088e-05, + "grad_norm": 4.077520370483398, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8876417279243469, + "num_tokens": 604265236.0, + "step": 15840 + }, + { + "epoch": 2.0151380231522706, + "ewc_loss": 0.008380189538002014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.380189683521166e-05, + "grad_norm": 4.009118556976318, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8946050405502319, + "num_tokens": 604304784.0, + "step": 15841 + }, + { + "epoch": 2.015265233430861, + "ewc_loss": 0.008303912356495857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303912181872874e-05, + "grad_norm": 4.035407543182373, + "learning_rate": 1e-06, + "loss": 0.2688, + "mean_token_accuracy": 0.9064571857452393, + "num_tokens": 604341461.0, + "step": 15842 + }, + { + "epoch": 2.0153924437094517, + "ewc_loss": 0.008345150388777256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.345150126842782e-05, + "grad_norm": 4.029598712921143, + "learning_rate": 1e-06, + "loss": 0.3123, + 
"mean_token_accuracy": 0.8875503540039062, + "num_tokens": 604378628.0, + "step": 15843 + }, + { + "epoch": 2.0155196539880422, + "ewc_loss": 0.008328350260853767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328350668307394e-05, + "grad_norm": 4.033412456512451, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8919768333435059, + "num_tokens": 604418070.0, + "step": 15844 + }, + { + "epoch": 2.0156468642666328, + "ewc_loss": 0.008308690041303635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.308689575642347e-05, + "grad_norm": 4.0307698249816895, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8943033218383789, + "num_tokens": 604454445.0, + "step": 15845 + }, + { + "epoch": 2.0157740745452233, + "ewc_loss": 0.00830545648932457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.305456867674366e-05, + "grad_norm": 4.082284450531006, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8864642381668091, + "num_tokens": 604488559.0, + "step": 15846 + }, + { + "epoch": 2.015901284823814, + "ewc_loss": 0.008333715610206127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333715231856331e-05, + "grad_norm": 4.1201019287109375, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8767516613006592, + "num_tokens": 604522538.0, + "step": 15847 + }, + { + "epoch": 2.0160284951024043, + "ewc_loss": 0.008325210772454739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.325211092596874e-05, + "grad_norm": 4.004237174987793, + "learning_rate": 1e-06, + "loss": 0.2783, + "mean_token_accuracy": 0.8998726606369019, + "num_tokens": 604561750.0, + "step": 15848 + }, + { + "epoch": 2.016155705380995, + "ewc_loss": 0.008236519061028957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.236519352067262e-05, + "grad_norm": 4.080033779144287, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8950592279434204, + "num_tokens": 604594961.0, + "step": 15849 + }, + { + "epoch": 
2.0162829156595854, + "ewc_loss": 0.008328106254339218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328106196131557e-05, + "grad_norm": 4.065608024597168, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8744139075279236, + "num_tokens": 604631301.0, + "step": 15850 + }, + { + "epoch": 2.016410125938176, + "ewc_loss": 0.008284954354166985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.284953946713358e-05, + "grad_norm": 4.162173271179199, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8642096519470215, + "num_tokens": 604666049.0, + "step": 15851 + }, + { + "epoch": 2.0165373362167665, + "ewc_loss": 0.008328703232109547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32870282465592e-05, + "grad_norm": 3.999469757080078, + "learning_rate": 1e-06, + "loss": 0.2788, + "mean_token_accuracy": 0.9031338095664978, + "num_tokens": 604701984.0, + "step": 15852 + }, + { + "epoch": 2.016664546495357, + "ewc_loss": 0.008209298364818096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.209298539441079e-05, + "grad_norm": 4.028087139129639, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.890940248966217, + "num_tokens": 604738414.0, + "step": 15853 + }, + { + "epoch": 2.0167917567739475, + "ewc_loss": 0.008283572271466255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.283572242362425e-05, + "grad_norm": 4.018816947937012, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8850206136703491, + "num_tokens": 604779961.0, + "step": 15854 + }, + { + "epoch": 2.016918967052538, + "ewc_loss": 0.008272257633507252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.272257400676608e-05, + "grad_norm": 4.084692001342773, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.877166211605072, + "num_tokens": 604819729.0, + "step": 15855 + }, + { + "epoch": 2.0170461773311286, + "ewc_loss": 0.008303767070174217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.303767390316352e-05, + "grad_norm": 4.012614727020264, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8918626308441162, + "num_tokens": 604864108.0, + "step": 15856 + }, + { + "epoch": 2.0171733876097186, + "ewc_loss": 0.00824776105582714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.247760706581175e-05, + "grad_norm": 4.035298824310303, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8764087557792664, + "num_tokens": 604905582.0, + "step": 15857 + }, + { + "epoch": 2.017300597888309, + "ewc_loss": 0.008271526545286179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.271526166936383e-05, + "grad_norm": 4.02617883682251, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8911713361740112, + "num_tokens": 604943931.0, + "step": 15858 + }, + { + "epoch": 2.0174278081668997, + "ewc_loss": 0.00824920367449522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.249203528976068e-05, + "grad_norm": 4.0674285888671875, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8794034123420715, + "num_tokens": 604982100.0, + "step": 15859 + }, + { + "epoch": 2.0175550184454902, + "ewc_loss": 0.008289070799946785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.289070683531463e-05, + "grad_norm": 4.033299446105957, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8709424734115601, + "num_tokens": 605026289.0, + "step": 15860 + }, + { + "epoch": 2.0176822287240808, + "ewc_loss": 0.008233149535953999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.233149856096134e-05, + "grad_norm": 4.0080766677856445, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8863275051116943, + "num_tokens": 605070214.0, + "step": 15861 + }, + { + "epoch": 2.0178094390026713, + "ewc_loss": 0.008231445215642452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.23144509922713e-05, + "grad_norm": 3.9969892501831055, + "learning_rate": 1e-06, + "loss": 0.3522, + 
"mean_token_accuracy": 0.8775349259376526, + "num_tokens": 605113495.0, + "step": 15862 + }, + { + "epoch": 2.017936649281262, + "ewc_loss": 0.008237824775278568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.237824658863246e-05, + "grad_norm": 4.122635841369629, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8874289989471436, + "num_tokens": 605145717.0, + "step": 15863 + }, + { + "epoch": 2.0180638595598523, + "ewc_loss": 0.008320525288581848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32052537589334e-05, + "grad_norm": 4.173659801483154, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8758191466331482, + "num_tokens": 605180930.0, + "step": 15864 + }, + { + "epoch": 2.018191069838443, + "ewc_loss": 0.00827209372073412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.272093691630289e-05, + "grad_norm": 4.066252708435059, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8894706964492798, + "num_tokens": 605214004.0, + "step": 15865 + }, + { + "epoch": 2.0183182801170334, + "ewc_loss": 0.008201619610190392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.20161949377507e-05, + "grad_norm": 4.036223411560059, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8818333745002747, + "num_tokens": 605253244.0, + "step": 15866 + }, + { + "epoch": 2.018445490395624, + "ewc_loss": 0.00822425913065672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.224259363487363e-05, + "grad_norm": 4.04281759262085, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8786643743515015, + "num_tokens": 605293832.0, + "step": 15867 + }, + { + "epoch": 2.0185727006742145, + "ewc_loss": 0.008237182162702084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.237182191805914e-05, + "grad_norm": 4.080235004425049, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.893059492111206, + "num_tokens": 605333811.0, + "step": 15868 + }, + { + "epoch": 
2.018699910952805, + "ewc_loss": 0.00824472401291132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.244724449468777e-05, + "grad_norm": 4.0511040687561035, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8839185237884521, + "num_tokens": 605371560.0, + "step": 15869 + }, + { + "epoch": 2.0188271212313955, + "ewc_loss": 0.008227099664509296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.22709989733994e-05, + "grad_norm": 4.054763317108154, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8804135322570801, + "num_tokens": 605410555.0, + "step": 15870 + }, + { + "epoch": 2.018954331509986, + "ewc_loss": 0.008235448971390724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.235449058702216e-05, + "grad_norm": 4.04373025894165, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8771030902862549, + "num_tokens": 605451851.0, + "step": 15871 + }, + { + "epoch": 2.0190815417885766, + "ewc_loss": 0.00823826715350151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.238267037086189e-05, + "grad_norm": 4.030986785888672, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8754281997680664, + "num_tokens": 605490442.0, + "step": 15872 + }, + { + "epoch": 2.019208752067167, + "ewc_loss": 0.008229581639170647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.229581726482138e-05, + "grad_norm": 4.085456848144531, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8829513788223267, + "num_tokens": 605525237.0, + "step": 15873 + }, + { + "epoch": 2.0193359623457576, + "ewc_loss": 0.008276170119643211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.276169683085755e-05, + "grad_norm": 4.037635326385498, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8885395526885986, + "num_tokens": 605563624.0, + "step": 15874 + }, + { + "epoch": 2.019463172624348, + "ewc_loss": 0.00822457205504179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.224572229664773e-05, + "grad_norm": 4.085082530975342, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.893246054649353, + "num_tokens": 605597419.0, + "step": 15875 + }, + { + "epoch": 2.0195903829029387, + "ewc_loss": 0.008279134519398212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.279134635813534e-05, + "grad_norm": 4.014739990234375, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8762800693511963, + "num_tokens": 605639954.0, + "step": 15876 + }, + { + "epoch": 2.019717593181529, + "ewc_loss": 0.008233352564275265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.233352127717808e-05, + "grad_norm": 4.030362606048584, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8833531737327576, + "num_tokens": 605678226.0, + "step": 15877 + }, + { + "epoch": 2.0198448034601197, + "ewc_loss": 0.008281904272735119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.281903865281492e-05, + "grad_norm": 4.0344767570495605, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8920730352401733, + "num_tokens": 605719925.0, + "step": 15878 + }, + { + "epoch": 2.0199720137387103, + "ewc_loss": 0.008270799182355404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.270799298770726e-05, + "grad_norm": 4.074779033660889, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8652706146240234, + "num_tokens": 605761282.0, + "step": 15879 + }, + { + "epoch": 2.020099224017301, + "ewc_loss": 0.008284296840429306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.284296927740797e-05, + "grad_norm": 4.020925521850586, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8921748995780945, + "num_tokens": 605800681.0, + "step": 15880 + }, + { + "epoch": 2.0202264342958913, + "ewc_loss": 0.008237423375248909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.237423026002944e-05, + "grad_norm": 4.1330413818359375, + "learning_rate": 1e-06, + "loss": 0.3934, + 
"mean_token_accuracy": 0.8664960861206055, + "num_tokens": 605836282.0, + "step": 15881 + }, + { + "epoch": 2.0203536445744814, + "ewc_loss": 0.008330573327839375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330573473358527e-05, + "grad_norm": 4.011181354522705, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.884533166885376, + "num_tokens": 605880360.0, + "step": 15882 + }, + { + "epoch": 2.020480854853072, + "ewc_loss": 0.008216525427997112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.216525020543486e-05, + "grad_norm": 4.043613910675049, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8921279907226562, + "num_tokens": 605918363.0, + "step": 15883 + }, + { + "epoch": 2.0206080651316625, + "ewc_loss": 0.00827389769256115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.273898129118606e-05, + "grad_norm": 4.050165176391602, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8959459066390991, + "num_tokens": 605953032.0, + "step": 15884 + }, + { + "epoch": 2.020735275410253, + "ewc_loss": 0.008267260156571865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.267260272987187e-05, + "grad_norm": 4.029373645782471, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8884568214416504, + "num_tokens": 605992633.0, + "step": 15885 + }, + { + "epoch": 2.0208624856888435, + "ewc_loss": 0.00823521614074707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.235216228058562e-05, + "grad_norm": 4.0710062980651855, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8833957314491272, + "num_tokens": 606031776.0, + "step": 15886 + }, + { + "epoch": 2.020989695967434, + "ewc_loss": 0.008274865336716175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.274865103885531e-05, + "grad_norm": 4.085326194763184, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8757069110870361, + "num_tokens": 606070614.0, + "step": 15887 + }, + { + "epoch": 
2.0211169062460246, + "ewc_loss": 0.00827108696103096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.271086699096486e-05, + "grad_norm": 4.006692409515381, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8821415901184082, + "num_tokens": 606115898.0, + "step": 15888 + }, + { + "epoch": 2.021244116524615, + "ewc_loss": 0.008218711242079735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.218711445806548e-05, + "grad_norm": 4.073919773101807, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8874076008796692, + "num_tokens": 606150231.0, + "step": 15889 + }, + { + "epoch": 2.0213713268032056, + "ewc_loss": 0.008286360651254654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286361116915941e-05, + "grad_norm": 4.037095546722412, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8764347434043884, + "num_tokens": 606191527.0, + "step": 15890 + }, + { + "epoch": 2.021498537081796, + "ewc_loss": 0.008247701451182365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.2477017713245e-05, + "grad_norm": 4.08252477645874, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.877771258354187, + "num_tokens": 606229070.0, + "step": 15891 + }, + { + "epoch": 2.0216257473603867, + "ewc_loss": 0.00826975330710411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.269753743661568e-05, + "grad_norm": 3.996239423751831, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8838843703269958, + "num_tokens": 606269555.0, + "step": 15892 + }, + { + "epoch": 2.021752957638977, + "ewc_loss": 0.008220141753554344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.220141899073496e-05, + "grad_norm": 4.115778923034668, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8772596120834351, + "num_tokens": 606302040.0, + "step": 15893 + }, + { + "epoch": 2.0218801679175677, + "ewc_loss": 0.008339248597621918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.339248597621918e-05, + "grad_norm": 4.018733978271484, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8904525637626648, + "num_tokens": 606341769.0, + "step": 15894 + }, + { + "epoch": 2.0220073781961583, + "ewc_loss": 0.008215648122131824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.215648267650977e-05, + "grad_norm": 4.0551228523254395, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8693463802337646, + "num_tokens": 606379989.0, + "step": 15895 + }, + { + "epoch": 2.022134588474749, + "ewc_loss": 0.00828527845442295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.285278454422951e-05, + "grad_norm": 4.088700294494629, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8779573440551758, + "num_tokens": 606414396.0, + "step": 15896 + }, + { + "epoch": 2.0222617987533393, + "ewc_loss": 0.008291183970868587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.291184349218383e-05, + "grad_norm": 4.113173007965088, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8869307041168213, + "num_tokens": 606445949.0, + "step": 15897 + }, + { + "epoch": 2.02238900903193, + "ewc_loss": 0.008275246247649193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.275246364064515e-05, + "grad_norm": 4.014108180999756, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8843037486076355, + "num_tokens": 606484751.0, + "step": 15898 + }, + { + "epoch": 2.0225162193105204, + "ewc_loss": 0.008223800919950008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.223800978157669e-05, + "grad_norm": 4.020345211029053, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8857220411300659, + "num_tokens": 606525390.0, + "step": 15899 + }, + { + "epoch": 2.022643429589111, + "ewc_loss": 0.008264975622296333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.264975622296333e-05, + "grad_norm": 4.070933818817139, + "learning_rate": 1e-06, + "loss": 0.3461, + 
"mean_token_accuracy": 0.8789499402046204, + "num_tokens": 606563364.0, + "step": 15900 + }, + { + "epoch": 2.0227706398677014, + "ewc_loss": 0.008293245919048786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.293246355606243e-05, + "grad_norm": 4.106508731842041, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8850040435791016, + "num_tokens": 606598140.0, + "step": 15901 + }, + { + "epoch": 2.022897850146292, + "ewc_loss": 0.008279143832623959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.279144094558433e-05, + "grad_norm": 4.0318708419799805, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8765039443969727, + "num_tokens": 606636480.0, + "step": 15902 + }, + { + "epoch": 2.0230250604248825, + "ewc_loss": 0.008250116370618343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.250115934060887e-05, + "grad_norm": 4.0568928718566895, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8867960572242737, + "num_tokens": 606676044.0, + "step": 15903 + }, + { + "epoch": 2.023152270703473, + "ewc_loss": 0.008298799395561218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298799366457388e-05, + "grad_norm": 4.076098442077637, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8761259317398071, + "num_tokens": 606713262.0, + "step": 15904 + }, + { + "epoch": 2.0232794809820636, + "ewc_loss": 0.008270451799035072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.270452235592529e-05, + "grad_norm": 4.026071071624756, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8800691962242126, + "num_tokens": 606754844.0, + "step": 15905 + }, + { + "epoch": 2.0234066912606536, + "ewc_loss": 0.008258205838501453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.258206071332097e-05, + "grad_norm": 3.9933969974517822, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8825997710227966, + "num_tokens": 606800910.0, + "step": 15906 + }, + { + "epoch": 
2.023533901539244, + "ewc_loss": 0.008266863413155079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.266863733297214e-05, + "grad_norm": 4.049327850341797, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.878389835357666, + "num_tokens": 606841855.0, + "step": 15907 + }, + { + "epoch": 2.0236611118178347, + "ewc_loss": 0.008290269412100315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29026976134628e-05, + "grad_norm": 4.031383991241455, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8741218447685242, + "num_tokens": 606888261.0, + "step": 15908 + }, + { + "epoch": 2.0237883220964252, + "ewc_loss": 0.008264582604169846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.264582720585167e-05, + "grad_norm": 4.109389305114746, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8732315301895142, + "num_tokens": 606926235.0, + "step": 15909 + }, + { + "epoch": 2.0239155323750158, + "ewc_loss": 0.00830098520964384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.300985064124689e-05, + "grad_norm": 4.043334484100342, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.87770015001297, + "num_tokens": 606962527.0, + "step": 15910 + }, + { + "epoch": 2.0240427426536063, + "ewc_loss": 0.008249307051301003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.249306847574189e-05, + "grad_norm": 4.005990982055664, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8862875699996948, + "num_tokens": 607008924.0, + "step": 15911 + }, + { + "epoch": 2.024169952932197, + "ewc_loss": 0.008240197785198689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.240198076236993e-05, + "grad_norm": 3.9915637969970703, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8792136907577515, + "num_tokens": 607049788.0, + "step": 15912 + }, + { + "epoch": 2.0242971632107873, + "ewc_loss": 0.00826219655573368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.262196934083477e-05, + "grad_norm": 4.0882887840271, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8750618100166321, + "num_tokens": 607087592.0, + "step": 15913 + }, + { + "epoch": 2.024424373489378, + "ewc_loss": 0.00829815212637186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298152533825487e-05, + "grad_norm": 4.048069000244141, + "learning_rate": 1e-06, + "loss": 0.288, + "mean_token_accuracy": 0.898400604724884, + "num_tokens": 607119954.0, + "step": 15914 + }, + { + "epoch": 2.0245515837679684, + "ewc_loss": 0.008244218304753304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.24421804281883e-05, + "grad_norm": 4.11668062210083, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8733856678009033, + "num_tokens": 607156511.0, + "step": 15915 + }, + { + "epoch": 2.024678794046559, + "ewc_loss": 0.008295867592096329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.295867883134633e-05, + "grad_norm": 3.9914493560791016, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8915858268737793, + "num_tokens": 607198292.0, + "step": 15916 + }, + { + "epoch": 2.0248060043251495, + "ewc_loss": 0.00820854865014553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.208549115806818e-05, + "grad_norm": 4.0544562339782715, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8791323304176331, + "num_tokens": 607237572.0, + "step": 15917 + }, + { + "epoch": 2.02493321460374, + "ewc_loss": 0.00829384382814169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.293843711726367e-05, + "grad_norm": 4.0062479972839355, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8742605447769165, + "num_tokens": 607282321.0, + "step": 15918 + }, + { + "epoch": 2.0250604248823305, + "ewc_loss": 0.008238143287599087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.238143345806748e-05, + "grad_norm": 4.150613307952881, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 
0.8823790550231934, + "num_tokens": 607313355.0, + "step": 15919 + }, + { + "epoch": 2.025187635160921, + "ewc_loss": 0.008341196924448013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.341197099070996e-05, + "grad_norm": 4.064725399017334, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8808594346046448, + "num_tokens": 607351961.0, + "step": 15920 + }, + { + "epoch": 2.0253148454395116, + "ewc_loss": 0.008229161612689495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.229161176132038e-05, + "grad_norm": 4.026336669921875, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8852217197418213, + "num_tokens": 607391368.0, + "step": 15921 + }, + { + "epoch": 2.025442055718102, + "ewc_loss": 0.00825208518654108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.252084808191285e-05, + "grad_norm": 3.9943809509277344, + "learning_rate": 1e-06, + "loss": 0.2836, + "mean_token_accuracy": 0.8997551798820496, + "num_tokens": 607427328.0, + "step": 15922 + }, + { + "epoch": 2.0255692659966926, + "ewc_loss": 0.008242875337600708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.242875628639013e-05, + "grad_norm": 4.048627853393555, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8782246708869934, + "num_tokens": 607468243.0, + "step": 15923 + }, + { + "epoch": 2.025696476275283, + "ewc_loss": 0.008291330188512802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.291330595966429e-05, + "grad_norm": 4.098841190338135, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8675782084465027, + "num_tokens": 607502466.0, + "step": 15924 + }, + { + "epoch": 2.0258236865538737, + "ewc_loss": 0.008298187516629696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298187458422035e-05, + "grad_norm": 4.056830883026123, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8740029335021973, + "num_tokens": 607544093.0, + "step": 15925 + }, + { + "epoch": 2.025950896832464, + "ewc_loss": 
0.00825388915836811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.253889245679602e-05, + "grad_norm": 4.065563201904297, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8875196576118469, + "num_tokens": 607576054.0, + "step": 15926 + }, + { + "epoch": 2.0260781071110547, + "ewc_loss": 0.008303429000079632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303429058287293e-05, + "grad_norm": 4.085980415344238, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8942227959632874, + "num_tokens": 607608462.0, + "step": 15927 + }, + { + "epoch": 2.0262053173896453, + "ewc_loss": 0.008304865099489689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304865332320333e-05, + "grad_norm": 4.071745872497559, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8772563934326172, + "num_tokens": 607645272.0, + "step": 15928 + }, + { + "epoch": 2.026332527668236, + "ewc_loss": 0.00830990169197321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30990175018087e-05, + "grad_norm": 4.052322864532471, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8841720819473267, + "num_tokens": 607682585.0, + "step": 15929 + }, + { + "epoch": 2.0264597379468263, + "ewc_loss": 0.008298140950500965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298140892293304e-05, + "grad_norm": 3.9916725158691406, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.8988116383552551, + "num_tokens": 607721302.0, + "step": 15930 + }, + { + "epoch": 2.0265869482254164, + "ewc_loss": 0.008268692530691624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.26869290904142e-05, + "grad_norm": 4.062344551086426, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8800164461135864, + "num_tokens": 607758357.0, + "step": 15931 + }, + { + "epoch": 2.026714158504007, + "ewc_loss": 0.008343872614204884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343872468685731e-05, + "grad_norm": 
4.021168231964111, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8813768625259399, + "num_tokens": 607795467.0, + "step": 15932 + }, + { + "epoch": 2.0268413687825975, + "ewc_loss": 0.008269389159977436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.269389218185097e-05, + "grad_norm": 4.037272930145264, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8765631914138794, + "num_tokens": 607833192.0, + "step": 15933 + }, + { + "epoch": 2.026968579061188, + "ewc_loss": 0.008317137137055397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317136962432414e-05, + "grad_norm": 4.102331638336182, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8909408450126648, + "num_tokens": 607866672.0, + "step": 15934 + }, + { + "epoch": 2.0270957893397785, + "ewc_loss": 0.008348286151885986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348286064574495e-05, + "grad_norm": 4.008803844451904, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8827658891677856, + "num_tokens": 607907976.0, + "step": 15935 + }, + { + "epoch": 2.027222999618369, + "ewc_loss": 0.008270426653325558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.27042676974088e-05, + "grad_norm": 4.0490875244140625, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8866914510726929, + "num_tokens": 607946009.0, + "step": 15936 + }, + { + "epoch": 2.0273502098969596, + "ewc_loss": 0.00834396481513977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343964873347431e-05, + "grad_norm": 4.047236919403076, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8868698477745056, + "num_tokens": 607986800.0, + "step": 15937 + }, + { + "epoch": 2.02747742017555, + "ewc_loss": 0.008309704251587391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.309703844133765e-05, + "grad_norm": 4.104033946990967, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8779523372650146, + 
"num_tokens": 608021807.0, + "step": 15938 + }, + { + "epoch": 2.0276046304541406, + "ewc_loss": 0.008328864350914955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328864350914955e-05, + "grad_norm": 4.062243938446045, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.881604790687561, + "num_tokens": 608060294.0, + "step": 15939 + }, + { + "epoch": 2.027731840732731, + "ewc_loss": 0.00829547643661499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29547643661499e-05, + "grad_norm": 4.043710231781006, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8815789818763733, + "num_tokens": 608100790.0, + "step": 15940 + }, + { + "epoch": 2.0278590510113217, + "ewc_loss": 0.008275154046714306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.275153959402815e-05, + "grad_norm": 4.036818981170654, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8796442747116089, + "num_tokens": 608142164.0, + "step": 15941 + }, + { + "epoch": 2.027986261289912, + "ewc_loss": 0.008287276141345501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.287276432383806e-05, + "grad_norm": 4.032926082611084, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8835262060165405, + "num_tokens": 608184175.0, + "step": 15942 + }, + { + "epoch": 2.0281134715685027, + "ewc_loss": 0.008280719630420208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.280719339381903e-05, + "grad_norm": 4.050482273101807, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8772112131118774, + "num_tokens": 608226801.0, + "step": 15943 + }, + { + "epoch": 2.0282406818470933, + "ewc_loss": 0.00826120562851429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.261205221060663e-05, + "grad_norm": 4.085897445678711, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8901656866073608, + "num_tokens": 608265155.0, + "step": 15944 + }, + { + "epoch": 2.028367892125684, + "ewc_loss": 0.008264318108558655, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.264317875728011e-05, + "grad_norm": 4.020410060882568, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.892092227935791, + "num_tokens": 608303817.0, + "step": 15945 + }, + { + "epoch": 2.0284951024042743, + "ewc_loss": 0.00822385959327221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.223859185818583e-05, + "grad_norm": 4.040343284606934, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8892190456390381, + "num_tokens": 608338845.0, + "step": 15946 + }, + { + "epoch": 2.028622312682865, + "ewc_loss": 0.00825074128806591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.250741666415706e-05, + "grad_norm": 4.049978256225586, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8834132552146912, + "num_tokens": 608377661.0, + "step": 15947 + }, + { + "epoch": 2.0287495229614554, + "ewc_loss": 0.008240308612585068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.240308670792729e-05, + "grad_norm": 4.087530136108398, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8828067779541016, + "num_tokens": 608412897.0, + "step": 15948 + }, + { + "epoch": 2.028876733240046, + "ewc_loss": 0.00825146958231926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.251469262177125e-05, + "grad_norm": 4.028672695159912, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8736411333084106, + "num_tokens": 608453584.0, + "step": 15949 + }, + { + "epoch": 2.0290039435186364, + "ewc_loss": 0.008211934939026833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.211934618884698e-05, + "grad_norm": 4.110904216766357, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.879890501499176, + "num_tokens": 608492211.0, + "step": 15950 + }, + { + "epoch": 2.029131153797227, + "ewc_loss": 0.008282328955829144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.282328781206161e-05, + "grad_norm": 4.021576881408691, + "learning_rate": 
1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8794974684715271, + "num_tokens": 608534317.0, + "step": 15951 + }, + { + "epoch": 2.0292583640758175, + "ewc_loss": 0.008199773728847504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.199773583328351e-05, + "grad_norm": 4.028791427612305, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8937107920646667, + "num_tokens": 608569763.0, + "step": 15952 + }, + { + "epoch": 2.029385574354408, + "ewc_loss": 0.00825513992458582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.25513998279348e-05, + "grad_norm": 4.000080585479736, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8844853639602661, + "num_tokens": 608612145.0, + "step": 15953 + }, + { + "epoch": 2.0295127846329986, + "ewc_loss": 0.008215520530939102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.215520938392729e-05, + "grad_norm": 4.057242393493652, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8828041553497314, + "num_tokens": 608649528.0, + "step": 15954 + }, + { + "epoch": 2.0296399949115886, + "ewc_loss": 0.008257048204541206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.25704846647568e-05, + "grad_norm": 4.063316345214844, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8771077394485474, + "num_tokens": 608686973.0, + "step": 15955 + }, + { + "epoch": 2.029767205190179, + "ewc_loss": 0.008234094828367233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.234095002990216e-05, + "grad_norm": 4.05305814743042, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8845111727714539, + "num_tokens": 608729169.0, + "step": 15956 + }, + { + "epoch": 2.0298944154687697, + "ewc_loss": 0.008222722448408604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.222722681239247e-05, + "grad_norm": 4.083358287811279, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8815526962280273, + "num_tokens": 608764502.0, + "step": 15957 + }, + 
{ + "epoch": 2.0300216257473602, + "ewc_loss": 0.008282428607344627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.282428461825475e-05, + "grad_norm": 4.065194606781006, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8836420774459839, + "num_tokens": 608805190.0, + "step": 15958 + }, + { + "epoch": 2.0301488360259508, + "ewc_loss": 0.008260473608970642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.260473259724677e-05, + "grad_norm": 4.09237813949585, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.888107419013977, + "num_tokens": 608840006.0, + "step": 15959 + }, + { + "epoch": 2.0302760463045413, + "ewc_loss": 0.008284218609333038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.284219074994326e-05, + "grad_norm": 4.1120710372924805, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8892645239830017, + "num_tokens": 608871551.0, + "step": 15960 + }, + { + "epoch": 2.030403256583132, + "ewc_loss": 0.008280349895358086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.280349720735103e-05, + "grad_norm": 4.062234401702881, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8617091178894043, + "num_tokens": 608909610.0, + "step": 15961 + }, + { + "epoch": 2.0305304668617223, + "ewc_loss": 0.008244805969297886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.244805940194055e-05, + "grad_norm": 4.0289225578308105, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8780149817466736, + "num_tokens": 608950758.0, + "step": 15962 + }, + { + "epoch": 2.030657677140313, + "ewc_loss": 0.008250756189227104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.250756218330935e-05, + "grad_norm": 4.050760269165039, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8708541393280029, + "num_tokens": 608990695.0, + "step": 15963 + }, + { + "epoch": 2.0307848874189034, + "ewc_loss": 0.008299000561237335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.299000910483301e-05, + "grad_norm": 4.034809589385986, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8961295485496521, + "num_tokens": 609030978.0, + "step": 15964 + }, + { + "epoch": 2.030912097697494, + "ewc_loss": 0.008261227048933506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.261227048933506e-05, + "grad_norm": 4.0175371170043945, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8931602239608765, + "num_tokens": 609071974.0, + "step": 15965 + }, + { + "epoch": 2.0310393079760845, + "ewc_loss": 0.008267709985375404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.267709927167743e-05, + "grad_norm": 4.094836235046387, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8856373429298401, + "num_tokens": 609111268.0, + "step": 15966 + }, + { + "epoch": 2.031166518254675, + "ewc_loss": 0.008312451653182507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31245124572888e-05, + "grad_norm": 4.07936954498291, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8777790069580078, + "num_tokens": 609149681.0, + "step": 15967 + }, + { + "epoch": 2.0312937285332655, + "ewc_loss": 0.00826411135494709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.264111238531768e-05, + "grad_norm": 4.035861015319824, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8880587816238403, + "num_tokens": 609189121.0, + "step": 15968 + }, + { + "epoch": 2.031420938811856, + "ewc_loss": 0.008235710673034191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.235710993176326e-05, + "grad_norm": 4.037200450897217, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8920732140541077, + "num_tokens": 609228070.0, + "step": 15969 + }, + { + "epoch": 2.0315481490904466, + "ewc_loss": 0.00826873630285263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.268736564787105e-05, + "grad_norm": 4.023420810699463, + "learning_rate": 1e-06, + "loss": 0.3302, + 
"mean_token_accuracy": 0.8852347135543823, + "num_tokens": 609269718.0, + "step": 15970 + }, + { + "epoch": 2.031675359369037, + "ewc_loss": 0.00824199989438057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.242000330938026e-05, + "grad_norm": 4.0525946617126465, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8868854641914368, + "num_tokens": 609307236.0, + "step": 15971 + }, + { + "epoch": 2.0318025696476276, + "ewc_loss": 0.008258099667727947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.25809984235093e-05, + "grad_norm": 4.105736255645752, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8670660853385925, + "num_tokens": 609342993.0, + "step": 15972 + }, + { + "epoch": 2.031929779926218, + "ewc_loss": 0.008266936987638474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.266937220469117e-05, + "grad_norm": 4.082075119018555, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8900474309921265, + "num_tokens": 609376817.0, + "step": 15973 + }, + { + "epoch": 2.0320569902048087, + "ewc_loss": 0.008245396427810192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.245396747952327e-05, + "grad_norm": 4.101292133331299, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8815314769744873, + "num_tokens": 609411209.0, + "step": 15974 + }, + { + "epoch": 2.032184200483399, + "ewc_loss": 0.008282873779535294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.282873750431463e-05, + "grad_norm": 4.029023170471191, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.9000805616378784, + "num_tokens": 609449055.0, + "step": 15975 + }, + { + "epoch": 2.0323114107619897, + "ewc_loss": 0.008235642686486244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.235642599174753e-05, + "grad_norm": 4.075867652893066, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8826766014099121, + "num_tokens": 609486332.0, + "step": 15976 + }, + { + "epoch": 
2.0324386210405803, + "ewc_loss": 0.008275584317743778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.275584696093574e-05, + "grad_norm": 4.063215255737305, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.870397686958313, + "num_tokens": 609526924.0, + "step": 15977 + }, + { + "epoch": 2.032565831319171, + "ewc_loss": 0.008249145932495594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.249146048910916e-05, + "grad_norm": 4.1143717765808105, + "learning_rate": 1e-06, + "loss": 0.296, + "mean_token_accuracy": 0.8967329263687134, + "num_tokens": 609559542.0, + "step": 15978 + }, + { + "epoch": 2.032693041597761, + "ewc_loss": 0.008281705901026726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.281705959234387e-05, + "grad_norm": 4.020124435424805, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8762803077697754, + "num_tokens": 609598989.0, + "step": 15979 + }, + { + "epoch": 2.0328202518763514, + "ewc_loss": 0.00821923092007637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.219230949180201e-05, + "grad_norm": 4.105925559997559, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8704559803009033, + "num_tokens": 609635790.0, + "step": 15980 + }, + { + "epoch": 2.032947462154942, + "ewc_loss": 0.00830141268670559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.301412890432402e-05, + "grad_norm": 4.102252006530762, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8741699457168579, + "num_tokens": 609670120.0, + "step": 15981 + }, + { + "epoch": 2.0330746724335325, + "ewc_loss": 0.0082846125587821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.284612704301253e-05, + "grad_norm": 4.010849952697754, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8820636868476868, + "num_tokens": 609713260.0, + "step": 15982 + }, + { + "epoch": 2.033201882712123, + "ewc_loss": 0.008223275654017925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.223275654017925e-05, + 
"grad_norm": 4.034677505493164, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8904744982719421, + "num_tokens": 609753236.0, + "step": 15983 + }, + { + "epoch": 2.0333290929907135, + "ewc_loss": 0.008294539526104927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.294539293274283e-05, + "grad_norm": 4.080183982849121, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8842658996582031, + "num_tokens": 609791629.0, + "step": 15984 + }, + { + "epoch": 2.033456303269304, + "ewc_loss": 0.008292109705507755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.292109851026908e-05, + "grad_norm": 4.059296131134033, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8680644035339355, + "num_tokens": 609833457.0, + "step": 15985 + }, + { + "epoch": 2.0335835135478946, + "ewc_loss": 0.008264814503490925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.264814096037298e-05, + "grad_norm": 4.135594844818115, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8750182390213013, + "num_tokens": 609866007.0, + "step": 15986 + }, + { + "epoch": 2.033710723826485, + "ewc_loss": 0.008307628333568573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3076287410222e-05, + "grad_norm": 4.020341873168945, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8874955177307129, + "num_tokens": 609903346.0, + "step": 15987 + }, + { + "epoch": 2.0338379341050756, + "ewc_loss": 0.008227827958762646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.22782822069712e-05, + "grad_norm": 4.1072306632995605, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8865070343017578, + "num_tokens": 609935885.0, + "step": 15988 + }, + { + "epoch": 2.033965144383666, + "ewc_loss": 0.008316989056766033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316989260492846e-05, + "grad_norm": 4.11041259765625, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8766586184501648, 
+ "num_tokens": 609971127.0, + "step": 15989 + }, + { + "epoch": 2.0340923546622567, + "ewc_loss": 0.00828627124428749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286270895041525e-05, + "grad_norm": 3.994724988937378, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8757632970809937, + "num_tokens": 610016719.0, + "step": 15990 + }, + { + "epoch": 2.034219564940847, + "ewc_loss": 0.008235144428908825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.235144196078181e-05, + "grad_norm": 4.073225498199463, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8728724718093872, + "num_tokens": 610057002.0, + "step": 15991 + }, + { + "epoch": 2.0343467752194377, + "ewc_loss": 0.008314164355397224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314164006151259e-05, + "grad_norm": 4.080498695373535, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8869594931602478, + "num_tokens": 610092779.0, + "step": 15992 + }, + { + "epoch": 2.0344739854980283, + "ewc_loss": 0.008274070918560028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.274070569314063e-05, + "grad_norm": 4.049131870269775, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8739776015281677, + "num_tokens": 610137092.0, + "step": 15993 + }, + { + "epoch": 2.034601195776619, + "ewc_loss": 0.008272862061858177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.272862032754347e-05, + "grad_norm": 4.005372047424316, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8901219964027405, + "num_tokens": 610178632.0, + "step": 15994 + }, + { + "epoch": 2.0347284060552093, + "ewc_loss": 0.008256391622126102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.25639144750312e-05, + "grad_norm": 4.056047439575195, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8828901648521423, + "num_tokens": 610219789.0, + "step": 15995 + }, + { + "epoch": 2.0348556163338, + "ewc_loss": 0.008294458501040936, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.294458530144766e-05, + "grad_norm": 4.061206340789795, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8697543740272522, + "num_tokens": 610259104.0, + "step": 15996 + }, + { + "epoch": 2.0349828266123904, + "ewc_loss": 0.008259299211204052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.259299647761509e-05, + "grad_norm": 4.109074592590332, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8756621479988098, + "num_tokens": 610296317.0, + "step": 15997 + }, + { + "epoch": 2.035110036890981, + "ewc_loss": 0.00827618595212698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.276185690192506e-05, + "grad_norm": 4.050848484039307, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8798018097877502, + "num_tokens": 610339300.0, + "step": 15998 + }, + { + "epoch": 2.0352372471695714, + "ewc_loss": 0.008250324055552483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.250324026448652e-05, + "grad_norm": 4.115576267242432, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8793673515319824, + "num_tokens": 610373917.0, + "step": 15999 + }, + { + "epoch": 2.035364457448162, + "ewc_loss": 0.008300228044390678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.300227636937052e-05, + "grad_norm": 4.003235340118408, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8823361396789551, + "num_tokens": 610419571.0, + "step": 16000 + }, + { + "epoch": 2.0354916677267525, + "ewc_loss": 0.008222547359764576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.222547330660746e-05, + "grad_norm": 4.071086406707764, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8728909492492676, + "num_tokens": 610458073.0, + "step": 16001 + }, + { + "epoch": 2.035618878005343, + "ewc_loss": 0.008298109285533428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298109605675563e-05, + "grad_norm": 4.093547344207764, + 
"learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8755900859832764, + "num_tokens": 610497223.0, + "step": 16002 + }, + { + "epoch": 2.0357460882839336, + "ewc_loss": 0.008286620490252972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286620141007006e-05, + "grad_norm": 4.1424641609191895, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8631888628005981, + "num_tokens": 610532563.0, + "step": 16003 + }, + { + "epoch": 2.0358732985625236, + "ewc_loss": 0.008290223777294159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.290223922813311e-05, + "grad_norm": 4.032351016998291, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8939858675003052, + "num_tokens": 610566644.0, + "step": 16004 + }, + { + "epoch": 2.036000508841114, + "ewc_loss": 0.00824867095798254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.24867092887871e-05, + "grad_norm": 4.0928473472595215, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8696926236152649, + "num_tokens": 610604755.0, + "step": 16005 + }, + { + "epoch": 2.0361277191197047, + "ewc_loss": 0.008304739370942116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304739458253607e-05, + "grad_norm": 4.013119220733643, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.886502742767334, + "num_tokens": 610645905.0, + "step": 16006 + }, + { + "epoch": 2.036254929398295, + "ewc_loss": 0.008249989710748196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.249990059994161e-05, + "grad_norm": 4.044589042663574, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.8978887796401978, + "num_tokens": 610686800.0, + "step": 16007 + }, + { + "epoch": 2.0363821396768857, + "ewc_loss": 0.008293419145047665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29341952339746e-05, + "grad_norm": 4.026111602783203, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8825018405914307, + "num_tokens": 610728291.0, + 
"step": 16008 + }, + { + "epoch": 2.0365093499554763, + "ewc_loss": 0.008268959820270538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.26895993668586e-05, + "grad_norm": 4.183539390563965, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8810083866119385, + "num_tokens": 610759818.0, + "step": 16009 + }, + { + "epoch": 2.036636560234067, + "ewc_loss": 0.008382657542824745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382657688343897e-05, + "grad_norm": 4.041396141052246, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8690507411956787, + "num_tokens": 610800125.0, + "step": 16010 + }, + { + "epoch": 2.0367637705126573, + "ewc_loss": 0.008231310173869133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.231310494011268e-05, + "grad_norm": 4.063440322875977, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8749752044677734, + "num_tokens": 610839261.0, + "step": 16011 + }, + { + "epoch": 2.036890980791248, + "ewc_loss": 0.008307382464408875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30738281365484e-05, + "grad_norm": 4.024501800537109, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8867256045341492, + "num_tokens": 610879092.0, + "step": 16012 + }, + { + "epoch": 2.0370181910698384, + "ewc_loss": 0.00827646441757679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.276464359369129e-05, + "grad_norm": 4.0438055992126465, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8802972435951233, + "num_tokens": 610914494.0, + "step": 16013 + }, + { + "epoch": 2.037145401348429, + "ewc_loss": 0.00830813404172659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.308134420076385e-05, + "grad_norm": 4.077317237854004, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8914445638656616, + "num_tokens": 610950784.0, + "step": 16014 + }, + { + "epoch": 2.0372726116270194, + "ewc_loss": 0.008315755985677242, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.315755985677242e-05, + "grad_norm": 4.038732528686523, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8730365037918091, + "num_tokens": 610989254.0, + "step": 16015 + }, + { + "epoch": 2.03739982190561, + "ewc_loss": 0.008304455317556858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304455695906654e-05, + "grad_norm": 4.111659049987793, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8759880065917969, + "num_tokens": 611022668.0, + "step": 16016 + }, + { + "epoch": 2.0375270321842005, + "ewc_loss": 0.00833580270409584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33580270409584e-05, + "grad_norm": 4.076368808746338, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.881850004196167, + "num_tokens": 611062110.0, + "step": 16017 + }, + { + "epoch": 2.037654242462791, + "ewc_loss": 0.008327525109052658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.327525574713945e-05, + "grad_norm": 4.055459499359131, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8860911726951599, + "num_tokens": 611098663.0, + "step": 16018 + }, + { + "epoch": 2.0377814527413816, + "ewc_loss": 0.008308679796755314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.308679389301687e-05, + "grad_norm": 4.069003582000732, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8795653581619263, + "num_tokens": 611134058.0, + "step": 16019 + }, + { + "epoch": 2.037908663019972, + "ewc_loss": 0.008339610882103443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.339610940311104e-05, + "grad_norm": 4.004184722900391, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8821454048156738, + "num_tokens": 611174748.0, + "step": 16020 + }, + { + "epoch": 2.0380358732985626, + "ewc_loss": 0.00829624105244875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.296241139760241e-05, + "grad_norm": 4.008171558380127, + "learning_rate": 1e-06, + "loss": 0.2957, + 
"mean_token_accuracy": 0.8980569839477539, + "num_tokens": 611216792.0, + "step": 16021 + }, + { + "epoch": 2.038163083577153, + "ewc_loss": 0.008330798707902431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330798300448805e-05, + "grad_norm": 4.058289527893066, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8911498785018921, + "num_tokens": 611253996.0, + "step": 16022 + }, + { + "epoch": 2.0382902938557437, + "ewc_loss": 0.008344889618456364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344889647560194e-05, + "grad_norm": 4.011581897735596, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8910437822341919, + "num_tokens": 611295351.0, + "step": 16023 + }, + { + "epoch": 2.038417504134334, + "ewc_loss": 0.00829209852963686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.292098209494725e-05, + "grad_norm": 4.095170974731445, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8824125528335571, + "num_tokens": 611330513.0, + "step": 16024 + }, + { + "epoch": 2.0385447144129247, + "ewc_loss": 0.008356689475476742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356689795618877e-05, + "grad_norm": 4.035314083099365, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.890265941619873, + "num_tokens": 611369679.0, + "step": 16025 + }, + { + "epoch": 2.0386719246915153, + "ewc_loss": 0.008265811018645763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.265810902230442e-05, + "grad_norm": 4.0428643226623535, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.89019376039505, + "num_tokens": 611410398.0, + "step": 16026 + }, + { + "epoch": 2.038799134970106, + "ewc_loss": 0.008295884355902672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.295883890241385e-05, + "grad_norm": 4.075239658355713, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8739655017852783, + "num_tokens": 611447151.0, + "step": 16027 + }, + { + "epoch": 
2.0389263452486963, + "ewc_loss": 0.00831117294728756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.311172859976068e-05, + "grad_norm": 4.083492279052734, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8756657838821411, + "num_tokens": 611486923.0, + "step": 16028 + }, + { + "epoch": 2.0390535555272864, + "ewc_loss": 0.008279303088784218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.279302710434422e-05, + "grad_norm": 4.092019557952881, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8886441588401794, + "num_tokens": 611519502.0, + "step": 16029 + }, + { + "epoch": 2.039180765805877, + "ewc_loss": 0.00828513316810131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.285132935270667e-05, + "grad_norm": 4.076857566833496, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8692867755889893, + "num_tokens": 611557647.0, + "step": 16030 + }, + { + "epoch": 2.0393079760844675, + "ewc_loss": 0.00829970370978117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299703767988831e-05, + "grad_norm": 4.049328327178955, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8798161745071411, + "num_tokens": 611597405.0, + "step": 16031 + }, + { + "epoch": 2.039435186363058, + "ewc_loss": 0.008275764063000679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.275763684650883e-05, + "grad_norm": 4.058560848236084, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8770551085472107, + "num_tokens": 611635996.0, + "step": 16032 + }, + { + "epoch": 2.0395623966416485, + "ewc_loss": 0.008286370895802975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286371303256601e-05, + "grad_norm": 4.036507606506348, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8894485831260681, + "num_tokens": 611677900.0, + "step": 16033 + }, + { + "epoch": 2.039689606920239, + "ewc_loss": 0.008259106427431107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.259106107288972e-05, + "grad_norm": 4.0221638679504395, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8860465288162231, + "num_tokens": 611719184.0, + "step": 16034 + }, + { + "epoch": 2.0398168171988296, + "ewc_loss": 0.008266041986644268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.266041550086811e-05, + "grad_norm": 4.050866603851318, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8808097243309021, + "num_tokens": 611761270.0, + "step": 16035 + }, + { + "epoch": 2.03994402747742, + "ewc_loss": 0.00829634815454483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29634809633717e-05, + "grad_norm": 4.116642475128174, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8824371099472046, + "num_tokens": 611794939.0, + "step": 16036 + }, + { + "epoch": 2.0400712377560106, + "ewc_loss": 0.008311199955642223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31119978101924e-05, + "grad_norm": 4.004868984222412, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8906921744346619, + "num_tokens": 611834093.0, + "step": 16037 + }, + { + "epoch": 2.040198448034601, + "ewc_loss": 0.008234597742557526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.234597771661356e-05, + "grad_norm": 4.070590019226074, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8774546384811401, + "num_tokens": 611871327.0, + "step": 16038 + }, + { + "epoch": 2.0403256583131917, + "ewc_loss": 0.008316493593156338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31649376777932e-05, + "grad_norm": 4.089387893676758, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8769441843032837, + "num_tokens": 611907837.0, + "step": 16039 + }, + { + "epoch": 2.040452868591782, + "ewc_loss": 0.008291603066027164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.291602716781199e-05, + "grad_norm": 4.034366607666016, + "learning_rate": 1e-06, + "loss": 0.3904, + 
"mean_token_accuracy": 0.8660455942153931, + "num_tokens": 611950964.0, + "step": 16040 + }, + { + "epoch": 2.0405800788703727, + "ewc_loss": 0.008249134756624699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.249134407378733e-05, + "grad_norm": 4.086808681488037, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8792715072631836, + "num_tokens": 611985867.0, + "step": 16041 + }, + { + "epoch": 2.0407072891489633, + "ewc_loss": 0.008314928039908409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314927981700748e-05, + "grad_norm": 4.114161014556885, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.886621356010437, + "num_tokens": 612017819.0, + "step": 16042 + }, + { + "epoch": 2.040834499427554, + "ewc_loss": 0.008318410255014896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318410255014896e-05, + "grad_norm": 4.027096748352051, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8675850629806519, + "num_tokens": 612060669.0, + "step": 16043 + }, + { + "epoch": 2.0409617097061443, + "ewc_loss": 0.008263183757662773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.26318355393596e-05, + "grad_norm": 4.097650527954102, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8689724206924438, + "num_tokens": 612099801.0, + "step": 16044 + }, + { + "epoch": 2.041088919984735, + "ewc_loss": 0.00834400113672018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344001253135502e-05, + "grad_norm": 4.043432235717773, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8853452801704407, + "num_tokens": 612139454.0, + "step": 16045 + }, + { + "epoch": 2.0412161302633254, + "ewc_loss": 0.00826268084347248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.26268078526482e-05, + "grad_norm": 4.0570292472839355, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8641827702522278, + "num_tokens": 612181695.0, + "step": 16046 + }, + { + "epoch": 
2.041343340541916, + "ewc_loss": 0.008294674567878246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.294674626085907e-05, + "grad_norm": 3.9893956184387207, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.895180881023407, + "num_tokens": 612223200.0, + "step": 16047 + }, + { + "epoch": 2.0414705508205064, + "ewc_loss": 0.008252467028796673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.252466795966029e-05, + "grad_norm": 4.058560371398926, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8833091855049133, + "num_tokens": 612265946.0, + "step": 16048 + }, + { + "epoch": 2.041597761099097, + "ewc_loss": 0.008305663242936134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.305663504870608e-05, + "grad_norm": 4.044573783874512, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8971278667449951, + "num_tokens": 612301246.0, + "step": 16049 + }, + { + "epoch": 2.0417249713776875, + "ewc_loss": 0.008293822407722473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.293822611449286e-05, + "grad_norm": 4.110193252563477, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8685813546180725, + "num_tokens": 612337789.0, + "step": 16050 + }, + { + "epoch": 2.041852181656278, + "ewc_loss": 0.008318295702338219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318296022480354e-05, + "grad_norm": 4.022305965423584, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.895948588848114, + "num_tokens": 612372391.0, + "step": 16051 + }, + { + "epoch": 2.0419793919348685, + "ewc_loss": 0.008275588043034077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.27558760647662e-05, + "grad_norm": 4.080162525177002, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8872634172439575, + "num_tokens": 612408806.0, + "step": 16052 + }, + { + "epoch": 2.0421066022134586, + "ewc_loss": 0.008330409415066242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.330409036716446e-05, + "grad_norm": 4.030887603759766, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8919732570648193, + "num_tokens": 612446349.0, + "step": 16053 + }, + { + "epoch": 2.042233812492049, + "ewc_loss": 0.008306373842060566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.306373638333753e-05, + "grad_norm": 4.016165256500244, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8951048254966736, + "num_tokens": 612489899.0, + "step": 16054 + }, + { + "epoch": 2.0423610227706397, + "ewc_loss": 0.0082970280200243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.297027670778334e-05, + "grad_norm": 4.052384376525879, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8771458864212036, + "num_tokens": 612532314.0, + "step": 16055 + }, + { + "epoch": 2.04248823304923, + "ewc_loss": 0.008319994434714317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.319994230987504e-05, + "grad_norm": 4.076302528381348, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8905096650123596, + "num_tokens": 612568033.0, + "step": 16056 + }, + { + "epoch": 2.0426154433278207, + "ewc_loss": 0.008308256044983864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.308255928568542e-05, + "grad_norm": 4.076119899749756, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8911218047142029, + "num_tokens": 612605160.0, + "step": 16057 + }, + { + "epoch": 2.0427426536064113, + "ewc_loss": 0.008288496173918247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.288496610475704e-05, + "grad_norm": 4.02006721496582, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8710634112358093, + "num_tokens": 612647851.0, + "step": 16058 + }, + { + "epoch": 2.042869863885002, + "ewc_loss": 0.008248995058238506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.248994708992541e-05, + "grad_norm": 4.092963695526123, + "learning_rate": 1e-06, + "loss": 0.3343, + 
"mean_token_accuracy": 0.8825122117996216, + "num_tokens": 612684770.0, + "step": 16059 + }, + { + "epoch": 2.0429970741635923, + "ewc_loss": 0.00830454658716917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304546645376831e-05, + "grad_norm": 4.082996845245361, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8716088533401489, + "num_tokens": 612721689.0, + "step": 16060 + }, + { + "epoch": 2.043124284442183, + "ewc_loss": 0.008284831419587135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28483171062544e-05, + "grad_norm": 4.089595794677734, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8816414475440979, + "num_tokens": 612757256.0, + "step": 16061 + }, + { + "epoch": 2.0432514947207734, + "ewc_loss": 0.008291926234960556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.291925769299269e-05, + "grad_norm": 4.0215606689453125, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8896591067314148, + "num_tokens": 612797806.0, + "step": 16062 + }, + { + "epoch": 2.043378704999364, + "ewc_loss": 0.00825637485831976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.256374712800607e-05, + "grad_norm": 4.058984756469727, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8933846950531006, + "num_tokens": 612833507.0, + "step": 16063 + }, + { + "epoch": 2.0435059152779544, + "ewc_loss": 0.008303619921207428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303619688376784e-05, + "grad_norm": 4.061710834503174, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.889244794845581, + "num_tokens": 612872920.0, + "step": 16064 + }, + { + "epoch": 2.043633125556545, + "ewc_loss": 0.008291789330542088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.291789708891883e-05, + "grad_norm": 4.054809093475342, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8879656791687012, + "num_tokens": 612909534.0, + "step": 16065 + }, + { + "epoch": 
2.0437603358351355, + "ewc_loss": 0.008293353952467442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.293354039778933e-05, + "grad_norm": 4.059390544891357, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8794724941253662, + "num_tokens": 612950531.0, + "step": 16066 + }, + { + "epoch": 2.043887546113726, + "ewc_loss": 0.008277343586087227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.277344022644684e-05, + "grad_norm": 4.093629360198975, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8841953277587891, + "num_tokens": 612984638.0, + "step": 16067 + }, + { + "epoch": 2.0440147563923166, + "ewc_loss": 0.008317646570503712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317647007061169e-05, + "grad_norm": 4.074474334716797, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8938868641853333, + "num_tokens": 613020632.0, + "step": 16068 + }, + { + "epoch": 2.044141966670907, + "ewc_loss": 0.00829608365893364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.296083251480013e-05, + "grad_norm": 4.0505805015563965, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8860801458358765, + "num_tokens": 613057381.0, + "step": 16069 + }, + { + "epoch": 2.0442691769494976, + "ewc_loss": 0.008296748623251915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.296749001601711e-05, + "grad_norm": 4.070728778839111, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8830080628395081, + "num_tokens": 613094287.0, + "step": 16070 + }, + { + "epoch": 2.044396387228088, + "ewc_loss": 0.008305653929710388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30565404612571e-05, + "grad_norm": 4.076851844787598, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8887562155723572, + "num_tokens": 613126315.0, + "step": 16071 + }, + { + "epoch": 2.0445235975066787, + "ewc_loss": 0.008325213566422462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.325213275384158e-05, + "grad_norm": 4.1028008460998535, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8780907988548279, + "num_tokens": 613160970.0, + "step": 16072 + }, + { + "epoch": 2.044650807785269, + "ewc_loss": 0.008323757909238338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323758083861321e-05, + "grad_norm": 4.005220890045166, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8847838640213013, + "num_tokens": 613204685.0, + "step": 16073 + }, + { + "epoch": 2.0447780180638597, + "ewc_loss": 0.008264423348009586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.264423377113417e-05, + "grad_norm": 4.042405128479004, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8904911279678345, + "num_tokens": 613242394.0, + "step": 16074 + }, + { + "epoch": 2.0449052283424503, + "ewc_loss": 0.008315067738294601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31506768008694e-05, + "grad_norm": 4.005790710449219, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.890831470489502, + "num_tokens": 613283517.0, + "step": 16075 + }, + { + "epoch": 2.045032438621041, + "ewc_loss": 0.008271930739283562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.271930710179731e-05, + "grad_norm": 4.080749034881592, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8883500099182129, + "num_tokens": 613317237.0, + "step": 16076 + }, + { + "epoch": 2.045159648899631, + "ewc_loss": 0.008324737660586834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32473742775619e-05, + "grad_norm": 4.061166763305664, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8859808444976807, + "num_tokens": 613355741.0, + "step": 16077 + }, + { + "epoch": 2.0452868591782214, + "ewc_loss": 0.008276472799479961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.276473090518266e-05, + "grad_norm": 4.07377815246582, + "learning_rate": 1e-06, + "loss": 0.365, + 
"mean_token_accuracy": 0.8747367858886719, + "num_tokens": 613392131.0, + "step": 16078 + }, + { + "epoch": 2.045414069456812, + "ewc_loss": 0.008310454897582531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.310454722959548e-05, + "grad_norm": 4.052093982696533, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8746448159217834, + "num_tokens": 613433687.0, + "step": 16079 + }, + { + "epoch": 2.0455412797354025, + "ewc_loss": 0.008271153084933758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.271152910310775e-05, + "grad_norm": 4.03236198425293, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.887814998626709, + "num_tokens": 613472882.0, + "step": 16080 + }, + { + "epoch": 2.045668490013993, + "ewc_loss": 0.00829540565609932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.295405859826133e-05, + "grad_norm": 4.020785808563232, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8857073187828064, + "num_tokens": 613515869.0, + "step": 16081 + }, + { + "epoch": 2.0457957002925835, + "ewc_loss": 0.008257078938186169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.25707902549766e-05, + "grad_norm": 4.056205749511719, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8842346668243408, + "num_tokens": 613551835.0, + "step": 16082 + }, + { + "epoch": 2.045922910571174, + "ewc_loss": 0.008300243876874447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.300243644043803e-05, + "grad_norm": 4.042757034301758, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8809714317321777, + "num_tokens": 613592277.0, + "step": 16083 + }, + { + "epoch": 2.0460501208497646, + "ewc_loss": 0.00827484205365181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.274841820821166e-05, + "grad_norm": 4.100614547729492, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8839608430862427, + "num_tokens": 613625594.0, + "step": 16084 + }, + { + "epoch": 
2.046177331128355, + "ewc_loss": 0.008319800719618797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.319800690514967e-05, + "grad_norm": 4.083709239959717, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8848918080329895, + "num_tokens": 613659924.0, + "step": 16085 + }, + { + "epoch": 2.0463045414069456, + "ewc_loss": 0.008288324810564518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28832489787601e-05, + "grad_norm": 4.0991902351379395, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8652411699295044, + "num_tokens": 613697067.0, + "step": 16086 + }, + { + "epoch": 2.046431751685536, + "ewc_loss": 0.00830861460417509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.308614633278921e-05, + "grad_norm": 4.004627704620361, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8896740674972534, + "num_tokens": 613738211.0, + "step": 16087 + }, + { + "epoch": 2.0465589619641267, + "ewc_loss": 0.008240765891969204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.2407656009309e-05, + "grad_norm": 3.9976606369018555, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8873447179794312, + "num_tokens": 613782590.0, + "step": 16088 + }, + { + "epoch": 2.046686172242717, + "ewc_loss": 0.008267015218734741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.267015073215589e-05, + "grad_norm": 4.004513263702393, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8900326490402222, + "num_tokens": 613825770.0, + "step": 16089 + }, + { + "epoch": 2.0468133825213077, + "ewc_loss": 0.008287792094051838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.287792297778651e-05, + "grad_norm": 4.048880577087402, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8904938101768494, + "num_tokens": 613864025.0, + "step": 16090 + }, + { + "epoch": 2.0469405927998983, + "ewc_loss": 0.008308802731335163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.308803080581129e-05, + "grad_norm": 4.071498394012451, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8857083916664124, + "num_tokens": 613907181.0, + "step": 16091 + }, + { + "epoch": 2.047067803078489, + "ewc_loss": 0.00827430933713913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.274309220723808e-05, + "grad_norm": 4.052864074707031, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8742473125457764, + "num_tokens": 613951258.0, + "step": 16092 + }, + { + "epoch": 2.0471950133570793, + "ewc_loss": 0.008260231465101242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.260231697931886e-05, + "grad_norm": 4.035667896270752, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8822740316390991, + "num_tokens": 613991221.0, + "step": 16093 + }, + { + "epoch": 2.04732222363567, + "ewc_loss": 0.008256559260189533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.256559522124007e-05, + "grad_norm": 4.068269729614258, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8919044733047485, + "num_tokens": 614026862.0, + "step": 16094 + }, + { + "epoch": 2.0474494339142604, + "ewc_loss": 0.008256030268967152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.256030560005456e-05, + "grad_norm": 4.071131229400635, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8751052618026733, + "num_tokens": 614067560.0, + "step": 16095 + }, + { + "epoch": 2.047576644192851, + "ewc_loss": 0.008239179849624634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.239179442171007e-05, + "grad_norm": 4.041093349456787, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8878323435783386, + "num_tokens": 614106974.0, + "step": 16096 + }, + { + "epoch": 2.0477038544714414, + "ewc_loss": 0.00820610299706459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.20610293885693e-05, + "grad_norm": 3.990269660949707, + "learning_rate": 1e-06, + "loss": 0.2587, + 
"mean_token_accuracy": 0.9082685708999634, + "num_tokens": 614145641.0, + "step": 16097 + }, + { + "epoch": 2.047831064750032, + "ewc_loss": 0.008190728724002838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.19072884041816e-05, + "grad_norm": 4.068434238433838, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8880367279052734, + "num_tokens": 614181476.0, + "step": 16098 + }, + { + "epoch": 2.0479582750286225, + "ewc_loss": 0.00824713334441185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.247133519034833e-05, + "grad_norm": 4.064768314361572, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8776520490646362, + "num_tokens": 614219212.0, + "step": 16099 + }, + { + "epoch": 2.048085485307213, + "ewc_loss": 0.008211786858737469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.21178691694513e-05, + "grad_norm": 4.036598205566406, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8560885190963745, + "num_tokens": 614264017.0, + "step": 16100 + }, + { + "epoch": 2.0482126955858035, + "ewc_loss": 0.00820617750287056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.206177881220356e-05, + "grad_norm": 4.085479259490967, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8693241477012634, + "num_tokens": 614299031.0, + "step": 16101 + }, + { + "epoch": 2.0483399058643936, + "ewc_loss": 0.008230473846197128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.230473758885637e-05, + "grad_norm": 4.102311134338379, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8873574733734131, + "num_tokens": 614334387.0, + "step": 16102 + }, + { + "epoch": 2.048467116142984, + "ewc_loss": 0.00824093259871006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.240932220360264e-05, + "grad_norm": 4.071444511413574, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8838338851928711, + "num_tokens": 614372438.0, + "step": 16103 + }, + { + "epoch": 
2.0485943264215747, + "ewc_loss": 0.008225341327488422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.225341298384592e-05, + "grad_norm": 4.060284614562988, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8792818784713745, + "num_tokens": 614409976.0, + "step": 16104 + }, + { + "epoch": 2.048721536700165, + "ewc_loss": 0.00823313370347023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.233133848989382e-05, + "grad_norm": 4.0384063720703125, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8936274647712708, + "num_tokens": 614448740.0, + "step": 16105 + }, + { + "epoch": 2.0488487469787557, + "ewc_loss": 0.008231918327510357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.231918036472052e-05, + "grad_norm": 4.072732448577881, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8870809078216553, + "num_tokens": 614484644.0, + "step": 16106 + }, + { + "epoch": 2.0489759572573463, + "ewc_loss": 0.008247848600149155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.247848745668307e-05, + "grad_norm": 4.163253307342529, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8654072284698486, + "num_tokens": 614521210.0, + "step": 16107 + }, + { + "epoch": 2.049103167535937, + "ewc_loss": 0.008299493230879307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299493492813781e-05, + "grad_norm": 4.023784160614014, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.879593014717102, + "num_tokens": 614561783.0, + "step": 16108 + }, + { + "epoch": 2.0492303778145273, + "ewc_loss": 0.008195887319743633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.195887494366616e-05, + "grad_norm": 4.05973482131958, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8831440210342407, + "num_tokens": 614598643.0, + "step": 16109 + }, + { + "epoch": 2.049357588093118, + "ewc_loss": 0.00829814001917839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.298140164697543e-05, + "grad_norm": 4.094641208648682, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8834333419799805, + "num_tokens": 614633520.0, + "step": 16110 + }, + { + "epoch": 2.0494847983717084, + "ewc_loss": 0.00829927995800972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299280307255685e-05, + "grad_norm": 4.025838375091553, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8705841302871704, + "num_tokens": 614673266.0, + "step": 16111 + }, + { + "epoch": 2.049612008650299, + "ewc_loss": 0.00824135821312666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.241358591476455e-05, + "grad_norm": 4.0657958984375, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8738218545913696, + "num_tokens": 614707201.0, + "step": 16112 + }, + { + "epoch": 2.0497392189288894, + "ewc_loss": 0.008331692777574062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331692515639588e-05, + "grad_norm": 4.009680271148682, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8866461515426636, + "num_tokens": 614746542.0, + "step": 16113 + }, + { + "epoch": 2.04986642920748, + "ewc_loss": 0.008277476765215397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.277477172669023e-05, + "grad_norm": 4.0783281326293945, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8714419603347778, + "num_tokens": 614781804.0, + "step": 16114 + }, + { + "epoch": 2.0499936394860705, + "ewc_loss": 0.008360983803868294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360983338207006e-05, + "grad_norm": 4.086523056030273, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8819819688796997, + "num_tokens": 614817281.0, + "step": 16115 + }, + { + "epoch": 2.050120849764661, + "ewc_loss": 0.008331825956702232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331825665663928e-05, + "grad_norm": 4.071247577667236, + "learning_rate": 1e-06, + "loss": 0.3678, + 
"mean_token_accuracy": 0.8740073442459106, + "num_tokens": 614853666.0, + "step": 16116 + }, + { + "epoch": 2.0502480600432516, + "ewc_loss": 0.00832116324454546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321163477376103e-05, + "grad_norm": 4.041691303253174, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8823633790016174, + "num_tokens": 614891221.0, + "step": 16117 + }, + { + "epoch": 2.050375270321842, + "ewc_loss": 0.008322157897055149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.322158100781962e-05, + "grad_norm": 4.044137477874756, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8896394968032837, + "num_tokens": 614925956.0, + "step": 16118 + }, + { + "epoch": 2.0505024806004326, + "ewc_loss": 0.008338768035173416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33876765682362e-05, + "grad_norm": 4.107990741729736, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.888083815574646, + "num_tokens": 614959934.0, + "step": 16119 + }, + { + "epoch": 2.050629690879023, + "ewc_loss": 0.008364601992070675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36460167192854e-05, + "grad_norm": 4.021506309509277, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8804216384887695, + "num_tokens": 615000579.0, + "step": 16120 + }, + { + "epoch": 2.0507569011576137, + "ewc_loss": 0.008307049050927162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.307048847200349e-05, + "grad_norm": 4.101111888885498, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.874878466129303, + "num_tokens": 615037371.0, + "step": 16121 + }, + { + "epoch": 2.050884111436204, + "ewc_loss": 0.008388267830014229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388268179260194e-05, + "grad_norm": 4.049900531768799, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8732185363769531, + "num_tokens": 615075948.0, + "step": 16122 + }, + { + "epoch": 
2.0510113217147947, + "ewc_loss": 0.00834893062710762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348930714419112e-05, + "grad_norm": 4.006044864654541, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8925532102584839, + "num_tokens": 615114619.0, + "step": 16123 + }, + { + "epoch": 2.0511385319933853, + "ewc_loss": 0.008336926810443401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.336926839547232e-05, + "grad_norm": 4.112358570098877, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8834870457649231, + "num_tokens": 615149260.0, + "step": 16124 + }, + { + "epoch": 2.051265742271976, + "ewc_loss": 0.008425360545516014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.425361011177301e-05, + "grad_norm": 4.221960067749023, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8863791227340698, + "num_tokens": 615183284.0, + "step": 16125 + }, + { + "epoch": 2.0513929525505663, + "ewc_loss": 0.008436378091573715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.436378266196698e-05, + "grad_norm": 4.051669597625732, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8878384828567505, + "num_tokens": 615220015.0, + "step": 16126 + }, + { + "epoch": 2.0515201628291564, + "ewc_loss": 0.008306633681058884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.306633390020579e-05, + "grad_norm": 4.072057723999023, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8779928088188171, + "num_tokens": 615255614.0, + "step": 16127 + }, + { + "epoch": 2.051647373107747, + "ewc_loss": 0.00839146226644516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391462324652821e-05, + "grad_norm": 4.093607425689697, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8833639621734619, + "num_tokens": 615290232.0, + "step": 16128 + }, + { + "epoch": 2.0517745833863374, + "ewc_loss": 0.008382334373891354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.382334635825828e-05, + "grad_norm": 4.062069892883301, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8876256346702576, + "num_tokens": 615328204.0, + "step": 16129 + }, + { + "epoch": 2.051901793664928, + "ewc_loss": 0.008363524451851845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363524102605879e-05, + "grad_norm": 4.130558490753174, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8720008730888367, + "num_tokens": 615362846.0, + "step": 16130 + }, + { + "epoch": 2.0520290039435185, + "ewc_loss": 0.008418967947363853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41896835481748e-05, + "grad_norm": 4.052000045776367, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8826582431793213, + "num_tokens": 615405404.0, + "step": 16131 + }, + { + "epoch": 2.052156214222109, + "ewc_loss": 0.008340392261743546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.340392378158867e-05, + "grad_norm": 4.124521255493164, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8724989295005798, + "num_tokens": 615439835.0, + "step": 16132 + }, + { + "epoch": 2.0522834245006996, + "ewc_loss": 0.008425842970609665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.425843407167122e-05, + "grad_norm": 4.012266635894775, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.893696665763855, + "num_tokens": 615477832.0, + "step": 16133 + }, + { + "epoch": 2.05241063477929, + "ewc_loss": 0.008331822231411934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331822027685121e-05, + "grad_norm": 4.088047981262207, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8755571246147156, + "num_tokens": 615518089.0, + "step": 16134 + }, + { + "epoch": 2.0525378450578806, + "ewc_loss": 0.008418715558946133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418715879088268e-05, + "grad_norm": 4.038957595825195, + "learning_rate": 1e-06, + "loss": 0.3227, + 
"mean_token_accuracy": 0.8886260986328125, + "num_tokens": 615557286.0, + "step": 16135 + }, + { + "epoch": 2.052665055336471, + "ewc_loss": 0.008358214981853962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35821483633481e-05, + "grad_norm": 4.0730180740356445, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8841392993927002, + "num_tokens": 615593284.0, + "step": 16136 + }, + { + "epoch": 2.0527922656150617, + "ewc_loss": 0.008382277563214302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382277883356437e-05, + "grad_norm": 4.079014778137207, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8783119916915894, + "num_tokens": 615633579.0, + "step": 16137 + }, + { + "epoch": 2.052919475893652, + "ewc_loss": 0.008378897793591022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378897473448887e-05, + "grad_norm": 3.9765384197235107, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8865641951560974, + "num_tokens": 615680135.0, + "step": 16138 + }, + { + "epoch": 2.0530466861722427, + "ewc_loss": 0.008296101354062557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.296101441374049e-05, + "grad_norm": 4.158687114715576, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8749116659164429, + "num_tokens": 615714801.0, + "step": 16139 + }, + { + "epoch": 2.0531738964508333, + "ewc_loss": 0.008457454852759838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.457455260213464e-05, + "grad_norm": 4.110645771026611, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8751341104507446, + "num_tokens": 615750538.0, + "step": 16140 + }, + { + "epoch": 2.053301106729424, + "ewc_loss": 0.008345091715455055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.345091919181868e-05, + "grad_norm": 4.036133766174316, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8879160284996033, + "num_tokens": 615789465.0, + "step": 16141 + }, + { + "epoch": 
2.0534283170080143, + "ewc_loss": 0.008316464722156525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316464663948864e-05, + "grad_norm": 4.048796653747559, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8692014217376709, + "num_tokens": 615832553.0, + "step": 16142 + }, + { + "epoch": 2.053555527286605, + "ewc_loss": 0.008348625153303146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348625124199316e-05, + "grad_norm": 4.105648994445801, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8802134990692139, + "num_tokens": 615868341.0, + "step": 16143 + }, + { + "epoch": 2.0536827375651954, + "ewc_loss": 0.008370621129870415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.370621071662754e-05, + "grad_norm": 4.0668792724609375, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8930421471595764, + "num_tokens": 615904324.0, + "step": 16144 + }, + { + "epoch": 2.053809947843786, + "ewc_loss": 0.008311763405799866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.311762940138578e-05, + "grad_norm": 3.9927093982696533, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.88853919506073, + "num_tokens": 615947656.0, + "step": 16145 + }, + { + "epoch": 2.0539371581223764, + "ewc_loss": 0.008285370655357838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.285370859084651e-05, + "grad_norm": 4.061784267425537, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8759182691574097, + "num_tokens": 615989050.0, + "step": 16146 + }, + { + "epoch": 2.054064368400967, + "ewc_loss": 0.008369441144168377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369440911337733e-05, + "grad_norm": 4.085330009460449, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8859320878982544, + "num_tokens": 616027638.0, + "step": 16147 + }, + { + "epoch": 2.0541915786795575, + "ewc_loss": 0.008327899500727654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.327899558935314e-05, + "grad_norm": 4.0425801277160645, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8908189535140991, + "num_tokens": 616066392.0, + "step": 16148 + }, + { + "epoch": 2.054318788958148, + "ewc_loss": 0.008299913257360458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299913315568119e-05, + "grad_norm": 4.00905704498291, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8918665051460266, + "num_tokens": 616108303.0, + "step": 16149 + }, + { + "epoch": 2.0544459992367385, + "ewc_loss": 0.008286859840154648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286859520012513e-05, + "grad_norm": 4.029108047485352, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8750877380371094, + "num_tokens": 616154174.0, + "step": 16150 + }, + { + "epoch": 2.0545732095153286, + "ewc_loss": 0.008283390663564205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28339034342207e-05, + "grad_norm": 4.162511348724365, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8811131715774536, + "num_tokens": 616193954.0, + "step": 16151 + }, + { + "epoch": 2.054700419793919, + "ewc_loss": 0.008341506123542786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.341506327269599e-05, + "grad_norm": 4.0752272605896, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8871979713439941, + "num_tokens": 616232771.0, + "step": 16152 + }, + { + "epoch": 2.0548276300725097, + "ewc_loss": 0.008235367015004158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.235366840381175e-05, + "grad_norm": 4.101529121398926, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8922700881958008, + "num_tokens": 616265151.0, + "step": 16153 + }, + { + "epoch": 2.0549548403511, + "ewc_loss": 0.008288266137242317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.288265962619334e-05, + "grad_norm": 4.05238676071167, + "learning_rate": 1e-06, + "loss": 0.3501, + 
"mean_token_accuracy": 0.8786838054656982, + "num_tokens": 616303886.0, + "step": 16154 + }, + { + "epoch": 2.0550820506296907, + "ewc_loss": 0.00823257490992546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.232575055444613e-05, + "grad_norm": 4.070040702819824, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8906964063644409, + "num_tokens": 616340040.0, + "step": 16155 + }, + { + "epoch": 2.0552092609082813, + "ewc_loss": 0.008265551179647446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.265551150543615e-05, + "grad_norm": 4.091813087463379, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8872315883636475, + "num_tokens": 616371329.0, + "step": 16156 + }, + { + "epoch": 2.055336471186872, + "ewc_loss": 0.00828571617603302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.285715739475563e-05, + "grad_norm": 4.059398651123047, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8953880071640015, + "num_tokens": 616406383.0, + "step": 16157 + }, + { + "epoch": 2.0554636814654623, + "ewc_loss": 0.00825822539627552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.258225716417655e-05, + "grad_norm": 4.094966411590576, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8705894947052002, + "num_tokens": 616445600.0, + "step": 16158 + }, + { + "epoch": 2.055590891744053, + "ewc_loss": 0.008288301527500153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.288301614811644e-05, + "grad_norm": 4.0542192459106445, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8747310638427734, + "num_tokens": 616485376.0, + "step": 16159 + }, + { + "epoch": 2.0557181020226434, + "ewc_loss": 0.008256141096353531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.256141154561192e-05, + "grad_norm": 4.076188564300537, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8733490705490112, + "num_tokens": 616527104.0, + "step": 16160 + }, + { + "epoch": 
2.055845312301234, + "ewc_loss": 0.008298331871628761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298331522382796e-05, + "grad_norm": 4.030022144317627, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8839823007583618, + "num_tokens": 616567557.0, + "step": 16161 + }, + { + "epoch": 2.0559725225798244, + "ewc_loss": 0.008261265233159065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.261265611508861e-05, + "grad_norm": 4.1132965087890625, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8887625932693481, + "num_tokens": 616601612.0, + "step": 16162 + }, + { + "epoch": 2.056099732858415, + "ewc_loss": 0.008314757607877254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314757724292576e-05, + "grad_norm": 4.053135395050049, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8797256946563721, + "num_tokens": 616638019.0, + "step": 16163 + }, + { + "epoch": 2.0562269431370055, + "ewc_loss": 0.008274959400296211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.274959691334516e-05, + "grad_norm": 4.090826034545898, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8818685412406921, + "num_tokens": 616675224.0, + "step": 16164 + }, + { + "epoch": 2.056354153415596, + "ewc_loss": 0.008305827155709267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.305827213916928e-05, + "grad_norm": 4.048976898193359, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8800415992736816, + "num_tokens": 616713628.0, + "step": 16165 + }, + { + "epoch": 2.0564813636941865, + "ewc_loss": 0.008287638425827026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.287638775072992e-05, + "grad_norm": 4.096560001373291, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8817878365516663, + "num_tokens": 616755985.0, + "step": 16166 + }, + { + "epoch": 2.056608573972777, + "ewc_loss": 0.00832031387835741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.320313645526767e-05, + "grad_norm": 4.094543933868408, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8846046924591064, + "num_tokens": 616786564.0, + "step": 16167 + }, + { + "epoch": 2.0567357842513676, + "ewc_loss": 0.00831760186702013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31760189612396e-05, + "grad_norm": 4.119806289672852, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8569060564041138, + "num_tokens": 616823285.0, + "step": 16168 + }, + { + "epoch": 2.056862994529958, + "ewc_loss": 0.008349996991455555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.349997369805351e-05, + "grad_norm": 4.009688377380371, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8883423209190369, + "num_tokens": 616864465.0, + "step": 16169 + }, + { + "epoch": 2.0569902048085487, + "ewc_loss": 0.008257286623120308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.257286390289664e-05, + "grad_norm": 4.04354190826416, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8749094009399414, + "num_tokens": 616906670.0, + "step": 16170 + }, + { + "epoch": 2.057117415087139, + "ewc_loss": 0.008338280022144318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33828016766347e-05, + "grad_norm": 4.042645454406738, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8820448517799377, + "num_tokens": 616947656.0, + "step": 16171 + }, + { + "epoch": 2.0572446253657297, + "ewc_loss": 0.00831513199955225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315131708513945e-05, + "grad_norm": 4.161221981048584, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8765510320663452, + "num_tokens": 616980161.0, + "step": 16172 + }, + { + "epoch": 2.0573718356443202, + "ewc_loss": 0.008383404463529587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.383404201595113e-05, + "grad_norm": 4.08645486831665, + "learning_rate": 1e-06, + "loss": 0.3338, + 
"mean_token_accuracy": 0.8828927278518677, + "num_tokens": 617015281.0, + "step": 16173 + }, + { + "epoch": 2.0574990459229108, + "ewc_loss": 0.008302473463118076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30247372505255e-05, + "grad_norm": 4.026961803436279, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8837222456932068, + "num_tokens": 617054174.0, + "step": 16174 + }, + { + "epoch": 2.057626256201501, + "ewc_loss": 0.008290814235806465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.290814002975821e-05, + "grad_norm": 4.005764484405518, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.881737470626831, + "num_tokens": 617095160.0, + "step": 16175 + }, + { + "epoch": 2.0577534664800914, + "ewc_loss": 0.008307414129376411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.307414100272581e-05, + "grad_norm": 4.140521049499512, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8863542675971985, + "num_tokens": 617130781.0, + "step": 16176 + }, + { + "epoch": 2.057880676758682, + "ewc_loss": 0.00839087925851345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390878792852163e-05, + "grad_norm": 4.114887237548828, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.880159854888916, + "num_tokens": 617169473.0, + "step": 16177 + }, + { + "epoch": 2.0580078870372724, + "ewc_loss": 0.008307325653731823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30732606118545e-05, + "grad_norm": 4.067986488342285, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8764699697494507, + "num_tokens": 617212208.0, + "step": 16178 + }, + { + "epoch": 2.058135097315863, + "ewc_loss": 0.008286647498607635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28664778964594e-05, + "grad_norm": 4.07502555847168, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8842142224311829, + "num_tokens": 617246055.0, + "step": 16179 + }, + { + "epoch": 
2.0582623075944535, + "ewc_loss": 0.008322397246956825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.322397479787469e-05, + "grad_norm": 4.083251953125, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8788855075836182, + "num_tokens": 617285614.0, + "step": 16180 + }, + { + "epoch": 2.058389517873044, + "ewc_loss": 0.008287941105663776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.287941454909742e-05, + "grad_norm": 4.023723602294922, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8907148838043213, + "num_tokens": 617323401.0, + "step": 16181 + }, + { + "epoch": 2.0585167281516346, + "ewc_loss": 0.008275882340967655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.275882282759994e-05, + "grad_norm": 4.001554012298584, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.869308352470398, + "num_tokens": 617372550.0, + "step": 16182 + }, + { + "epoch": 2.058643938430225, + "ewc_loss": 0.008280046284198761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.280046313302591e-05, + "grad_norm": 4.067530632019043, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8894228935241699, + "num_tokens": 617407894.0, + "step": 16183 + }, + { + "epoch": 2.0587711487088156, + "ewc_loss": 0.008328023366630077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328023250214756e-05, + "grad_norm": 4.052736282348633, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8758724331855774, + "num_tokens": 617450117.0, + "step": 16184 + }, + { + "epoch": 2.058898358987406, + "ewc_loss": 0.008295858278870583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.295858424389735e-05, + "grad_norm": 4.0928778648376465, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8882863521575928, + "num_tokens": 617485885.0, + "step": 16185 + }, + { + "epoch": 2.0590255692659967, + "ewc_loss": 0.008307532407343388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.307531970785931e-05, + "grad_norm": 4.125649929046631, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8681472539901733, + "num_tokens": 617520511.0, + "step": 16186 + }, + { + "epoch": 2.059152779544587, + "ewc_loss": 0.008317489176988602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317489118780941e-05, + "grad_norm": 4.085808277130127, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8760375380516052, + "num_tokens": 617558288.0, + "step": 16187 + }, + { + "epoch": 2.0592799898231777, + "ewc_loss": 0.008306440897285938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.306441304739565e-05, + "grad_norm": 3.9799294471740723, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8810659646987915, + "num_tokens": 617602991.0, + "step": 16188 + }, + { + "epoch": 2.0594072001017683, + "ewc_loss": 0.008247283287346363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.247283403761685e-05, + "grad_norm": 4.059241771697998, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8796383738517761, + "num_tokens": 617639629.0, + "step": 16189 + }, + { + "epoch": 2.059534410380359, + "ewc_loss": 0.008338071405887604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.338071347679943e-05, + "grad_norm": 4.077392101287842, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8756698369979858, + "num_tokens": 617676218.0, + "step": 16190 + }, + { + "epoch": 2.0596616206589493, + "ewc_loss": 0.008308829739689827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30882927402854e-05, + "grad_norm": 4.022502899169922, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.875597357749939, + "num_tokens": 617716270.0, + "step": 16191 + }, + { + "epoch": 2.05978883093754, + "ewc_loss": 0.008267262019217014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.26726172817871e-05, + "grad_norm": 4.03934907913208, + "learning_rate": 1e-06, + "loss": 0.3163, + 
"mean_token_accuracy": 0.8903520107269287, + "num_tokens": 617755559.0, + "step": 16192 + }, + { + "epoch": 2.0599160412161304, + "ewc_loss": 0.008286923170089722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286923548439518e-05, + "grad_norm": 4.031177520751953, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8831582069396973, + "num_tokens": 617793522.0, + "step": 16193 + }, + { + "epoch": 2.060043251494721, + "ewc_loss": 0.008285206742584705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.285207150038332e-05, + "grad_norm": 4.103636264801025, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8852603435516357, + "num_tokens": 617828005.0, + "step": 16194 + }, + { + "epoch": 2.0601704617733114, + "ewc_loss": 0.008332237601280212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33223748486489e-05, + "grad_norm": 4.074260234832764, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8801947832107544, + "num_tokens": 617863824.0, + "step": 16195 + }, + { + "epoch": 2.060297672051902, + "ewc_loss": 0.008283419534564018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.283419447252527e-05, + "grad_norm": 4.032233238220215, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8762960433959961, + "num_tokens": 617903648.0, + "step": 16196 + }, + { + "epoch": 2.0604248823304925, + "ewc_loss": 0.008276421576738358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.276421431219205e-05, + "grad_norm": 4.0006794929504395, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8793464303016663, + "num_tokens": 617946229.0, + "step": 16197 + }, + { + "epoch": 2.060552092609083, + "ewc_loss": 0.008275308646261692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.275308937299997e-05, + "grad_norm": 4.025655269622803, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8821595907211304, + "num_tokens": 617990486.0, + "step": 16198 + }, + { + "epoch": 
2.0606793028876735, + "ewc_loss": 0.008296096697449684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29609707579948e-05, + "grad_norm": 4.039783954620361, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8931648135185242, + "num_tokens": 618031743.0, + "step": 16199 + }, + { + "epoch": 2.0608065131662636, + "ewc_loss": 0.00829275045543909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.292750862892717e-05, + "grad_norm": 4.054728031158447, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8803868293762207, + "num_tokens": 618070181.0, + "step": 16200 + }, + { + "epoch": 2.060933723444854, + "ewc_loss": 0.008287494070827961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28749398351647e-05, + "grad_norm": 4.08584451675415, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8879537582397461, + "num_tokens": 618106396.0, + "step": 16201 + }, + { + "epoch": 2.0610609337234447, + "ewc_loss": 0.008316811174154282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316810999531299e-05, + "grad_norm": 4.056363105773926, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8870091438293457, + "num_tokens": 618148281.0, + "step": 16202 + }, + { + "epoch": 2.061188144002035, + "ewc_loss": 0.008272047154605389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.272047125501558e-05, + "grad_norm": 4.101505279541016, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8757454752922058, + "num_tokens": 618182721.0, + "step": 16203 + }, + { + "epoch": 2.0613153542806257, + "ewc_loss": 0.00831680465489626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316804451169446e-05, + "grad_norm": 4.018793106079102, + "learning_rate": 1e-06, + "loss": 0.296, + "mean_token_accuracy": 0.896613359451294, + "num_tokens": 618219497.0, + "step": 16204 + }, + { + "epoch": 2.0614425645592163, + "ewc_loss": 0.008264073170721531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.264073403552175e-05, + "grad_norm": 4.039164066314697, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8969830274581909, + "num_tokens": 618258643.0, + "step": 16205 + }, + { + "epoch": 2.061569774837807, + "ewc_loss": 0.008288493379950523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.288493700092658e-05, + "grad_norm": 4.075417518615723, + "learning_rate": 1e-06, + "loss": 0.295, + "mean_token_accuracy": 0.8953972458839417, + "num_tokens": 618292682.0, + "step": 16206 + }, + { + "epoch": 2.0616969851163973, + "ewc_loss": 0.008309517055749893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30951685202308e-05, + "grad_norm": 4.120673179626465, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8699119687080383, + "num_tokens": 618331375.0, + "step": 16207 + }, + { + "epoch": 2.061824195394988, + "ewc_loss": 0.008311154320836067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.311153942486271e-05, + "grad_norm": 4.064872741699219, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8867135643959045, + "num_tokens": 618367452.0, + "step": 16208 + }, + { + "epoch": 2.0619514056735784, + "ewc_loss": 0.008278966881334782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.278966561192647e-05, + "grad_norm": 4.042939186096191, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8816508054733276, + "num_tokens": 618410470.0, + "step": 16209 + }, + { + "epoch": 2.062078615952169, + "ewc_loss": 0.008275194093585014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.275193977169693e-05, + "grad_norm": 4.091202259063721, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8737055659294128, + "num_tokens": 618447524.0, + "step": 16210 + }, + { + "epoch": 2.0622058262307594, + "ewc_loss": 0.008332226425409317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.332226570928469e-05, + "grad_norm": 4.141678810119629, + "learning_rate": 1e-06, + "loss": 0.3874, + 
"mean_token_accuracy": 0.8645807504653931, + "num_tokens": 618481793.0, + "step": 16211 + }, + { + "epoch": 2.06233303650935, + "ewc_loss": 0.00831989198923111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.319892367580906e-05, + "grad_norm": 4.038053035736084, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8896963000297546, + "num_tokens": 618517097.0, + "step": 16212 + }, + { + "epoch": 2.0624602467879405, + "ewc_loss": 0.008266064338386059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.266064105555415e-05, + "grad_norm": 4.061542987823486, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8797895908355713, + "num_tokens": 618554221.0, + "step": 16213 + }, + { + "epoch": 2.062587457066531, + "ewc_loss": 0.008329846896231174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.329847332788631e-05, + "grad_norm": 4.0426926612854, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.891780436038971, + "num_tokens": 618591279.0, + "step": 16214 + }, + { + "epoch": 2.0627146673451215, + "ewc_loss": 0.008327626623213291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.327626710524783e-05, + "grad_norm": 4.061852931976318, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8709173798561096, + "num_tokens": 618628168.0, + "step": 16215 + }, + { + "epoch": 2.062841877623712, + "ewc_loss": 0.00832982175052166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.329821866936982e-05, + "grad_norm": 4.024191379547119, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8884567022323608, + "num_tokens": 618664580.0, + "step": 16216 + }, + { + "epoch": 2.0629690879023026, + "ewc_loss": 0.008314371109008789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314371370943263e-05, + "grad_norm": 4.126253604888916, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8708772659301758, + "num_tokens": 618699007.0, + "step": 16217 + }, + { + "epoch": 
2.063096298180893, + "ewc_loss": 0.008378427475690842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378427446587011e-05, + "grad_norm": 4.06829309463501, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8835328221321106, + "num_tokens": 618736630.0, + "step": 16218 + }, + { + "epoch": 2.0632235084594837, + "ewc_loss": 0.008308779448270798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.308779069921002e-05, + "grad_norm": 4.007019996643066, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8893849849700928, + "num_tokens": 618777868.0, + "step": 16219 + }, + { + "epoch": 2.063350718738074, + "ewc_loss": 0.008289354853332043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.289355173474178e-05, + "grad_norm": 4.0179443359375, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8862067461013794, + "num_tokens": 618819134.0, + "step": 16220 + }, + { + "epoch": 2.0634779290166647, + "ewc_loss": 0.008309416472911835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.309416443808004e-05, + "grad_norm": 4.000831604003906, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8819766044616699, + "num_tokens": 618862231.0, + "step": 16221 + }, + { + "epoch": 2.0636051392952552, + "ewc_loss": 0.00828761700540781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.287616947200149e-05, + "grad_norm": 4.036783218383789, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8755457401275635, + "num_tokens": 618906816.0, + "step": 16222 + }, + { + "epoch": 2.0637323495738458, + "ewc_loss": 0.008280488662421703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.280488691525534e-05, + "grad_norm": 4.057175159454346, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8809226751327515, + "num_tokens": 618943239.0, + "step": 16223 + }, + { + "epoch": 2.0638595598524363, + "ewc_loss": 0.008295600302517414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.295600127894431e-05, + "grad_norm": 4.07089900970459, + "learning_rate": 1e-06, + "loss": 0.295, + "mean_token_accuracy": 0.8970580697059631, + "num_tokens": 618978641.0, + "step": 16224 + }, + { + "epoch": 2.0639867701310264, + "ewc_loss": 0.008278002962470055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.278003224404529e-05, + "grad_norm": 4.05742073059082, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8838365077972412, + "num_tokens": 619020522.0, + "step": 16225 + }, + { + "epoch": 2.064113980409617, + "ewc_loss": 0.00824926421046257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.249263919424266e-05, + "grad_norm": 4.04578161239624, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.893545925617218, + "num_tokens": 619060769.0, + "step": 16226 + }, + { + "epoch": 2.0642411906882074, + "ewc_loss": 0.008252192288637161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.252192492363974e-05, + "grad_norm": 4.027641296386719, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8926076889038086, + "num_tokens": 619098711.0, + "step": 16227 + }, + { + "epoch": 2.064368400966798, + "ewc_loss": 0.008237154223024845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.237153815571219e-05, + "grad_norm": 4.109360218048096, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8783726692199707, + "num_tokens": 619133673.0, + "step": 16228 + }, + { + "epoch": 2.0644956112453885, + "ewc_loss": 0.008265594951808453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.2655948062893e-05, + "grad_norm": 4.06704044342041, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.876861572265625, + "num_tokens": 619174554.0, + "step": 16229 + }, + { + "epoch": 2.064622821523979, + "ewc_loss": 0.008238843642175198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.238843292929232e-05, + "grad_norm": 4.089316368103027, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 
0.872529149055481, + "num_tokens": 619211095.0, + "step": 16230 + }, + { + "epoch": 2.0647500318025696, + "ewc_loss": 0.008250216022133827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.250215614680201e-05, + "grad_norm": 4.0477190017700195, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8928686380386353, + "num_tokens": 619246346.0, + "step": 16231 + }, + { + "epoch": 2.06487724208116, + "ewc_loss": 0.008214760571718216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.214760600822046e-05, + "grad_norm": 4.11804723739624, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8796241879463196, + "num_tokens": 619280629.0, + "step": 16232 + }, + { + "epoch": 2.0650044523597506, + "ewc_loss": 0.008264502510428429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.264502685051411e-05, + "grad_norm": 4.08726692199707, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8639841079711914, + "num_tokens": 619320928.0, + "step": 16233 + }, + { + "epoch": 2.065131662638341, + "ewc_loss": 0.008234377950429916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.234378037741408e-05, + "grad_norm": 3.964839220046997, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8821314573287964, + "num_tokens": 619369747.0, + "step": 16234 + }, + { + "epoch": 2.0652588729169317, + "ewc_loss": 0.008189566433429718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.189566869987175e-05, + "grad_norm": 4.061224937438965, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.883841872215271, + "num_tokens": 619408694.0, + "step": 16235 + }, + { + "epoch": 2.065386083195522, + "ewc_loss": 0.008299757726490498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299757610075176e-05, + "grad_norm": 4.041247367858887, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8912678360939026, + "num_tokens": 619446858.0, + "step": 16236 + }, + { + "epoch": 2.0655132934741127, + "ewc_loss": 
0.008237199857831001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.237199654104188e-05, + "grad_norm": 4.0755205154418945, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8837333917617798, + "num_tokens": 619482281.0, + "step": 16237 + }, + { + "epoch": 2.0656405037527032, + "ewc_loss": 0.008276908658444881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.276908920379356e-05, + "grad_norm": 4.034437656402588, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8907774090766907, + "num_tokens": 619520309.0, + "step": 16238 + }, + { + "epoch": 2.065767714031294, + "ewc_loss": 0.008236691355705261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.236691064666957e-05, + "grad_norm": 4.080826282501221, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.886316180229187, + "num_tokens": 619554662.0, + "step": 16239 + }, + { + "epoch": 2.0658949243098843, + "ewc_loss": 0.008285466581583023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.285466174129397e-05, + "grad_norm": 4.025938034057617, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8797011375427246, + "num_tokens": 619599632.0, + "step": 16240 + }, + { + "epoch": 2.066022134588475, + "ewc_loss": 0.00823202170431614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.232021355070174e-05, + "grad_norm": 4.043249607086182, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8950879573822021, + "num_tokens": 619639414.0, + "step": 16241 + }, + { + "epoch": 2.0661493448670654, + "ewc_loss": 0.00827019289135933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.270193211501464e-05, + "grad_norm": 4.091846466064453, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8865678310394287, + "num_tokens": 619675878.0, + "step": 16242 + }, + { + "epoch": 2.066276555145656, + "ewc_loss": 0.008292434737086296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.292435086332262e-05, + "grad_norm": 
4.09537935256958, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8860867619514465, + "num_tokens": 619709486.0, + "step": 16243 + }, + { + "epoch": 2.0664037654242464, + "ewc_loss": 0.008252733387053013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.252733096014708e-05, + "grad_norm": 4.0342841148376465, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8835723400115967, + "num_tokens": 619753576.0, + "step": 16244 + }, + { + "epoch": 2.066530975702837, + "ewc_loss": 0.008224917575716972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.224917837651446e-05, + "grad_norm": 4.093029022216797, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8836493492126465, + "num_tokens": 619792520.0, + "step": 16245 + }, + { + "epoch": 2.0666581859814275, + "ewc_loss": 0.008286911994218826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286911906907335e-05, + "grad_norm": 4.146759033203125, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.862498939037323, + "num_tokens": 619827871.0, + "step": 16246 + }, + { + "epoch": 2.066785396260018, + "ewc_loss": 0.00830511562526226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30511562526226e-05, + "grad_norm": 4.10184907913208, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8788276314735413, + "num_tokens": 619864527.0, + "step": 16247 + }, + { + "epoch": 2.0669126065386085, + "ewc_loss": 0.00826544314622879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.265443466370925e-05, + "grad_norm": 4.072573184967041, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8834215402603149, + "num_tokens": 619904934.0, + "step": 16248 + }, + { + "epoch": 2.0670398168171986, + "ewc_loss": 0.00825143326073885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.251432882389054e-05, + "grad_norm": 4.041810512542725, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8885544538497925, + "num_tokens": 
619945472.0, + "step": 16249 + }, + { + "epoch": 2.067167027095789, + "ewc_loss": 0.008244316093623638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.244316268246621e-05, + "grad_norm": 4.026369571685791, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8951984643936157, + "num_tokens": 619981738.0, + "step": 16250 + }, + { + "epoch": 2.0672942373743797, + "ewc_loss": 0.008250514976680279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.250515384133905e-05, + "grad_norm": 4.027804374694824, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8850958943367004, + "num_tokens": 620024593.0, + "step": 16251 + }, + { + "epoch": 2.06742144765297, + "ewc_loss": 0.008265475742518902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.265475480584428e-05, + "grad_norm": 4.052547454833984, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8881486654281616, + "num_tokens": 620064059.0, + "step": 16252 + }, + { + "epoch": 2.0675486579315607, + "ewc_loss": 0.008270049467682838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.270049147540703e-05, + "grad_norm": 4.056096076965332, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8837146759033203, + "num_tokens": 620100028.0, + "step": 16253 + }, + { + "epoch": 2.0676758682101513, + "ewc_loss": 0.008261265233159065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.2612648839131e-05, + "grad_norm": 4.075925350189209, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.896098256111145, + "num_tokens": 620135181.0, + "step": 16254 + }, + { + "epoch": 2.067803078488742, + "ewc_loss": 0.008263354189693928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.263353811344132e-05, + "grad_norm": 4.087676525115967, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8846647143363953, + "num_tokens": 620174887.0, + "step": 16255 + }, + { + "epoch": 2.0679302887673323, + "ewc_loss": 0.008255520835518837, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.255521242972463e-05, + "grad_norm": 4.079548358917236, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8825053572654724, + "num_tokens": 620212924.0, + "step": 16256 + }, + { + "epoch": 2.068057499045923, + "ewc_loss": 0.008229635655879974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.229635568568483e-05, + "grad_norm": 4.0651984214782715, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8749677538871765, + "num_tokens": 620251088.0, + "step": 16257 + }, + { + "epoch": 2.0681847093245134, + "ewc_loss": 0.008245833218097687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.245833305409178e-05, + "grad_norm": 4.113231658935547, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.883745551109314, + "num_tokens": 620287888.0, + "step": 16258 + }, + { + "epoch": 2.068311919603104, + "ewc_loss": 0.008269908837974072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.26990872155875e-05, + "grad_norm": 4.065804958343506, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8806938529014587, + "num_tokens": 620324937.0, + "step": 16259 + }, + { + "epoch": 2.0684391298816944, + "ewc_loss": 0.008208829909563065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.208829967770725e-05, + "grad_norm": 4.111298084259033, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8837676048278809, + "num_tokens": 620363390.0, + "step": 16260 + }, + { + "epoch": 2.068566340160285, + "ewc_loss": 0.0082786800339818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.278679888462648e-05, + "grad_norm": 4.020438194274902, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.891122043132782, + "num_tokens": 620403707.0, + "step": 16261 + }, + { + "epoch": 2.0686935504388755, + "ewc_loss": 0.008198440074920654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.198439900297672e-05, + "grad_norm": 4.011478424072266, + "learning_rate": 
1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8870779275894165, + "num_tokens": 620445994.0, + "step": 16262 + }, + { + "epoch": 2.068820760717466, + "ewc_loss": 0.008220604620873928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.220604649977759e-05, + "grad_norm": 4.054778099060059, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8887737393379211, + "num_tokens": 620480883.0, + "step": 16263 + }, + { + "epoch": 2.0689479709960565, + "ewc_loss": 0.008253761567175388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.253761188825592e-05, + "grad_norm": 4.087095737457275, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8953871726989746, + "num_tokens": 620518935.0, + "step": 16264 + }, + { + "epoch": 2.069075181274647, + "ewc_loss": 0.008244305849075317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.244306081905961e-05, + "grad_norm": 4.144603729248047, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8766941428184509, + "num_tokens": 620551985.0, + "step": 16265 + }, + { + "epoch": 2.0692023915532376, + "ewc_loss": 0.00830902624875307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.309026452479884e-05, + "grad_norm": 4.037713050842285, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.879054069519043, + "num_tokens": 620592809.0, + "step": 16266 + }, + { + "epoch": 2.069329601831828, + "ewc_loss": 0.008217010647058487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.217011054512113e-05, + "grad_norm": 4.115056037902832, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8724101781845093, + "num_tokens": 620627951.0, + "step": 16267 + }, + { + "epoch": 2.0694568121104187, + "ewc_loss": 0.00831283163279295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312831778312102e-05, + "grad_norm": 4.119919300079346, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.887198805809021, + "num_tokens": 620660892.0, + "step": 16268 + }, + { 
+ "epoch": 2.069584022389009, + "ewc_loss": 0.008299028500914574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299028559122235e-05, + "grad_norm": 4.139710903167725, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8614407777786255, + "num_tokens": 620698165.0, + "step": 16269 + }, + { + "epoch": 2.0697112326675997, + "ewc_loss": 0.008325012400746346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.325012458954006e-05, + "grad_norm": 4.115137577056885, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8738663792610168, + "num_tokens": 620734758.0, + "step": 16270 + }, + { + "epoch": 2.0698384429461902, + "ewc_loss": 0.00832164566963911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321645873365924e-05, + "grad_norm": 4.121066570281982, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8660527467727661, + "num_tokens": 620770224.0, + "step": 16271 + }, + { + "epoch": 2.0699656532247808, + "ewc_loss": 0.008342400193214417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.342400542460382e-05, + "grad_norm": 4.04050350189209, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8911541104316711, + "num_tokens": 620805724.0, + "step": 16272 + }, + { + "epoch": 2.070092863503371, + "ewc_loss": 0.008308921940624714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30892167869024e-05, + "grad_norm": 4.051488876342773, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8728011250495911, + "num_tokens": 620847622.0, + "step": 16273 + }, + { + "epoch": 2.0702200737819614, + "ewc_loss": 0.008347094990313053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.347094990313053e-05, + "grad_norm": 4.095371246337891, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8796425461769104, + "num_tokens": 620879042.0, + "step": 16274 + }, + { + "epoch": 2.070347284060552, + "ewc_loss": 0.008369402959942818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.36940307635814e-05, + "grad_norm": 4.0326619148254395, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8802382946014404, + "num_tokens": 620922685.0, + "step": 16275 + }, + { + "epoch": 2.0704744943391424, + "ewc_loss": 0.008341671898961067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.341672219103202e-05, + "grad_norm": 4.077376842498779, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.877618670463562, + "num_tokens": 620954854.0, + "step": 16276 + }, + { + "epoch": 2.070601704617733, + "ewc_loss": 0.008407781831920147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407781569985673e-05, + "grad_norm": 4.07738733291626, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8807437419891357, + "num_tokens": 620994581.0, + "step": 16277 + }, + { + "epoch": 2.0707289148963235, + "ewc_loss": 0.008391344919800758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391344454139471e-05, + "grad_norm": 4.075395584106445, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8829783797264099, + "num_tokens": 621030052.0, + "step": 16278 + }, + { + "epoch": 2.070856125174914, + "ewc_loss": 0.008389336057007313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389336289837956e-05, + "grad_norm": 4.017759323120117, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8915013074874878, + "num_tokens": 621072474.0, + "step": 16279 + }, + { + "epoch": 2.0709833354535045, + "ewc_loss": 0.008367466740310192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.367466944037005e-05, + "grad_norm": 4.155769348144531, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8868259191513062, + "num_tokens": 621104365.0, + "step": 16280 + }, + { + "epoch": 2.071110545732095, + "ewc_loss": 0.008461394347250462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.461394463665783e-05, + "grad_norm": 4.0510454177856445, + "learning_rate": 1e-06, + "loss": 0.3272, + 
"mean_token_accuracy": 0.8867108821868896, + "num_tokens": 621140497.0, + "step": 16281 + }, + { + "epoch": 2.0712377560106856, + "ewc_loss": 0.008338768035173416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33876765682362e-05, + "grad_norm": 4.090669631958008, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8831243515014648, + "num_tokens": 621177431.0, + "step": 16282 + }, + { + "epoch": 2.071364966289276, + "ewc_loss": 0.008397683501243591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.397683268412948e-05, + "grad_norm": 4.082513332366943, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8852387070655823, + "num_tokens": 621212874.0, + "step": 16283 + }, + { + "epoch": 2.0714921765678667, + "ewc_loss": 0.00836934894323349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369348506676033e-05, + "grad_norm": 4.074302673339844, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8886876106262207, + "num_tokens": 621249927.0, + "step": 16284 + }, + { + "epoch": 2.071619386846457, + "ewc_loss": 0.008378342725336552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378343045478687e-05, + "grad_norm": 4.155237674713135, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8592158555984497, + "num_tokens": 621280794.0, + "step": 16285 + }, + { + "epoch": 2.0717465971250477, + "ewc_loss": 0.008421997539699078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421997335972264e-05, + "grad_norm": 4.078853607177734, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8672982454299927, + "num_tokens": 621321167.0, + "step": 16286 + }, + { + "epoch": 2.0718738074036382, + "ewc_loss": 0.008363451808691025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363452070625499e-05, + "grad_norm": 4.035670280456543, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8929958343505859, + "num_tokens": 621357962.0, + "step": 16287 + }, + { + "epoch": 
2.0720010176822288, + "ewc_loss": 0.008360727690160275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360727952094749e-05, + "grad_norm": 4.014564037322998, + "learning_rate": 1e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.8988261222839355, + "num_tokens": 621398933.0, + "step": 16288 + }, + { + "epoch": 2.0721282279608193, + "ewc_loss": 0.008363638073205948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363638335140422e-05, + "grad_norm": 4.004453182220459, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.9006597399711609, + "num_tokens": 621442334.0, + "step": 16289 + }, + { + "epoch": 2.07225543823941, + "ewc_loss": 0.008364569395780563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.364568930119276e-05, + "grad_norm": 4.0743818283081055, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.885688304901123, + "num_tokens": 621478706.0, + "step": 16290 + }, + { + "epoch": 2.0723826485180004, + "ewc_loss": 0.00839940831065178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39940839796327e-05, + "grad_norm": 4.094496726989746, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8885053396224976, + "num_tokens": 621513911.0, + "step": 16291 + }, + { + "epoch": 2.072509858796591, + "ewc_loss": 0.008393843658268452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.393843745579943e-05, + "grad_norm": 4.055535793304443, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8784494400024414, + "num_tokens": 621554080.0, + "step": 16292 + }, + { + "epoch": 2.0726370690751814, + "ewc_loss": 0.008331989869475365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331990102306008e-05, + "grad_norm": 4.066717147827148, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8857147693634033, + "num_tokens": 621592059.0, + "step": 16293 + }, + { + "epoch": 2.072764279353772, + "ewc_loss": 0.008360743522644043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.3607439592015e-05, + "grad_norm": 4.06381368637085, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8942227363586426, + "num_tokens": 621629626.0, + "step": 16294 + }, + { + "epoch": 2.0728914896323625, + "ewc_loss": 0.00834731012582779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.347310358658433e-05, + "grad_norm": 4.021275997161865, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8874274492263794, + "num_tokens": 621671341.0, + "step": 16295 + }, + { + "epoch": 2.073018699910953, + "ewc_loss": 0.00832977332174778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.329773118020967e-05, + "grad_norm": 4.027937889099121, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8957196474075317, + "num_tokens": 621708756.0, + "step": 16296 + }, + { + "epoch": 2.0731459101895435, + "ewc_loss": 0.008330967277288437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330967102665454e-05, + "grad_norm": 4.067614555358887, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8872607946395874, + "num_tokens": 621748009.0, + "step": 16297 + }, + { + "epoch": 2.0732731204681336, + "ewc_loss": 0.008365054614841938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.365054236492142e-05, + "grad_norm": 4.0925374031066895, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8909928798675537, + "num_tokens": 621788695.0, + "step": 16298 + }, + { + "epoch": 2.073400330746724, + "ewc_loss": 0.008323369547724724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323369547724724e-05, + "grad_norm": 4.051936626434326, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8827744126319885, + "num_tokens": 621828099.0, + "step": 16299 + }, + { + "epoch": 2.0735275410253147, + "ewc_loss": 0.00828800443559885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.288004755740985e-05, + "grad_norm": 4.101186275482178, + "learning_rate": 1e-06, + "loss": 0.3866, + 
"mean_token_accuracy": 0.8661355972290039, + "num_tokens": 621867033.0, + "step": 16300 + }, + { + "epoch": 2.073654751303905, + "ewc_loss": 0.008331756107509136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331755816470832e-05, + "grad_norm": 4.042318344116211, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8741766810417175, + "num_tokens": 621907435.0, + "step": 16301 + }, + { + "epoch": 2.0737819615824957, + "ewc_loss": 0.008257459849119186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.257460285676643e-05, + "grad_norm": 4.069101810455322, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.876292884349823, + "num_tokens": 621948995.0, + "step": 16302 + }, + { + "epoch": 2.0739091718610863, + "ewc_loss": 0.008290900848805904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29090058687143e-05, + "grad_norm": 4.004595756530762, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8937355279922485, + "num_tokens": 621991109.0, + "step": 16303 + }, + { + "epoch": 2.074036382139677, + "ewc_loss": 0.008240942843258381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.240943134296685e-05, + "grad_norm": 4.090909481048584, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8902524709701538, + "num_tokens": 622026585.0, + "step": 16304 + }, + { + "epoch": 2.0741635924182673, + "ewc_loss": 0.008312616497278214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312616409966722e-05, + "grad_norm": 4.076258182525635, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8920989632606506, + "num_tokens": 622065208.0, + "step": 16305 + }, + { + "epoch": 2.074290802696858, + "ewc_loss": 0.008268482983112335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.26848263386637e-05, + "grad_norm": 4.074331283569336, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8763160705566406, + "num_tokens": 622104838.0, + "step": 16306 + }, + { + "epoch": 
2.0744180129754484, + "ewc_loss": 0.008270984515547752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.270984108094126e-05, + "grad_norm": 4.158802032470703, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8739861845970154, + "num_tokens": 622136879.0, + "step": 16307 + }, + { + "epoch": 2.074545223254039, + "ewc_loss": 0.00832429714500904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324297232320532e-05, + "grad_norm": 4.076473236083984, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.894126296043396, + "num_tokens": 622175938.0, + "step": 16308 + }, + { + "epoch": 2.0746724335326294, + "ewc_loss": 0.008254519663751125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.25451934360899e-05, + "grad_norm": 4.041576862335205, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8924599885940552, + "num_tokens": 622216663.0, + "step": 16309 + }, + { + "epoch": 2.07479964381122, + "ewc_loss": 0.008258922025561333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.258922025561333e-05, + "grad_norm": 4.068249702453613, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8915965557098389, + "num_tokens": 622250450.0, + "step": 16310 + }, + { + "epoch": 2.0749268540898105, + "ewc_loss": 0.008290582336485386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29058262752369e-05, + "grad_norm": 4.054518699645996, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8776875138282776, + "num_tokens": 622291919.0, + "step": 16311 + }, + { + "epoch": 2.075054064368401, + "ewc_loss": 0.008259602822363377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.25960305519402e-05, + "grad_norm": 4.07895040512085, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8830296993255615, + "num_tokens": 622325892.0, + "step": 16312 + }, + { + "epoch": 2.0751812746469915, + "ewc_loss": 0.00828329287469387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28329284559004e-05, + 
"grad_norm": 4.128431797027588, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8848110437393188, + "num_tokens": 622356966.0, + "step": 16313 + }, + { + "epoch": 2.075308484925582, + "ewc_loss": 0.008320157416164875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.320157212438062e-05, + "grad_norm": 4.087881088256836, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.885005533695221, + "num_tokens": 622389931.0, + "step": 16314 + }, + { + "epoch": 2.0754356952041726, + "ewc_loss": 0.008270427584648132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.270427497336641e-05, + "grad_norm": 4.079136848449707, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8825507760047913, + "num_tokens": 622427817.0, + "step": 16315 + }, + { + "epoch": 2.075562905482763, + "ewc_loss": 0.008306262083351612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.306262316182256e-05, + "grad_norm": 4.074373722076416, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8727675676345825, + "num_tokens": 622470353.0, + "step": 16316 + }, + { + "epoch": 2.0756901157613536, + "ewc_loss": 0.008286773227155209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286772936116904e-05, + "grad_norm": 4.021843910217285, + "learning_rate": 1e-06, + "loss": 0.2833, + "mean_token_accuracy": 0.8987900018692017, + "num_tokens": 622507577.0, + "step": 16317 + }, + { + "epoch": 2.075817326039944, + "ewc_loss": 0.00826925691217184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.269256795756519e-05, + "grad_norm": 4.102320671081543, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8792800903320312, + "num_tokens": 622544746.0, + "step": 16318 + }, + { + "epoch": 2.0759445363185347, + "ewc_loss": 0.008348051458597183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348051778739318e-05, + "grad_norm": 4.075202941894531, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.87738037109375, + 
"num_tokens": 622583688.0, + "step": 16319 + }, + { + "epoch": 2.0760717465971252, + "ewc_loss": 0.008276804350316525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.276804146589711e-05, + "grad_norm": 4.086755752563477, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8845889568328857, + "num_tokens": 622619911.0, + "step": 16320 + }, + { + "epoch": 2.0761989568757158, + "ewc_loss": 0.008336369879543781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.336370228789747e-05, + "grad_norm": 4.033918857574463, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8881423473358154, + "num_tokens": 622665732.0, + "step": 16321 + }, + { + "epoch": 2.0763261671543063, + "ewc_loss": 0.008278807625174522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.278807945316657e-05, + "grad_norm": 4.025045871734619, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8893041610717773, + "num_tokens": 622705746.0, + "step": 16322 + }, + { + "epoch": 2.0764533774328964, + "ewc_loss": 0.00829889066517353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298891043523327e-05, + "grad_norm": 4.092044830322266, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8858867287635803, + "num_tokens": 622745481.0, + "step": 16323 + }, + { + "epoch": 2.076580587711487, + "ewc_loss": 0.008346181362867355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346181130036712e-05, + "grad_norm": 4.063581466674805, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8882373571395874, + "num_tokens": 622786910.0, + "step": 16324 + }, + { + "epoch": 2.0767077979900774, + "ewc_loss": 0.00828272569924593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.282726048491895e-05, + "grad_norm": 4.080434322357178, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8775193691253662, + "num_tokens": 622826976.0, + "step": 16325 + }, + { + "epoch": 2.076835008268668, + "ewc_loss": 0.008311588317155838, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.311588317155838e-05, + "grad_norm": 4.194114685058594, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8844248056411743, + "num_tokens": 622858945.0, + "step": 16326 + }, + { + "epoch": 2.0769622185472585, + "ewc_loss": 0.008354936726391315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354936289833859e-05, + "grad_norm": 4.0641303062438965, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8935768604278564, + "num_tokens": 622893229.0, + "step": 16327 + }, + { + "epoch": 2.077089428825849, + "ewc_loss": 0.008233594708144665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.233595144702122e-05, + "grad_norm": 4.03490686416626, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8949075937271118, + "num_tokens": 622933701.0, + "step": 16328 + }, + { + "epoch": 2.0772166391044395, + "ewc_loss": 0.008278767578303814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.278767199954018e-05, + "grad_norm": 4.122122287750244, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8697543144226074, + "num_tokens": 622971226.0, + "step": 16329 + }, + { + "epoch": 2.07734384938303, + "ewc_loss": 0.008336680941283703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.336680912179872e-05, + "grad_norm": 4.064842700958252, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8803848028182983, + "num_tokens": 623009232.0, + "step": 16330 + }, + { + "epoch": 2.0774710596616206, + "ewc_loss": 0.008266786113381386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.266785880550742e-05, + "grad_norm": 4.0515456199646, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8789384365081787, + "num_tokens": 623048005.0, + "step": 16331 + }, + { + "epoch": 2.077598269940211, + "ewc_loss": 0.008300481364130974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.300481567857787e-05, + "grad_norm": 4.0547308921813965, + "learning_rate": 
1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8836798667907715, + "num_tokens": 623088474.0, + "step": 16332 + }, + { + "epoch": 2.0777254802188017, + "ewc_loss": 0.008291696198284626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29169584903866e-05, + "grad_norm": 4.068309783935547, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8832528591156006, + "num_tokens": 623123700.0, + "step": 16333 + }, + { + "epoch": 2.077852690497392, + "ewc_loss": 0.00830497033894062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304970106109977e-05, + "grad_norm": 4.146071434020996, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8862106800079346, + "num_tokens": 623161406.0, + "step": 16334 + }, + { + "epoch": 2.0779799007759827, + "ewc_loss": 0.008334032259881496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33403246360831e-05, + "grad_norm": 4.116995334625244, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8742400407791138, + "num_tokens": 623199770.0, + "step": 16335 + }, + { + "epoch": 2.0781071110545732, + "ewc_loss": 0.008284307084977627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.284307114081457e-05, + "grad_norm": 4.105774402618408, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8856161236763, + "num_tokens": 623234635.0, + "step": 16336 + }, + { + "epoch": 2.0782343213331638, + "ewc_loss": 0.008298100903630257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298100874526426e-05, + "grad_norm": 4.074573993682861, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8845285773277283, + "num_tokens": 623273695.0, + "step": 16337 + }, + { + "epoch": 2.0783615316117543, + "ewc_loss": 0.00827367976307869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.273679850390181e-05, + "grad_norm": 4.061200141906738, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8691691160202026, + "num_tokens": 623315278.0, + "step": 16338 + }, + { + 
"epoch": 2.078488741890345, + "ewc_loss": 0.008281784132122993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.281783811980858e-05, + "grad_norm": 4.066066265106201, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8891115784645081, + "num_tokens": 623351753.0, + "step": 16339 + }, + { + "epoch": 2.0786159521689354, + "ewc_loss": 0.008298031985759735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298031752929091e-05, + "grad_norm": 4.131502628326416, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8663991689682007, + "num_tokens": 623387812.0, + "step": 16340 + }, + { + "epoch": 2.078743162447526, + "ewc_loss": 0.008323540911078453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323541260324419e-05, + "grad_norm": 4.0437517166137695, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8873870372772217, + "num_tokens": 623426784.0, + "step": 16341 + }, + { + "epoch": 2.0788703727261164, + "ewc_loss": 0.008280470967292786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.280470501631498e-05, + "grad_norm": 4.16868257522583, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.861830472946167, + "num_tokens": 623463362.0, + "step": 16342 + }, + { + "epoch": 2.078997583004707, + "ewc_loss": 0.008383775129914284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.383775275433436e-05, + "grad_norm": 4.113862037658691, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8779447078704834, + "num_tokens": 623500108.0, + "step": 16343 + }, + { + "epoch": 2.0791247932832975, + "ewc_loss": 0.008302971720695496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30297140055336e-05, + "grad_norm": 4.084035873413086, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8799654841423035, + "num_tokens": 623537159.0, + "step": 16344 + }, + { + "epoch": 2.079252003561888, + "ewc_loss": 0.008338983170688152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.338983025169e-05, + "grad_norm": 4.0184478759765625, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.9005672931671143, + "num_tokens": 623576256.0, + "step": 16345 + }, + { + "epoch": 2.0793792138404785, + "ewc_loss": 0.008309666998684406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.309666736749932e-05, + "grad_norm": 4.11096715927124, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8807724714279175, + "num_tokens": 623612454.0, + "step": 16346 + }, + { + "epoch": 2.0795064241190686, + "ewc_loss": 0.008373106829822063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37310726637952e-05, + "grad_norm": 4.084599494934082, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8943209648132324, + "num_tokens": 623654392.0, + "step": 16347 + }, + { + "epoch": 2.079633634397659, + "ewc_loss": 0.008326816372573376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326816168846563e-05, + "grad_norm": 4.0505781173706055, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8657256960868835, + "num_tokens": 623697350.0, + "step": 16348 + }, + { + "epoch": 2.0797608446762497, + "ewc_loss": 0.008321757428348064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321757195517421e-05, + "grad_norm": 4.086463928222656, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.887474536895752, + "num_tokens": 623736272.0, + "step": 16349 + }, + { + "epoch": 2.07988805495484, + "ewc_loss": 0.008322063833475113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.322063513332978e-05, + "grad_norm": 4.038125038146973, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8742550015449524, + "num_tokens": 623781319.0, + "step": 16350 + }, + { + "epoch": 2.0800152652334307, + "ewc_loss": 0.008270694874227047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.270694524981081e-05, + "grad_norm": 4.086768627166748, + "learning_rate": 1e-06, + "loss": 0.3336, + 
"mean_token_accuracy": 0.8837644457817078, + "num_tokens": 623821944.0, + "step": 16351 + }, + { + "epoch": 2.0801424755120212, + "ewc_loss": 0.008302937261760235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.302937203552574e-05, + "grad_norm": 4.099413871765137, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.880393385887146, + "num_tokens": 623858399.0, + "step": 16352 + }, + { + "epoch": 2.0802696857906118, + "ewc_loss": 0.008298764005303383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298763714265078e-05, + "grad_norm": 4.0656328201293945, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8836325407028198, + "num_tokens": 623897839.0, + "step": 16353 + }, + { + "epoch": 2.0803968960692023, + "ewc_loss": 0.008281747810542583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.281747432192788e-05, + "grad_norm": 4.149684429168701, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8683410286903381, + "num_tokens": 623933127.0, + "step": 16354 + }, + { + "epoch": 2.080524106347793, + "ewc_loss": 0.008317767642438412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317767787957564e-05, + "grad_norm": 4.092128753662109, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8831044435501099, + "num_tokens": 623972140.0, + "step": 16355 + }, + { + "epoch": 2.0806513166263834, + "ewc_loss": 0.008249074220657349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.249074016930535e-05, + "grad_norm": 4.100057601928711, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8744359016418457, + "num_tokens": 624008875.0, + "step": 16356 + }, + { + "epoch": 2.080778526904974, + "ewc_loss": 0.00828499160706997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.284991781692952e-05, + "grad_norm": 4.133147239685059, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8806861042976379, + "num_tokens": 624045162.0, + "step": 16357 + }, + { + "epoch": 
2.0809057371835644, + "ewc_loss": 0.008304036222398281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304035873152316e-05, + "grad_norm": 4.0409159660339355, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8910281658172607, + "num_tokens": 624085275.0, + "step": 16358 + }, + { + "epoch": 2.081032947462155, + "ewc_loss": 0.00824783369898796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.247833466157317e-05, + "grad_norm": 4.061897277832031, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8805564641952515, + "num_tokens": 624127479.0, + "step": 16359 + }, + { + "epoch": 2.0811601577407455, + "ewc_loss": 0.00828142836689949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.281428745249286e-05, + "grad_norm": 4.071836471557617, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.884169340133667, + "num_tokens": 624165231.0, + "step": 16360 + }, + { + "epoch": 2.081287368019336, + "ewc_loss": 0.008264768868684769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.26476898510009e-05, + "grad_norm": 4.0791215896606445, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8719055652618408, + "num_tokens": 624205109.0, + "step": 16361 + }, + { + "epoch": 2.0814145782979265, + "ewc_loss": 0.008272778242826462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.272778359241784e-05, + "grad_norm": 4.06558895111084, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8794842958450317, + "num_tokens": 624241849.0, + "step": 16362 + }, + { + "epoch": 2.081541788576517, + "ewc_loss": 0.008261485025286674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.26148534542881e-05, + "grad_norm": 4.092761039733887, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8860020637512207, + "num_tokens": 624274174.0, + "step": 16363 + }, + { + "epoch": 2.0816689988551076, + "ewc_loss": 0.008294752798974514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.294752478832379e-05, + "grad_norm": 4.074366569519043, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8807905912399292, + "num_tokens": 624311526.0, + "step": 16364 + }, + { + "epoch": 2.081796209133698, + "ewc_loss": 0.008269445039331913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.269445243058726e-05, + "grad_norm": 4.1219682693481445, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8702054619789124, + "num_tokens": 624346657.0, + "step": 16365 + }, + { + "epoch": 2.0819234194122886, + "ewc_loss": 0.008330442011356354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33044177852571e-05, + "grad_norm": 4.066056728363037, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8673955202102661, + "num_tokens": 624386563.0, + "step": 16366 + }, + { + "epoch": 2.082050629690879, + "ewc_loss": 0.008262459188699722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.262459596153349e-05, + "grad_norm": 4.000477313995361, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8918827772140503, + "num_tokens": 624427299.0, + "step": 16367 + }, + { + "epoch": 2.0821778399694697, + "ewc_loss": 0.008270176127552986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.270176476798952e-05, + "grad_norm": 4.10512113571167, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.867332935333252, + "num_tokens": 624461253.0, + "step": 16368 + }, + { + "epoch": 2.0823050502480602, + "ewc_loss": 0.008354263380169868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354263263754547e-05, + "grad_norm": 4.085481643676758, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8919659852981567, + "num_tokens": 624498699.0, + "step": 16369 + }, + { + "epoch": 2.0824322605266508, + "ewc_loss": 0.008299386128783226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299385808641091e-05, + "grad_norm": 4.088411331176758, + "learning_rate": 1e-06, + "loss": 0.339, + 
"mean_token_accuracy": 0.8800094723701477, + "num_tokens": 624534655.0, + "step": 16370 + }, + { + "epoch": 2.082559470805241, + "ewc_loss": 0.008327147923409939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.327147952513769e-05, + "grad_norm": 4.0389556884765625, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8838810920715332, + "num_tokens": 624575342.0, + "step": 16371 + }, + { + "epoch": 2.0826866810838314, + "ewc_loss": 0.008301668800413609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.301669004140422e-05, + "grad_norm": 4.167276859283447, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8877279758453369, + "num_tokens": 624608493.0, + "step": 16372 + }, + { + "epoch": 2.082813891362422, + "ewc_loss": 0.008381791412830353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38179184938781e-05, + "grad_norm": 4.020634174346924, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8649337887763977, + "num_tokens": 624655777.0, + "step": 16373 + }, + { + "epoch": 2.0829411016410124, + "ewc_loss": 0.008271221071481705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.271221304312348e-05, + "grad_norm": 4.022800445556641, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8915930986404419, + "num_tokens": 624699349.0, + "step": 16374 + }, + { + "epoch": 2.083068311919603, + "ewc_loss": 0.008319809101521969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.319809421664104e-05, + "grad_norm": 4.138487815856934, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.890055775642395, + "num_tokens": 624730146.0, + "step": 16375 + }, + { + "epoch": 2.0831955221981935, + "ewc_loss": 0.008375498466193676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375498146051541e-05, + "grad_norm": 4.021920680999756, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8756691217422485, + "num_tokens": 624773186.0, + "step": 16376 + }, + { + "epoch": 
2.083322732476784, + "ewc_loss": 0.008268047124147415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.268047531601042e-05, + "grad_norm": 4.103580474853516, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8816603422164917, + "num_tokens": 624810256.0, + "step": 16377 + }, + { + "epoch": 2.0834499427553745, + "ewc_loss": 0.00836829375475645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.368293492821977e-05, + "grad_norm": 4.102294921875, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8727017641067505, + "num_tokens": 624848166.0, + "step": 16378 + }, + { + "epoch": 2.083577153033965, + "ewc_loss": 0.008322173729538918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.322174107888713e-05, + "grad_norm": 4.035427093505859, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8821616172790527, + "num_tokens": 624887601.0, + "step": 16379 + }, + { + "epoch": 2.0837043633125556, + "ewc_loss": 0.00826745480298996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.267454541055486e-05, + "grad_norm": 4.086841583251953, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8834857940673828, + "num_tokens": 624921160.0, + "step": 16380 + }, + { + "epoch": 2.083831573591146, + "ewc_loss": 0.008345312438905239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.345312380697578e-05, + "grad_norm": 4.109385967254639, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8758381009101868, + "num_tokens": 624959053.0, + "step": 16381 + }, + { + "epoch": 2.0839587838697367, + "ewc_loss": 0.00831662304699421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316623279824853e-05, + "grad_norm": 4.0285725593566895, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8943531513214111, + "num_tokens": 624995272.0, + "step": 16382 + }, + { + "epoch": 2.084085994148327, + "ewc_loss": 0.008264361880719662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.264362259069458e-05, + "grad_norm": 4.047891616821289, + "learning_rate": 1e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.8963233232498169, + "num_tokens": 625033388.0, + "step": 16383 + }, + { + "epoch": 2.0842132044269177, + "ewc_loss": 0.008318447507917881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318447362398729e-05, + "grad_norm": 4.13668966293335, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8725050091743469, + "num_tokens": 625071103.0, + "step": 16384 + }, + { + "epoch": 2.0843404147055082, + "ewc_loss": 0.008358784951269627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358785271411762e-05, + "grad_norm": 4.064683437347412, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8939113616943359, + "num_tokens": 625109562.0, + "step": 16385 + }, + { + "epoch": 2.0844676249840988, + "ewc_loss": 0.008265373297035694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.265372889582068e-05, + "grad_norm": 4.090164661407471, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8882746696472168, + "num_tokens": 625147438.0, + "step": 16386 + }, + { + "epoch": 2.0845948352626893, + "ewc_loss": 0.00831614714115858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316147432196885e-05, + "grad_norm": 4.075730800628662, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8899185657501221, + "num_tokens": 625184293.0, + "step": 16387 + }, + { + "epoch": 2.08472204554128, + "ewc_loss": 0.008283445611596107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.283445640699938e-05, + "grad_norm": 4.101142883300781, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8848527669906616, + "num_tokens": 625218201.0, + "step": 16388 + }, + { + "epoch": 2.0848492558198704, + "ewc_loss": 0.008315724320709705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315724699059501e-05, + "grad_norm": 4.131681442260742, + "learning_rate": 1e-06, + "loss": 0.349, + 
"mean_token_accuracy": 0.8807862401008606, + "num_tokens": 625252072.0, + "step": 16389 + }, + { + "epoch": 2.084976466098461, + "ewc_loss": 0.008316957391798496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316957246279344e-05, + "grad_norm": 4.120533466339111, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8944998979568481, + "num_tokens": 625286990.0, + "step": 16390 + }, + { + "epoch": 2.0851036763770514, + "ewc_loss": 0.00829753652215004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.297536260215566e-05, + "grad_norm": 4.041906356811523, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8915239572525024, + "num_tokens": 625326579.0, + "step": 16391 + }, + { + "epoch": 2.085230886655642, + "ewc_loss": 0.008274704217910767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.274704305222258e-05, + "grad_norm": 4.0613508224487305, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8867550492286682, + "num_tokens": 625367317.0, + "step": 16392 + }, + { + "epoch": 2.0853580969342325, + "ewc_loss": 0.008298576809465885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298576722154394e-05, + "grad_norm": 4.161398410797119, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.877670407295227, + "num_tokens": 625401047.0, + "step": 16393 + }, + { + "epoch": 2.085485307212823, + "ewc_loss": 0.008344961330294609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344961679540575e-05, + "grad_norm": 4.092556476593018, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8684446811676025, + "num_tokens": 625441878.0, + "step": 16394 + }, + { + "epoch": 2.0856125174914135, + "ewc_loss": 0.008279822766780853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.279822941403836e-05, + "grad_norm": 4.125403881072998, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8832483887672424, + "num_tokens": 625477007.0, + "step": 16395 + }, + { + "epoch": 
2.0857397277700036, + "ewc_loss": 0.008307419717311859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.307419921038672e-05, + "grad_norm": 4.030693054199219, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8938080668449402, + "num_tokens": 625519908.0, + "step": 16396 + }, + { + "epoch": 2.085866938048594, + "ewc_loss": 0.008241940289735794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.24194066808559e-05, + "grad_norm": 4.128539562225342, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8846290111541748, + "num_tokens": 625554946.0, + "step": 16397 + }, + { + "epoch": 2.0859941483271847, + "ewc_loss": 0.0083504319190979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350431744474918e-05, + "grad_norm": 4.064497947692871, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8932533860206604, + "num_tokens": 625592121.0, + "step": 16398 + }, + { + "epoch": 2.086121358605775, + "ewc_loss": 0.008261575363576412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.261575567303225e-05, + "grad_norm": 4.047211647033691, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8803816437721252, + "num_tokens": 625630596.0, + "step": 16399 + }, + { + "epoch": 2.0862485688843657, + "ewc_loss": 0.00826610904186964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.266109216492623e-05, + "grad_norm": 4.103715896606445, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8668780326843262, + "num_tokens": 625669535.0, + "step": 16400 + }, + { + "epoch": 2.0863757791629562, + "ewc_loss": 0.008321856148540974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321856148540974e-05, + "grad_norm": 4.066986083984375, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8811497688293457, + "num_tokens": 625710072.0, + "step": 16401 + }, + { + "epoch": 2.0865029894415468, + "ewc_loss": 0.00825259368866682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.252594125224277e-05, + "grad_norm": 4.062502384185791, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8891044855117798, + "num_tokens": 625746328.0, + "step": 16402 + }, + { + "epoch": 2.0866301997201373, + "ewc_loss": 0.008277217857539654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.277218148577958e-05, + "grad_norm": 4.030305862426758, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8826496601104736, + "num_tokens": 625787108.0, + "step": 16403 + }, + { + "epoch": 2.086757409998728, + "ewc_loss": 0.0082510095089674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.251009421655908e-05, + "grad_norm": 4.073335647583008, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8861340284347534, + "num_tokens": 625827887.0, + "step": 16404 + }, + { + "epoch": 2.0868846202773184, + "ewc_loss": 0.008286942727863789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286942465929314e-05, + "grad_norm": 4.1307172775268555, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8830693960189819, + "num_tokens": 625863475.0, + "step": 16405 + }, + { + "epoch": 2.087011830555909, + "ewc_loss": 0.008309571072459221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.309571421705186e-05, + "grad_norm": 4.027336597442627, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8979049921035767, + "num_tokens": 625902369.0, + "step": 16406 + }, + { + "epoch": 2.0871390408344994, + "ewc_loss": 0.008218231610953808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.218231232604012e-05, + "grad_norm": 4.24936580657959, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8595197796821594, + "num_tokens": 625941054.0, + "step": 16407 + }, + { + "epoch": 2.08726625111309, + "ewc_loss": 0.008401835337281227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401835657423362e-05, + "grad_norm": 4.068869113922119, + "learning_rate": 1e-06, + "loss": 0.4057, + 
"mean_token_accuracy": 0.8647032380104065, + "num_tokens": 625981113.0, + "step": 16408 + }, + { + "epoch": 2.0873934613916805, + "ewc_loss": 0.008215678855776787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.215678826672956e-05, + "grad_norm": 4.088627338409424, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8796547651290894, + "num_tokens": 626014758.0, + "step": 16409 + }, + { + "epoch": 2.087520671670271, + "ewc_loss": 0.00830342061817646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303420327138156e-05, + "grad_norm": 4.112521648406982, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8684406280517578, + "num_tokens": 626054442.0, + "step": 16410 + }, + { + "epoch": 2.0876478819488615, + "ewc_loss": 0.00832301564514637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323015936184675e-05, + "grad_norm": 4.119824409484863, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.867577314376831, + "num_tokens": 626093993.0, + "step": 16411 + }, + { + "epoch": 2.087775092227452, + "ewc_loss": 0.00830385647714138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303856156999245e-05, + "grad_norm": 4.115372180938721, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.865724503993988, + "num_tokens": 626131242.0, + "step": 16412 + }, + { + "epoch": 2.0879023025060426, + "ewc_loss": 0.008319037035107613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.319036714965478e-05, + "grad_norm": 4.177562713623047, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8664078116416931, + "num_tokens": 626162137.0, + "step": 16413 + }, + { + "epoch": 2.088029512784633, + "ewc_loss": 0.008363069966435432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363070082850754e-05, + "grad_norm": 4.052589416503906, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8827891945838928, + "num_tokens": 626201031.0, + "step": 16414 + }, + { + "epoch": 
2.0881567230632236, + "ewc_loss": 0.008289081044495106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.289080869872123e-05, + "grad_norm": 4.028518199920654, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8923022747039795, + "num_tokens": 626243647.0, + "step": 16415 + }, + { + "epoch": 2.088283933341814, + "ewc_loss": 0.008324269205331802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324269583681598e-05, + "grad_norm": 4.103086948394775, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8884639739990234, + "num_tokens": 626281346.0, + "step": 16416 + }, + { + "epoch": 2.0884111436204047, + "ewc_loss": 0.00836359616369009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36359613458626e-05, + "grad_norm": 4.096397399902344, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8907555937767029, + "num_tokens": 626315964.0, + "step": 16417 + }, + { + "epoch": 2.0885383538989952, + "ewc_loss": 0.008336810395121574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.336810424225405e-05, + "grad_norm": 3.9957921504974365, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8753158450126648, + "num_tokens": 626367920.0, + "step": 16418 + }, + { + "epoch": 2.0886655641775858, + "ewc_loss": 0.008261973969638348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.261973562184721e-05, + "grad_norm": 4.0255961418151855, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8791325688362122, + "num_tokens": 626412123.0, + "step": 16419 + }, + { + "epoch": 2.0887927744561763, + "ewc_loss": 0.008336146362125874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33614612929523e-05, + "grad_norm": 4.108037948608398, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8568708896636963, + "num_tokens": 626451560.0, + "step": 16420 + }, + { + "epoch": 2.0889199847347664, + "ewc_loss": 0.00834970735013485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.349707059096545e-05, + "grad_norm": 4.098984241485596, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8749213218688965, + "num_tokens": 626489361.0, + "step": 16421 + }, + { + "epoch": 2.089047195013357, + "ewc_loss": 0.008319989778101444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.319989865412936e-05, + "grad_norm": 4.031641960144043, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8957738280296326, + "num_tokens": 626527150.0, + "step": 16422 + }, + { + "epoch": 2.0891744052919474, + "ewc_loss": 0.008293849416077137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.293849532492459e-05, + "grad_norm": 4.07965087890625, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8749460577964783, + "num_tokens": 626568110.0, + "step": 16423 + }, + { + "epoch": 2.089301615570538, + "ewc_loss": 0.008342236280441284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.342236105818301e-05, + "grad_norm": 4.107683181762695, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8835469484329224, + "num_tokens": 626607215.0, + "step": 16424 + }, + { + "epoch": 2.0894288258491285, + "ewc_loss": 0.008323201909661293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323201473103836e-05, + "grad_norm": 4.084652423858643, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8875596523284912, + "num_tokens": 626644424.0, + "step": 16425 + }, + { + "epoch": 2.089556036127719, + "ewc_loss": 0.008300591260194778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.300591434817761e-05, + "grad_norm": 4.066171646118164, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8613483905792236, + "num_tokens": 626689509.0, + "step": 16426 + }, + { + "epoch": 2.0896832464063095, + "ewc_loss": 0.008291280828416348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.291281119454652e-05, + "grad_norm": 4.025418758392334, + "learning_rate": 1e-06, + "loss": 0.3692, + 
"mean_token_accuracy": 0.8732286691665649, + "num_tokens": 626733866.0, + "step": 16427 + }, + { + "epoch": 2.0898104566849, + "ewc_loss": 0.008258170448243618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.258170419139788e-05, + "grad_norm": 4.123876094818115, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8896238803863525, + "num_tokens": 626767465.0, + "step": 16428 + }, + { + "epoch": 2.0899376669634906, + "ewc_loss": 0.008320351131260395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32035148050636e-05, + "grad_norm": 4.095553874969482, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8676772117614746, + "num_tokens": 626806049.0, + "step": 16429 + }, + { + "epoch": 2.090064877242081, + "ewc_loss": 0.008286012336611748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28601187095046e-05, + "grad_norm": 4.076303482055664, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8707751035690308, + "num_tokens": 626848917.0, + "step": 16430 + }, + { + "epoch": 2.0901920875206716, + "ewc_loss": 0.00827348418533802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.27348412713036e-05, + "grad_norm": 4.172324180603027, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8687126636505127, + "num_tokens": 626884442.0, + "step": 16431 + }, + { + "epoch": 2.090319297799262, + "ewc_loss": 0.008324709720909595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324709779117256e-05, + "grad_norm": 4.102921962738037, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8781814575195312, + "num_tokens": 626922517.0, + "step": 16432 + }, + { + "epoch": 2.0904465080778527, + "ewc_loss": 0.008265266194939613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.265265933005139e-05, + "grad_norm": 4.018797874450684, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.881284236907959, + "num_tokens": 626962880.0, + "step": 16433 + }, + { + "epoch": 
2.0905737183564432, + "ewc_loss": 0.008248263038694859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.248263475252315e-05, + "grad_norm": 4.0081586837768555, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8888506293296814, + "num_tokens": 627008115.0, + "step": 16434 + }, + { + "epoch": 2.0907009286350338, + "ewc_loss": 0.008273312821984291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.273313142126426e-05, + "grad_norm": 4.075511932373047, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8929915428161621, + "num_tokens": 627043935.0, + "step": 16435 + }, + { + "epoch": 2.0908281389136243, + "ewc_loss": 0.008318651467561722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318651089211926e-05, + "grad_norm": 4.106184482574463, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8838613033294678, + "num_tokens": 627078495.0, + "step": 16436 + }, + { + "epoch": 2.090955349192215, + "ewc_loss": 0.008298560045659542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298559987451881e-05, + "grad_norm": 4.092165946960449, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8859463334083557, + "num_tokens": 627116652.0, + "step": 16437 + }, + { + "epoch": 2.0910825594708053, + "ewc_loss": 0.008294387720525265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.294387953355908e-05, + "grad_norm": 4.129419326782227, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8902561068534851, + "num_tokens": 627152326.0, + "step": 16438 + }, + { + "epoch": 2.091209769749396, + "ewc_loss": 0.008315535262227058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315535524161533e-05, + "grad_norm": 4.0392913818359375, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8895093202590942, + "num_tokens": 627189502.0, + "step": 16439 + }, + { + "epoch": 2.0913369800279864, + "ewc_loss": 0.008256125263869762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.25612514745444e-05, + "grad_norm": 4.075046539306641, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8973845839500427, + "num_tokens": 627225438.0, + "step": 16440 + }, + { + "epoch": 2.091464190306577, + "ewc_loss": 0.008309983648359776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30998396850191e-05, + "grad_norm": 4.1646294593811035, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8789033889770508, + "num_tokens": 627258154.0, + "step": 16441 + }, + { + "epoch": 2.0915914005851675, + "ewc_loss": 0.008346067741513252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34606762509793e-05, + "grad_norm": 4.0665483474731445, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8759109377861023, + "num_tokens": 627301804.0, + "step": 16442 + }, + { + "epoch": 2.091718610863758, + "ewc_loss": 0.00826303567737341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.263035851996392e-05, + "grad_norm": 4.009863376617432, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8858870267868042, + "num_tokens": 627344301.0, + "step": 16443 + }, + { + "epoch": 2.0918458211423485, + "ewc_loss": 0.008289097808301449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.289097604574636e-05, + "grad_norm": 4.1081624031066895, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8901508450508118, + "num_tokens": 627381068.0, + "step": 16444 + }, + { + "epoch": 2.0919730314209386, + "ewc_loss": 0.008352546021342278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3525461377576e-05, + "grad_norm": 4.075416088104248, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8754161596298218, + "num_tokens": 627423410.0, + "step": 16445 + }, + { + "epoch": 2.092100241699529, + "ewc_loss": 0.00828665029257536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286650700028986e-05, + "grad_norm": 4.083967208862305, + "learning_rate": 1e-06, + "loss": 0.2996, + 
"mean_token_accuracy": 0.892357349395752, + "num_tokens": 627456593.0, + "step": 16446 + }, + { + "epoch": 2.0922274519781197, + "ewc_loss": 0.008321275934576988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321276254719123e-05, + "grad_norm": 4.089354991912842, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8834408521652222, + "num_tokens": 627493742.0, + "step": 16447 + }, + { + "epoch": 2.09235466225671, + "ewc_loss": 0.008325979113578796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.325979433720931e-05, + "grad_norm": 4.085934162139893, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8887239694595337, + "num_tokens": 627529809.0, + "step": 16448 + }, + { + "epoch": 2.0924818725353007, + "ewc_loss": 0.008330093696713448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330093987751752e-05, + "grad_norm": 4.053704738616943, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8840699195861816, + "num_tokens": 627570693.0, + "step": 16449 + }, + { + "epoch": 2.0926090828138912, + "ewc_loss": 0.008319300599396229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.319300832226872e-05, + "grad_norm": 4.111314296722412, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8884499073028564, + "num_tokens": 627607608.0, + "step": 16450 + }, + { + "epoch": 2.0927362930924818, + "ewc_loss": 0.008342541754245758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.342541696038097e-05, + "grad_norm": 4.103582859039307, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8808280229568481, + "num_tokens": 627642102.0, + "step": 16451 + }, + { + "epoch": 2.0928635033710723, + "ewc_loss": 0.008353141136467457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35314131109044e-05, + "grad_norm": 4.098874092102051, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8777511119842529, + "num_tokens": 627678838.0, + "step": 16452 + }, + { + "epoch": 
2.092990713649663, + "ewc_loss": 0.008350403979420662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350404095835984e-05, + "grad_norm": 4.065429210662842, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8843129873275757, + "num_tokens": 627714952.0, + "step": 16453 + }, + { + "epoch": 2.0931179239282534, + "ewc_loss": 0.00833137333393097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331373101100326e-05, + "grad_norm": 4.113470077514648, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.873765230178833, + "num_tokens": 627754710.0, + "step": 16454 + }, + { + "epoch": 2.093245134206844, + "ewc_loss": 0.008365986868739128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36598701425828e-05, + "grad_norm": 4.106375217437744, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8860561847686768, + "num_tokens": 627791504.0, + "step": 16455 + }, + { + "epoch": 2.0933723444854344, + "ewc_loss": 0.008340991102159023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.340991189470515e-05, + "grad_norm": 4.077939510345459, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8944634795188904, + "num_tokens": 627829683.0, + "step": 16456 + }, + { + "epoch": 2.093499554764025, + "ewc_loss": 0.008332032710313797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.332033030455932e-05, + "grad_norm": 4.068436145782471, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8836734294891357, + "num_tokens": 627869826.0, + "step": 16457 + }, + { + "epoch": 2.0936267650426155, + "ewc_loss": 0.008343547582626343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343547233380377e-05, + "grad_norm": 4.11191463470459, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8799120187759399, + "num_tokens": 627904165.0, + "step": 16458 + }, + { + "epoch": 2.093753975321206, + "ewc_loss": 0.00836194958537817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36194958537817e-05, 
+ "grad_norm": 4.1359453201293945, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8866016268730164, + "num_tokens": 627936559.0, + "step": 16459 + }, + { + "epoch": 2.0938811855997965, + "ewc_loss": 0.008345677517354488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34567763376981e-05, + "grad_norm": 4.1103715896606445, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8864078521728516, + "num_tokens": 627971742.0, + "step": 16460 + }, + { + "epoch": 2.094008395878387, + "ewc_loss": 0.008335372433066368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.335372695000842e-05, + "grad_norm": 4.063814640045166, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8906627893447876, + "num_tokens": 628013104.0, + "step": 16461 + }, + { + "epoch": 2.0941356061569776, + "ewc_loss": 0.0083086546510458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.308654651045799e-05, + "grad_norm": 4.093028545379639, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8792944550514221, + "num_tokens": 628049378.0, + "step": 16462 + }, + { + "epoch": 2.094262816435568, + "ewc_loss": 0.0083430465310812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343046647496521e-05, + "grad_norm": 4.0235161781311035, + "learning_rate": 1e-06, + "loss": 0.2888, + "mean_token_accuracy": 0.8998657464981079, + "num_tokens": 628087213.0, + "step": 16463 + }, + { + "epoch": 2.0943900267141586, + "ewc_loss": 0.008293855004012585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29385535325855e-05, + "grad_norm": 4.1078314781188965, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8891420364379883, + "num_tokens": 628122060.0, + "step": 16464 + }, + { + "epoch": 2.094517236992749, + "ewc_loss": 0.00835630763322115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356307807844132e-05, + "grad_norm": 4.052231788635254, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8856076598167419, 
+ "num_tokens": 628166554.0, + "step": 16465 + }, + { + "epoch": 2.0946444472713397, + "ewc_loss": 0.008298917673528194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298917236970738e-05, + "grad_norm": 4.1750078201293945, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8814408779144287, + "num_tokens": 628199953.0, + "step": 16466 + }, + { + "epoch": 2.09477165754993, + "ewc_loss": 0.008386596105992794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.386596164200455e-05, + "grad_norm": 4.088253498077393, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8853636384010315, + "num_tokens": 628237044.0, + "step": 16467 + }, + { + "epoch": 2.0948988678285207, + "ewc_loss": 0.00830013770610094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.300137415062636e-05, + "grad_norm": 4.10488748550415, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8841246962547302, + "num_tokens": 628275773.0, + "step": 16468 + }, + { + "epoch": 2.095026078107111, + "ewc_loss": 0.008328555151820183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328555122716352e-05, + "grad_norm": 4.1098809242248535, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8686548471450806, + "num_tokens": 628314549.0, + "step": 16469 + }, + { + "epoch": 2.0951532883857014, + "ewc_loss": 0.008349604904651642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.349605195689946e-05, + "grad_norm": 4.123295307159424, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8795034885406494, + "num_tokens": 628349417.0, + "step": 16470 + }, + { + "epoch": 2.095280498664292, + "ewc_loss": 0.008337665349245071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.337665349245071e-05, + "grad_norm": 4.089575290679932, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.885995090007782, + "num_tokens": 628384435.0, + "step": 16471 + }, + { + "epoch": 2.0954077089428824, + "ewc_loss": 0.008315018378198147, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315018203575164e-05, + "grad_norm": 4.117515563964844, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8702951669692993, + "num_tokens": 628424124.0, + "step": 16472 + }, + { + "epoch": 2.095534919221473, + "ewc_loss": 0.008347882889211178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.347882976522669e-05, + "grad_norm": 4.047694683074951, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8829128742218018, + "num_tokens": 628462412.0, + "step": 16473 + }, + { + "epoch": 2.0956621295000635, + "ewc_loss": 0.008307285606861115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30728531582281e-05, + "grad_norm": 4.097967624664307, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8640146255493164, + "num_tokens": 628499066.0, + "step": 16474 + }, + { + "epoch": 2.095789339778654, + "ewc_loss": 0.008367563597857952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.367563714273274e-05, + "grad_norm": 4.1119866371154785, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8949487805366516, + "num_tokens": 628532379.0, + "step": 16475 + }, + { + "epoch": 2.0959165500572445, + "ewc_loss": 0.008358942344784737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358942432096228e-05, + "grad_norm": 4.0408782958984375, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8911578059196472, + "num_tokens": 628570497.0, + "step": 16476 + }, + { + "epoch": 2.096043760335835, + "ewc_loss": 0.008320348337292671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.320348570123315e-05, + "grad_norm": 4.067711353302002, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8883600234985352, + "num_tokens": 628608298.0, + "step": 16477 + }, + { + "epoch": 2.0961709706144256, + "ewc_loss": 0.00837322324514389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373223681701347e-05, + "grad_norm": 4.0942559242248535, + 
"learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8995221853256226, + "num_tokens": 628641217.0, + "step": 16478 + }, + { + "epoch": 2.096298180893016, + "ewc_loss": 0.00837802141904831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37802144815214e-05, + "grad_norm": 4.0498199462890625, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8832281827926636, + "num_tokens": 628682480.0, + "step": 16479 + }, + { + "epoch": 2.0964253911716066, + "ewc_loss": 0.008322023786604404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3220234955661e-05, + "grad_norm": 4.084033966064453, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8802286386489868, + "num_tokens": 628718359.0, + "step": 16480 + }, + { + "epoch": 2.096552601450197, + "ewc_loss": 0.00836364459246397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363644883502275e-05, + "grad_norm": 4.098567962646484, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8882529735565186, + "num_tokens": 628752214.0, + "step": 16481 + }, + { + "epoch": 2.0966798117287877, + "ewc_loss": 0.008371616713702679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371616422664374e-05, + "grad_norm": 4.09041690826416, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8956706523895264, + "num_tokens": 628787882.0, + "step": 16482 + }, + { + "epoch": 2.0968070220073782, + "ewc_loss": 0.008357377722859383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357377373613417e-05, + "grad_norm": 4.112699508666992, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8798282742500305, + "num_tokens": 628824910.0, + "step": 16483 + }, + { + "epoch": 2.0969342322859688, + "ewc_loss": 0.008381171151995659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38117121020332e-05, + "grad_norm": 4.051383972167969, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8943195343017578, + "num_tokens": 628860657.0, + 
"step": 16484 + }, + { + "epoch": 2.0970614425645593, + "ewc_loss": 0.008334442973136902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33444282761775e-05, + "grad_norm": 4.07416296005249, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.867761492729187, + "num_tokens": 628900619.0, + "step": 16485 + }, + { + "epoch": 2.09718865284315, + "ewc_loss": 0.008377187885344028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377187623409554e-05, + "grad_norm": 4.062509536743164, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.894926905632019, + "num_tokens": 628944304.0, + "step": 16486 + }, + { + "epoch": 2.0973158631217403, + "ewc_loss": 0.00836039800196886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360398351214826e-05, + "grad_norm": 4.060774803161621, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8852936029434204, + "num_tokens": 628980490.0, + "step": 16487 + }, + { + "epoch": 2.097443073400331, + "ewc_loss": 0.008351380936801434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.351381256943569e-05, + "grad_norm": 4.124218463897705, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8871701955795288, + "num_tokens": 629011924.0, + "step": 16488 + }, + { + "epoch": 2.0975702836789214, + "ewc_loss": 0.008386089466512203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.386089029954746e-05, + "grad_norm": 4.125683307647705, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8601961731910706, + "num_tokens": 629049514.0, + "step": 16489 + }, + { + "epoch": 2.097697493957512, + "ewc_loss": 0.008372711017727852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.372711454285309e-05, + "grad_norm": 4.065230369567871, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.9002752304077148, + "num_tokens": 629085374.0, + "step": 16490 + }, + { + "epoch": 2.0978247042361025, + "ewc_loss": 0.008349994197487831, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.349994459422305e-05, + "grad_norm": 4.075127124786377, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8799024224281311, + "num_tokens": 629126307.0, + "step": 16491 + }, + { + "epoch": 2.097951914514693, + "ewc_loss": 0.008371936157345772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371936564799398e-05, + "grad_norm": 4.1225056648254395, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.883309543132782, + "num_tokens": 629159693.0, + "step": 16492 + }, + { + "epoch": 2.0980791247932835, + "ewc_loss": 0.008393485099077225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.393485040869564e-05, + "grad_norm": 4.063706874847412, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8891751170158386, + "num_tokens": 629198544.0, + "step": 16493 + }, + { + "epoch": 2.0982063350718736, + "ewc_loss": 0.008340221829712391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.340221393154934e-05, + "grad_norm": 4.052102088928223, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.874861478805542, + "num_tokens": 629240956.0, + "step": 16494 + }, + { + "epoch": 2.098333545350464, + "ewc_loss": 0.008353235200047493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.353235170943663e-05, + "grad_norm": 4.112398624420166, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8849372863769531, + "num_tokens": 629276196.0, + "step": 16495 + }, + { + "epoch": 2.0984607556290547, + "ewc_loss": 0.008373774588108063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373774471692741e-05, + "grad_norm": 4.129764556884766, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8711509108543396, + "num_tokens": 629311398.0, + "step": 16496 + }, + { + "epoch": 2.098587965907645, + "ewc_loss": 0.008367960341274738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.367960253963247e-05, + "grad_norm": 4.119017601013184, + "learning_rate": 1e-06, + "loss": 
0.3545, + "mean_token_accuracy": 0.8780133128166199, + "num_tokens": 629349488.0, + "step": 16497 + }, + { + "epoch": 2.0987151761862357, + "ewc_loss": 0.0083555206656456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355520549230278e-05, + "grad_norm": 4.066843032836914, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8934447765350342, + "num_tokens": 629388023.0, + "step": 16498 + }, + { + "epoch": 2.0988423864648262, + "ewc_loss": 0.008337246254086494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.337246254086494e-05, + "grad_norm": 4.060428619384766, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8809806108474731, + "num_tokens": 629432140.0, + "step": 16499 + }, + { + "epoch": 2.0989695967434168, + "ewc_loss": 0.008333083242177963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333082951139659e-05, + "grad_norm": 3.9999969005584717, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8702422380447388, + "num_tokens": 629480690.0, + "step": 16500 + }, + { + "epoch": 2.0990968070220073, + "ewc_loss": 0.008298870176076889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298869943246245e-05, + "grad_norm": 4.024701118469238, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8995969891548157, + "num_tokens": 629522500.0, + "step": 16501 + }, + { + "epoch": 2.099224017300598, + "ewc_loss": 0.00835184846073389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.351848373422399e-05, + "grad_norm": 4.057438850402832, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8844859600067139, + "num_tokens": 629563450.0, + "step": 16502 + }, + { + "epoch": 2.0993512275791884, + "ewc_loss": 0.008343331515789032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343331865034997e-05, + "grad_norm": 4.042281150817871, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8905669450759888, + "num_tokens": 629603753.0, + "step": 16503 + }, + { + "epoch": 
2.099478437857779, + "ewc_loss": 0.008306684903800488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30668504931964e-05, + "grad_norm": 4.14331579208374, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8854503035545349, + "num_tokens": 629637828.0, + "step": 16504 + }, + { + "epoch": 2.0996056481363694, + "ewc_loss": 0.00835143867880106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35143873700872e-05, + "grad_norm": 4.075475215911865, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8905572891235352, + "num_tokens": 629674636.0, + "step": 16505 + }, + { + "epoch": 2.09973285841496, + "ewc_loss": 0.008268795907497406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.268796227639541e-05, + "grad_norm": 4.059781074523926, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8737025260925293, + "num_tokens": 629717046.0, + "step": 16506 + }, + { + "epoch": 2.0998600686935505, + "ewc_loss": 0.008282473310828209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.282472845166922e-05, + "grad_norm": 4.125341415405273, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8920927047729492, + "num_tokens": 629750331.0, + "step": 16507 + }, + { + "epoch": 2.099987278972141, + "ewc_loss": 0.008320970460772514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.320970664499328e-05, + "grad_norm": 4.086486339569092, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8626495599746704, + "num_tokens": 629790095.0, + "step": 16508 + }, + { + "epoch": 2.1001144892507315, + "ewc_loss": 0.00827012863010168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.270128455478698e-05, + "grad_norm": 4.075440883636475, + "learning_rate": 1e-06, + "loss": 0.2867, + "mean_token_accuracy": 0.8974167108535767, + "num_tokens": 629823035.0, + "step": 16509 + }, + { + "epoch": 2.100241699529322, + "ewc_loss": 0.008268888108432293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.26888790470548e-05, 
+ "grad_norm": 4.117978096008301, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.878862738609314, + "num_tokens": 629857096.0, + "step": 16510 + }, + { + "epoch": 2.1003689098079126, + "ewc_loss": 0.008298411965370178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298411557916552e-05, + "grad_norm": 4.115907669067383, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.883030354976654, + "num_tokens": 629893448.0, + "step": 16511 + }, + { + "epoch": 2.100496120086503, + "ewc_loss": 0.008275536820292473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.27553667477332e-05, + "grad_norm": 4.102160453796387, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.880305290222168, + "num_tokens": 629931301.0, + "step": 16512 + }, + { + "epoch": 2.1006233303650936, + "ewc_loss": 0.008282588794827461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.282588532892987e-05, + "grad_norm": 4.08853816986084, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8908218145370483, + "num_tokens": 629963481.0, + "step": 16513 + }, + { + "epoch": 2.100750540643684, + "ewc_loss": 0.008294526487588882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.294526924146339e-05, + "grad_norm": 4.089385986328125, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8795124888420105, + "num_tokens": 630002739.0, + "step": 16514 + }, + { + "epoch": 2.1008777509222747, + "ewc_loss": 0.008294899016618729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.294898725580424e-05, + "grad_norm": 4.055558204650879, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8765869140625, + "num_tokens": 630041206.0, + "step": 16515 + }, + { + "epoch": 2.101004961200865, + "ewc_loss": 0.008292743936181068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.292744314530864e-05, + "grad_norm": 4.108479976654053, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8702318668365479, + 
"num_tokens": 630083811.0, + "step": 16516 + }, + { + "epoch": 2.1011321714794557, + "ewc_loss": 0.00832399632781744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323996735271066e-05, + "grad_norm": 4.081794261932373, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.881163477897644, + "num_tokens": 630119823.0, + "step": 16517 + }, + { + "epoch": 2.1012593817580463, + "ewc_loss": 0.008302800357341766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.302800415549427e-05, + "grad_norm": 4.005591869354248, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8831151723861694, + "num_tokens": 630163807.0, + "step": 16518 + }, + { + "epoch": 2.1013865920366364, + "ewc_loss": 0.00828450545668602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.284505020128563e-05, + "grad_norm": 4.111945629119873, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8783935308456421, + "num_tokens": 630203038.0, + "step": 16519 + }, + { + "epoch": 2.101513802315227, + "ewc_loss": 0.008378435857594013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378435450140387e-05, + "grad_norm": 4.08151912689209, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8859451413154602, + "num_tokens": 630238505.0, + "step": 16520 + }, + { + "epoch": 2.1016410125938174, + "ewc_loss": 0.008299765177071095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299765613628551e-05, + "grad_norm": 4.0616044998168945, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8822377920150757, + "num_tokens": 630275279.0, + "step": 16521 + }, + { + "epoch": 2.101768222872408, + "ewc_loss": 0.008301371708512306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.301371417474002e-05, + "grad_norm": 4.061714172363281, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8782944679260254, + "num_tokens": 630317288.0, + "step": 16522 + }, + { + "epoch": 2.1018954331509985, + "ewc_loss": 0.008334058336913586, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.334057929459959e-05, + "grad_norm": 4.1061177253723145, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8831101655960083, + "num_tokens": 630352786.0, + "step": 16523 + }, + { + "epoch": 2.102022643429589, + "ewc_loss": 0.008346331305801868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346331014763564e-05, + "grad_norm": 4.056180477142334, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.883601188659668, + "num_tokens": 630391138.0, + "step": 16524 + }, + { + "epoch": 2.1021498537081795, + "ewc_loss": 0.008298877626657486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298877946799621e-05, + "grad_norm": 4.168429374694824, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8715623021125793, + "num_tokens": 630426567.0, + "step": 16525 + }, + { + "epoch": 2.10227706398677, + "ewc_loss": 0.008393712341785431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.393712778342888e-05, + "grad_norm": 4.101242542266846, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8725127577781677, + "num_tokens": 630467308.0, + "step": 16526 + }, + { + "epoch": 2.1024042742653606, + "ewc_loss": 0.008308815769851208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.308816177304834e-05, + "grad_norm": 4.050678253173828, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8934750556945801, + "num_tokens": 630503333.0, + "step": 16527 + }, + { + "epoch": 2.102531484543951, + "ewc_loss": 0.008319754153490067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.319754124386236e-05, + "grad_norm": 4.129907608032227, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8789118528366089, + "num_tokens": 630536838.0, + "step": 16528 + }, + { + "epoch": 2.1026586948225416, + "ewc_loss": 0.008383520878851414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38352061691694e-05, + "grad_norm": 4.081382751464844, + "learning_rate": 
1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.895339846611023, + "num_tokens": 630570833.0, + "step": 16529 + }, + { + "epoch": 2.102785905101132, + "ewc_loss": 0.008329483680427074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.329483534907922e-05, + "grad_norm": 4.014348983764648, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8904430270195007, + "num_tokens": 630615043.0, + "step": 16530 + }, + { + "epoch": 2.1029131153797227, + "ewc_loss": 0.008315091021358967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315090963151306e-05, + "grad_norm": 4.117271423339844, + "learning_rate": 1e-06, + "loss": 0.2838, + "mean_token_accuracy": 0.9031919240951538, + "num_tokens": 630651368.0, + "step": 16531 + }, + { + "epoch": 2.1030403256583132, + "ewc_loss": 0.00839939620345831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399396028835326e-05, + "grad_norm": 4.095310688018799, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.876388669013977, + "num_tokens": 630690341.0, + "step": 16532 + }, + { + "epoch": 2.1031675359369038, + "ewc_loss": 0.00832841545343399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32841542433016e-05, + "grad_norm": 4.060762405395508, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8808543086051941, + "num_tokens": 630727168.0, + "step": 16533 + }, + { + "epoch": 2.1032947462154943, + "ewc_loss": 0.008315411396324635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31541110528633e-05, + "grad_norm": 4.043572902679443, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.887252688407898, + "num_tokens": 630766737.0, + "step": 16534 + }, + { + "epoch": 2.103421956494085, + "ewc_loss": 0.008330143056809902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330143464263529e-05, + "grad_norm": 4.078651428222656, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8810757398605347, + "num_tokens": 630803721.0, + "step": 16535 + }, + { 
+ "epoch": 2.1035491667726753, + "ewc_loss": 0.008351392112672329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35139217087999e-05, + "grad_norm": 4.073027610778809, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8886740803718567, + "num_tokens": 630843697.0, + "step": 16536 + }, + { + "epoch": 2.103676377051266, + "ewc_loss": 0.008315524086356163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31552388262935e-05, + "grad_norm": 4.142703056335449, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8841519355773926, + "num_tokens": 630874325.0, + "step": 16537 + }, + { + "epoch": 2.1038035873298564, + "ewc_loss": 0.008369514718651772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369514398509637e-05, + "grad_norm": 4.109498500823975, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8797442317008972, + "num_tokens": 630911456.0, + "step": 16538 + }, + { + "epoch": 2.103930797608447, + "ewc_loss": 0.008306281641125679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.306281961267814e-05, + "grad_norm": 4.065047740936279, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8815668821334839, + "num_tokens": 630954418.0, + "step": 16539 + }, + { + "epoch": 2.1040580078870375, + "ewc_loss": 0.008308511227369308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3085113146808e-05, + "grad_norm": 4.03555965423584, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8821976184844971, + "num_tokens": 630991826.0, + "step": 16540 + }, + { + "epoch": 2.104185218165628, + "ewc_loss": 0.008309903554618359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.309903205372393e-05, + "grad_norm": 4.1730852127075195, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8833720684051514, + "num_tokens": 631025831.0, + "step": 16541 + }, + { + "epoch": 2.1043124284442185, + "ewc_loss": 0.008386876434087753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.386876288568601e-05, + "grad_norm": 4.082873821258545, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8689633011817932, + "num_tokens": 631062896.0, + "step": 16542 + }, + { + "epoch": 2.1044396387228086, + "ewc_loss": 0.00828014686703682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.280146721517667e-05, + "grad_norm": 4.145244121551514, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8884903192520142, + "num_tokens": 631099291.0, + "step": 16543 + }, + { + "epoch": 2.104566849001399, + "ewc_loss": 0.00837483536452055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.374835306312889e-05, + "grad_norm": 4.1199774742126465, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8752869367599487, + "num_tokens": 631141570.0, + "step": 16544 + }, + { + "epoch": 2.1046940592799896, + "ewc_loss": 0.00834587961435318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.345879905391484e-05, + "grad_norm": 4.10600471496582, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8757587671279907, + "num_tokens": 631178882.0, + "step": 16545 + }, + { + "epoch": 2.10482126955858, + "ewc_loss": 0.008329659700393677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.329659613082185e-05, + "grad_norm": 4.062231540679932, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8802031874656677, + "num_tokens": 631218059.0, + "step": 16546 + }, + { + "epoch": 2.1049484798371707, + "ewc_loss": 0.008319553919136524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.319554035551846e-05, + "grad_norm": 4.156186103820801, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8840744495391846, + "num_tokens": 631252827.0, + "step": 16547 + }, + { + "epoch": 2.1050756901157612, + "ewc_loss": 0.008381295949220657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.381296356674284e-05, + "grad_norm": 4.015982151031494, + "learning_rate": 1e-06, + "loss": 0.3121, + 
"mean_token_accuracy": 0.8891153335571289, + "num_tokens": 631294370.0, + "step": 16548 + }, + { + "epoch": 2.1052029003943518, + "ewc_loss": 0.00827445276081562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.274452557088807e-05, + "grad_norm": 4.080350399017334, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8790806531906128, + "num_tokens": 631331091.0, + "step": 16549 + }, + { + "epoch": 2.1053301106729423, + "ewc_loss": 0.0083809494972229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.380949293496087e-05, + "grad_norm": 4.081573009490967, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8857381343841553, + "num_tokens": 631367285.0, + "step": 16550 + }, + { + "epoch": 2.105457320951533, + "ewc_loss": 0.008343187160789967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343187073478475e-05, + "grad_norm": 4.118650436401367, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8690075874328613, + "num_tokens": 631402471.0, + "step": 16551 + }, + { + "epoch": 2.1055845312301233, + "ewc_loss": 0.008360813371837139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360813080798835e-05, + "grad_norm": 4.0670928955078125, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.882623016834259, + "num_tokens": 631439706.0, + "step": 16552 + }, + { + "epoch": 2.105711741508714, + "ewc_loss": 0.008358404040336609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358404011232778e-05, + "grad_norm": 4.135098457336426, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8883689641952515, + "num_tokens": 631471437.0, + "step": 16553 + }, + { + "epoch": 2.1058389517873044, + "ewc_loss": 0.008406980894505978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.406980487052351e-05, + "grad_norm": 4.008131980895996, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8809857368469238, + "num_tokens": 631515028.0, + "step": 16554 + }, + { + "epoch": 
2.105966162065895, + "ewc_loss": 0.008301069028675556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.301068737637252e-05, + "grad_norm": 4.084110736846924, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8809300065040588, + "num_tokens": 631552946.0, + "step": 16555 + }, + { + "epoch": 2.1060933723444855, + "ewc_loss": 0.008401656523346901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401656668866053e-05, + "grad_norm": 4.0518951416015625, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8861229419708252, + "num_tokens": 631596045.0, + "step": 16556 + }, + { + "epoch": 2.106220582623076, + "ewc_loss": 0.00833376869559288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333769073942676e-05, + "grad_norm": 4.075416564941406, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8899198770523071, + "num_tokens": 631630233.0, + "step": 16557 + }, + { + "epoch": 2.1063477929016665, + "ewc_loss": 0.008373471908271313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373471791855991e-05, + "grad_norm": 4.069910526275635, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.887782096862793, + "num_tokens": 631666665.0, + "step": 16558 + }, + { + "epoch": 2.106475003180257, + "ewc_loss": 0.008342758752405643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.342758519575e-05, + "grad_norm": 4.0382080078125, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8792348504066467, + "num_tokens": 631709536.0, + "step": 16559 + }, + { + "epoch": 2.1066022134588476, + "ewc_loss": 0.008336825296282768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.336824976140633e-05, + "grad_norm": 4.115836143493652, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.897908627986908, + "num_tokens": 631744333.0, + "step": 16560 + }, + { + "epoch": 2.106729423737438, + "ewc_loss": 0.008368567563593388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.368567796424031e-05, + 
"grad_norm": 4.103259086608887, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8691913485527039, + "num_tokens": 631783522.0, + "step": 16561 + }, + { + "epoch": 2.1068566340160286, + "ewc_loss": 0.008333693258464336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333693403983489e-05, + "grad_norm": 4.105707168579102, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8691003918647766, + "num_tokens": 631822579.0, + "step": 16562 + }, + { + "epoch": 2.106983844294619, + "ewc_loss": 0.008317791856825352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317791798617691e-05, + "grad_norm": 4.029547691345215, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8888274431228638, + "num_tokens": 631862642.0, + "step": 16563 + }, + { + "epoch": 2.1071110545732097, + "ewc_loss": 0.008293603546917439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.2936036051251e-05, + "grad_norm": 4.047844886779785, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8923090100288391, + "num_tokens": 631903161.0, + "step": 16564 + }, + { + "epoch": 2.1072382648518, + "ewc_loss": 0.008326481096446514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32648147479631e-05, + "grad_norm": 4.16972541809082, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8859339952468872, + "num_tokens": 631938149.0, + "step": 16565 + }, + { + "epoch": 2.1073654751303907, + "ewc_loss": 0.008381124585866928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.381124644074589e-05, + "grad_norm": 4.081124305725098, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8868945240974426, + "num_tokens": 631974717.0, + "step": 16566 + }, + { + "epoch": 2.107492685408981, + "ewc_loss": 0.008284468203783035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.284467912744731e-05, + "grad_norm": 4.073907852172852, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8870241641998291, + 
"num_tokens": 632011147.0, + "step": 16567 + }, + { + "epoch": 2.1076198956875714, + "ewc_loss": 0.008324362337589264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324361988343298e-05, + "grad_norm": 4.057456970214844, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8883675336837769, + "num_tokens": 632051915.0, + "step": 16568 + }, + { + "epoch": 2.107747105966162, + "ewc_loss": 0.0083029018715024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.302902278956026e-05, + "grad_norm": 4.179941177368164, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8753971457481384, + "num_tokens": 632083047.0, + "step": 16569 + }, + { + "epoch": 2.1078743162447524, + "ewc_loss": 0.008404692634940147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40469219838269e-05, + "grad_norm": 4.103670597076416, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8961174488067627, + "num_tokens": 632119250.0, + "step": 16570 + }, + { + "epoch": 2.108001526523343, + "ewc_loss": 0.00829747412353754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.297473686980084e-05, + "grad_norm": 4.125680446624756, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8895329236984253, + "num_tokens": 632152696.0, + "step": 16571 + }, + { + "epoch": 2.1081287368019335, + "ewc_loss": 0.00835912674665451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.359126513823867e-05, + "grad_norm": 4.109292507171631, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.9010618925094604, + "num_tokens": 632188167.0, + "step": 16572 + }, + { + "epoch": 2.108255947080524, + "ewc_loss": 0.008335184305906296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.335184247698635e-05, + "grad_norm": 4.072227478027344, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.881184995174408, + "num_tokens": 632229567.0, + "step": 16573 + }, + { + "epoch": 2.1083831573591145, + "ewc_loss": 0.008303801529109478, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303801587317139e-05, + "grad_norm": 4.0569634437561035, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8918004631996155, + "num_tokens": 632266870.0, + "step": 16574 + }, + { + "epoch": 2.108510367637705, + "ewc_loss": 0.008331859484314919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331859135068953e-05, + "grad_norm": 4.111738681793213, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8829312324523926, + "num_tokens": 632303844.0, + "step": 16575 + }, + { + "epoch": 2.1086375779162956, + "ewc_loss": 0.008348410949110985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348411211045459e-05, + "grad_norm": 4.08644437789917, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8835892677307129, + "num_tokens": 632339343.0, + "step": 16576 + }, + { + "epoch": 2.108764788194886, + "ewc_loss": 0.008304724469780922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304724178742617e-05, + "grad_norm": 4.053165435791016, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8875954747200012, + "num_tokens": 632385516.0, + "step": 16577 + }, + { + "epoch": 2.1088919984734766, + "ewc_loss": 0.008305368945002556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.305368828587234e-05, + "grad_norm": 4.100947380065918, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8877193331718445, + "num_tokens": 632420664.0, + "step": 16578 + }, + { + "epoch": 2.109019208752067, + "ewc_loss": 0.008338501676917076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.338502084370703e-05, + "grad_norm": 4.058623790740967, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8880300521850586, + "num_tokens": 632458082.0, + "step": 16579 + }, + { + "epoch": 2.1091464190306577, + "ewc_loss": 0.00828982051461935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.289820107165724e-05, + "grad_norm": 4.08689546585083, + "learning_rate": 
1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8725088238716125, + "num_tokens": 632495963.0, + "step": 16580 + }, + { + "epoch": 2.109273629309248, + "ewc_loss": 0.008318057283759117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318057371070608e-05, + "grad_norm": 4.164333820343018, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8844329118728638, + "num_tokens": 632529968.0, + "step": 16581 + }, + { + "epoch": 2.1094008395878387, + "ewc_loss": 0.008358052931725979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358053310075775e-05, + "grad_norm": 4.088931083679199, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8894933462142944, + "num_tokens": 632567527.0, + "step": 16582 + }, + { + "epoch": 2.1095280498664293, + "ewc_loss": 0.00827952940016985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.279529720311984e-05, + "grad_norm": 4.03717041015625, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8839826583862305, + "num_tokens": 632606192.0, + "step": 16583 + }, + { + "epoch": 2.10965526014502, + "ewc_loss": 0.008299301378428936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299301407532766e-05, + "grad_norm": 4.070322513580322, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8926788568496704, + "num_tokens": 632645940.0, + "step": 16584 + }, + { + "epoch": 2.1097824704236103, + "ewc_loss": 0.008324883878231049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324883674504235e-05, + "grad_norm": 4.057218074798584, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8897089958190918, + "num_tokens": 632689385.0, + "step": 16585 + }, + { + "epoch": 2.109909680702201, + "ewc_loss": 0.008271681144833565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.271681144833565e-05, + "grad_norm": 4.162064075469971, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8912781476974487, + "num_tokens": 632720178.0, + "step": 16586 + }, + { 
+ "epoch": 2.1100368909807914, + "ewc_loss": 0.008360770530998707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360770152648911e-05, + "grad_norm": 4.075234889984131, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8754252195358276, + "num_tokens": 632763249.0, + "step": 16587 + }, + { + "epoch": 2.110164101259382, + "ewc_loss": 0.008262043818831444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.262043411377817e-05, + "grad_norm": 4.1114583015441895, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8666678667068481, + "num_tokens": 632803293.0, + "step": 16588 + }, + { + "epoch": 2.1102913115379724, + "ewc_loss": 0.008310722187161446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31072247819975e-05, + "grad_norm": 4.132603168487549, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8742837905883789, + "num_tokens": 632838731.0, + "step": 16589 + }, + { + "epoch": 2.110418521816563, + "ewc_loss": 0.008318142034113407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318141772178933e-05, + "grad_norm": 4.128713607788086, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8753422498703003, + "num_tokens": 632875044.0, + "step": 16590 + }, + { + "epoch": 2.1105457320951535, + "ewc_loss": 0.008302232250571251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30223216325976e-05, + "grad_norm": 4.135499477386475, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8649721741676331, + "num_tokens": 632911330.0, + "step": 16591 + }, + { + "epoch": 2.1106729423737436, + "ewc_loss": 0.008328787982463837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328787953360006e-05, + "grad_norm": 4.132743835449219, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8744722604751587, + "num_tokens": 632952122.0, + "step": 16592 + }, + { + "epoch": 2.110800152652334, + "ewc_loss": 0.008318621665239334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.318621985381469e-05, + "grad_norm": 4.175752639770508, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8676099181175232, + "num_tokens": 632985602.0, + "step": 16593 + }, + { + "epoch": 2.1109273629309246, + "ewc_loss": 0.008357337675988674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357337355846539e-05, + "grad_norm": 4.0878520011901855, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8828646540641785, + "num_tokens": 633023792.0, + "step": 16594 + }, + { + "epoch": 2.111054573209515, + "ewc_loss": 0.008310051634907722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.310051634907722e-05, + "grad_norm": 4.114071846008301, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.883920431137085, + "num_tokens": 633065457.0, + "step": 16595 + }, + { + "epoch": 2.1111817834881057, + "ewc_loss": 0.008354379795491695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354379679076374e-05, + "grad_norm": 4.076021671295166, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8922675848007202, + "num_tokens": 633101402.0, + "step": 16596 + }, + { + "epoch": 2.1113089937666962, + "ewc_loss": 0.00833908747881651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.339087798958644e-05, + "grad_norm": 4.073284149169922, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8738390207290649, + "num_tokens": 633147002.0, + "step": 16597 + }, + { + "epoch": 2.1114362040452868, + "ewc_loss": 0.008347555063664913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34755483083427e-05, + "grad_norm": 4.106811046600342, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8824959993362427, + "num_tokens": 633184209.0, + "step": 16598 + }, + { + "epoch": 2.1115634143238773, + "ewc_loss": 0.008355604484677315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355604950338602e-05, + "grad_norm": 4.097771644592285, + "learning_rate": 1e-06, + "loss": 0.3314, + 
"mean_token_accuracy": 0.8883130550384521, + "num_tokens": 633224525.0, + "step": 16599 + }, + { + "epoch": 2.111690624602468, + "ewc_loss": 0.008333766832947731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333766891155392e-05, + "grad_norm": 4.114811897277832, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8967353701591492, + "num_tokens": 633253171.0, + "step": 16600 + }, + { + "epoch": 2.1118178348810583, + "ewc_loss": 0.008363285101950169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363284723600373e-05, + "grad_norm": 4.068135738372803, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8804891109466553, + "num_tokens": 633294326.0, + "step": 16601 + }, + { + "epoch": 2.111945045159649, + "ewc_loss": 0.00830926839262247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.309268741868436e-05, + "grad_norm": 4.118207931518555, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8942884802818298, + "num_tokens": 633327236.0, + "step": 16602 + }, + { + "epoch": 2.1120722554382394, + "ewc_loss": 0.008364017121493816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.364017412532121e-05, + "grad_norm": 4.048172473907471, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8915369510650635, + "num_tokens": 633368593.0, + "step": 16603 + }, + { + "epoch": 2.11219946571683, + "ewc_loss": 0.008299851790070534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299851469928399e-05, + "grad_norm": 4.176192760467529, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.863005518913269, + "num_tokens": 633402598.0, + "step": 16604 + }, + { + "epoch": 2.1123266759954205, + "ewc_loss": 0.008401782251894474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401782542932779e-05, + "grad_norm": 4.112522602081299, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8780975937843323, + "num_tokens": 633437069.0, + "step": 16605 + }, + { + "epoch": 
2.112453886274011, + "ewc_loss": 0.00831800326704979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318002801388502e-05, + "grad_norm": 4.026880741119385, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8844341039657593, + "num_tokens": 633479752.0, + "step": 16606 + }, + { + "epoch": 2.1125810965526015, + "ewc_loss": 0.008302074857056141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.302075002575293e-05, + "grad_norm": 4.0821943283081055, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.877838134765625, + "num_tokens": 633516949.0, + "step": 16607 + }, + { + "epoch": 2.112708306831192, + "ewc_loss": 0.008372405543923378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.372405136469752e-05, + "grad_norm": 4.13790225982666, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8866524696350098, + "num_tokens": 633547511.0, + "step": 16608 + }, + { + "epoch": 2.1128355171097826, + "ewc_loss": 0.008383902721107006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.383902604691684e-05, + "grad_norm": 4.046666145324707, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8849352598190308, + "num_tokens": 633583940.0, + "step": 16609 + }, + { + "epoch": 2.112962727388373, + "ewc_loss": 0.008314045146107674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314045408042148e-05, + "grad_norm": 4.04854154586792, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8721973299980164, + "num_tokens": 633626245.0, + "step": 16610 + }, + { + "epoch": 2.1130899376669636, + "ewc_loss": 0.008360407315194607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360407082363963e-05, + "grad_norm": 4.119237422943115, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8867561221122742, + "num_tokens": 633662889.0, + "step": 16611 + }, + { + "epoch": 2.113217147945554, + "ewc_loss": 0.008416861295700073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.416861237492412e-05, + "grad_norm": 4.108730316162109, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8791925311088562, + "num_tokens": 633701461.0, + "step": 16612 + }, + { + "epoch": 2.1133443582241447, + "ewc_loss": 0.008360030129551888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360030187759548e-05, + "grad_norm": 4.095565319061279, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8849010467529297, + "num_tokens": 633741302.0, + "step": 16613 + }, + { + "epoch": 2.113471568502735, + "ewc_loss": 0.008364489302039146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.364489622181281e-05, + "grad_norm": 4.056760787963867, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8880676031112671, + "num_tokens": 633780337.0, + "step": 16614 + }, + { + "epoch": 2.1135987787813257, + "ewc_loss": 0.008346639573574066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346639515366405e-05, + "grad_norm": 4.093758583068848, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8896448612213135, + "num_tokens": 633814450.0, + "step": 16615 + }, + { + "epoch": 2.1137259890599163, + "ewc_loss": 0.008370934054255486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.370933937840164e-05, + "grad_norm": 4.084653854370117, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8743811845779419, + "num_tokens": 633856252.0, + "step": 16616 + }, + { + "epoch": 2.1138531993385064, + "ewc_loss": 0.008355198428034782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355198224307969e-05, + "grad_norm": 4.120479583740234, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8724652528762817, + "num_tokens": 633895235.0, + "step": 16617 + }, + { + "epoch": 2.113980409617097, + "ewc_loss": 0.00838684942573309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.386849367525429e-05, + "grad_norm": 4.07650089263916, + "learning_rate": 1e-06, + "loss": 0.3316, + 
"mean_token_accuracy": 0.8847205638885498, + "num_tokens": 633938948.0, + "step": 16618 + }, + { + "epoch": 2.1141076198956874, + "ewc_loss": 0.008329351432621479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.329351112479344e-05, + "grad_norm": 4.079107761383057, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8903380632400513, + "num_tokens": 633973393.0, + "step": 16619 + }, + { + "epoch": 2.114234830174278, + "ewc_loss": 0.00835051853209734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350518328370526e-05, + "grad_norm": 4.115819931030273, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8885194063186646, + "num_tokens": 634006714.0, + "step": 16620 + }, + { + "epoch": 2.1143620404528685, + "ewc_loss": 0.008347662165760994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.347661787411198e-05, + "grad_norm": 4.056636333465576, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8882190585136414, + "num_tokens": 634046104.0, + "step": 16621 + }, + { + "epoch": 2.114489250731459, + "ewc_loss": 0.008324451744556427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324452210217714e-05, + "grad_norm": 4.066581726074219, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8755244016647339, + "num_tokens": 634086401.0, + "step": 16622 + }, + { + "epoch": 2.1146164610100495, + "ewc_loss": 0.008357471786439419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357471961062402e-05, + "grad_norm": 4.130730152130127, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8773281574249268, + "num_tokens": 634125479.0, + "step": 16623 + }, + { + "epoch": 2.11474367128864, + "ewc_loss": 0.008372285403311253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.372285810764879e-05, + "grad_norm": 4.123208522796631, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8652548789978027, + "num_tokens": 634163873.0, + "step": 16624 + }, + { + "epoch": 
2.1148708815672306, + "ewc_loss": 0.008346226066350937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34622624097392e-05, + "grad_norm": 4.0823798179626465, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8809940814971924, + "num_tokens": 634200533.0, + "step": 16625 + }, + { + "epoch": 2.114998091845821, + "ewc_loss": 0.00832535419613123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.325354428961873e-05, + "grad_norm": 4.099469184875488, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8764760494232178, + "num_tokens": 634235891.0, + "step": 16626 + }, + { + "epoch": 2.1151253021244116, + "ewc_loss": 0.008362413384020329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362413063878193e-05, + "grad_norm": 4.133357524871826, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8833240866661072, + "num_tokens": 634266748.0, + "step": 16627 + }, + { + "epoch": 2.115252512403002, + "ewc_loss": 0.008377266116440296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377266203751788e-05, + "grad_norm": 4.138589382171631, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8692392706871033, + "num_tokens": 634302603.0, + "step": 16628 + }, + { + "epoch": 2.1153797226815927, + "ewc_loss": 0.008377090096473694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377090125577524e-05, + "grad_norm": 4.024029731750488, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8836038708686829, + "num_tokens": 634345471.0, + "step": 16629 + }, + { + "epoch": 2.115506932960183, + "ewc_loss": 0.008312257006764412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312256977660581e-05, + "grad_norm": 4.084078311920166, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8801692724227905, + "num_tokens": 634383902.0, + "step": 16630 + }, + { + "epoch": 2.1156341432387737, + "ewc_loss": 0.008407410234212875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.40741049614735e-05, + "grad_norm": 4.108972549438477, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8775585889816284, + "num_tokens": 634421082.0, + "step": 16631 + }, + { + "epoch": 2.1157613535173643, + "ewc_loss": 0.008369285613298416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36928520584479e-05, + "grad_norm": 4.111495494842529, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8772861361503601, + "num_tokens": 634456490.0, + "step": 16632 + }, + { + "epoch": 2.115888563795955, + "ewc_loss": 0.008381571620702744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3815713878721e-05, + "grad_norm": 4.094541549682617, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8641549348831177, + "num_tokens": 634498252.0, + "step": 16633 + }, + { + "epoch": 2.1160157740745453, + "ewc_loss": 0.008347989991307259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.347989933099598e-05, + "grad_norm": 4.0553507804870605, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8912551403045654, + "num_tokens": 634540769.0, + "step": 16634 + }, + { + "epoch": 2.116142984353136, + "ewc_loss": 0.008349291048943996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.349290874321014e-05, + "grad_norm": 4.1247453689575195, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8895139694213867, + "num_tokens": 634573807.0, + "step": 16635 + }, + { + "epoch": 2.1162701946317264, + "ewc_loss": 0.008410682901740074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410682494286448e-05, + "grad_norm": 4.143312454223633, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8780636787414551, + "num_tokens": 634609542.0, + "step": 16636 + }, + { + "epoch": 2.116397404910317, + "ewc_loss": 0.008385742083191872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38574196677655e-05, + "grad_norm": 4.056314945220947, + "learning_rate": 1e-06, + "loss": 0.3172, + 
"mean_token_accuracy": 0.8897368907928467, + "num_tokens": 634648641.0, + "step": 16637 + }, + { + "epoch": 2.1165246151889074, + "ewc_loss": 0.008330986835062504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330986747751012e-05, + "grad_norm": 4.132791996002197, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8811736106872559, + "num_tokens": 634685531.0, + "step": 16638 + }, + { + "epoch": 2.116651825467498, + "ewc_loss": 0.008398150093853474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.398149657296017e-05, + "grad_norm": 4.169295787811279, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8839194774627686, + "num_tokens": 634717239.0, + "step": 16639 + }, + { + "epoch": 2.116779035746088, + "ewc_loss": 0.008404326625168324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.404326945310459e-05, + "grad_norm": 4.14803409576416, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8808388710021973, + "num_tokens": 634750058.0, + "step": 16640 + }, + { + "epoch": 2.1169062460246786, + "ewc_loss": 0.008383926004171371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38392588775605e-05, + "grad_norm": 4.02345609664917, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8765291571617126, + "num_tokens": 634793480.0, + "step": 16641 + }, + { + "epoch": 2.117033456303269, + "ewc_loss": 0.008326644077897072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326643728651106e-05, + "grad_norm": 4.121087551116943, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8727709650993347, + "num_tokens": 634829923.0, + "step": 16642 + }, + { + "epoch": 2.1171606665818596, + "ewc_loss": 0.00843557808548212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.435577910859138e-05, + "grad_norm": 4.111818313598633, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8784286975860596, + "num_tokens": 634868499.0, + "step": 16643 + }, + { + "epoch": 
2.11728787686045, + "ewc_loss": 0.00838784221559763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387842535739765e-05, + "grad_norm": 4.126906871795654, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.891223669052124, + "num_tokens": 634905922.0, + "step": 16644 + }, + { + "epoch": 2.1174150871390407, + "ewc_loss": 0.008392829447984695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.392829477088526e-05, + "grad_norm": 4.065551280975342, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8830560445785522, + "num_tokens": 634946190.0, + "step": 16645 + }, + { + "epoch": 2.1175422974176312, + "ewc_loss": 0.008363200351595879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363200322492048e-05, + "grad_norm": 4.066235542297363, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8958275318145752, + "num_tokens": 634981840.0, + "step": 16646 + }, + { + "epoch": 2.1176695076962218, + "ewc_loss": 0.008371220901608467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371220610570163e-05, + "grad_norm": 4.1023101806640625, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8799545168876648, + "num_tokens": 635024755.0, + "step": 16647 + }, + { + "epoch": 2.1177967179748123, + "ewc_loss": 0.008389501832425594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389501454075798e-05, + "grad_norm": 4.089016437530518, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.885999858379364, + "num_tokens": 635063076.0, + "step": 16648 + }, + { + "epoch": 2.117923928253403, + "ewc_loss": 0.008344738744199276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34473903523758e-05, + "grad_norm": 4.054964542388916, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8941894769668579, + "num_tokens": 635103052.0, + "step": 16649 + }, + { + "epoch": 2.1180511385319933, + "ewc_loss": 0.008330831304192543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.33083176985383e-05, + "grad_norm": 4.107573986053467, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8916565775871277, + "num_tokens": 635140917.0, + "step": 16650 + }, + { + "epoch": 2.118178348810584, + "ewc_loss": 0.008348889648914337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348889969056472e-05, + "grad_norm": 4.072150230407715, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8867795467376709, + "num_tokens": 635182049.0, + "step": 16651 + }, + { + "epoch": 2.1183055590891744, + "ewc_loss": 0.00830069836229086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30069839139469e-05, + "grad_norm": 4.134767532348633, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8785659074783325, + "num_tokens": 635217613.0, + "step": 16652 + }, + { + "epoch": 2.118432769367765, + "ewc_loss": 0.008360284380614758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360284118680283e-05, + "grad_norm": 4.09702730178833, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8920331001281738, + "num_tokens": 635252716.0, + "step": 16653 + }, + { + "epoch": 2.1185599796463555, + "ewc_loss": 0.008296097628772259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.296097803395241e-05, + "grad_norm": 4.100310325622559, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8856621980667114, + "num_tokens": 635288919.0, + "step": 16654 + }, + { + "epoch": 2.118687189924946, + "ewc_loss": 0.008331283926963806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331283606821671e-05, + "grad_norm": 4.0716729164123535, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8776267766952515, + "num_tokens": 635329781.0, + "step": 16655 + }, + { + "epoch": 2.1188144002035365, + "ewc_loss": 0.008300019428133965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.300019544549286e-05, + "grad_norm": 4.12109375, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 
0.8924680948257446, + "num_tokens": 635363844.0, + "step": 16656 + }, + { + "epoch": 2.118941610482127, + "ewc_loss": 0.008325120434165001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.325120143126696e-05, + "grad_norm": 4.138881206512451, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8804633021354675, + "num_tokens": 635397568.0, + "step": 16657 + }, + { + "epoch": 2.1190688207607176, + "ewc_loss": 0.008301611989736557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.301612251671031e-05, + "grad_norm": 4.137691020965576, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.868276834487915, + "num_tokens": 635432903.0, + "step": 16658 + }, + { + "epoch": 2.119196031039308, + "ewc_loss": 0.008320288732647896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.320288907270879e-05, + "grad_norm": 4.106459617614746, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.883481502532959, + "num_tokens": 635473388.0, + "step": 16659 + }, + { + "epoch": 2.1193232413178986, + "ewc_loss": 0.008296815678477287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.296815940411761e-05, + "grad_norm": 4.051199436187744, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8862218856811523, + "num_tokens": 635512966.0, + "step": 16660 + }, + { + "epoch": 2.119450451596489, + "ewc_loss": 0.008271986618638039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.27198673505336e-05, + "grad_norm": 4.1456708908081055, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.870890736579895, + "num_tokens": 635547814.0, + "step": 16661 + }, + { + "epoch": 2.1195776618750797, + "ewc_loss": 0.008355312049388885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35531172924675e-05, + "grad_norm": 4.124327182769775, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.875288724899292, + "num_tokens": 635583120.0, + "step": 16662 + }, + { + "epoch": 2.11970487215367, + "ewc_loss": 
0.008316747844219208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316747698700055e-05, + "grad_norm": 4.113912582397461, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8783950805664062, + "num_tokens": 635621531.0, + "step": 16663 + }, + { + "epoch": 2.1198320824322607, + "ewc_loss": 0.008304694667458534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304694347316399e-05, + "grad_norm": 4.134889602661133, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8741468787193298, + "num_tokens": 635659641.0, + "step": 16664 + }, + { + "epoch": 2.119959292710851, + "ewc_loss": 0.00832781009376049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32781006465666e-05, + "grad_norm": 4.103257656097412, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8767393827438354, + "num_tokens": 635697953.0, + "step": 16665 + }, + { + "epoch": 2.1200865029894413, + "ewc_loss": 0.008306683041155338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.306682866532356e-05, + "grad_norm": 4.096649646759033, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8573176860809326, + "num_tokens": 635742519.0, + "step": 16666 + }, + { + "epoch": 2.120213713268032, + "ewc_loss": 0.008326233364641666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326233364641666e-05, + "grad_norm": 4.1006340980529785, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8892446756362915, + "num_tokens": 635777786.0, + "step": 16667 + }, + { + "epoch": 2.1203409235466224, + "ewc_loss": 0.008327816613018513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.327816613018513e-05, + "grad_norm": 4.090490341186523, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8848484754562378, + "num_tokens": 635814490.0, + "step": 16668 + }, + { + "epoch": 2.120468133825213, + "ewc_loss": 0.008311809971928596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.311809506267309e-05, + "grad_norm": 
4.044076919555664, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8690178990364075, + "num_tokens": 635858351.0, + "step": 16669 + }, + { + "epoch": 2.1205953441038035, + "ewc_loss": 0.008298865519464016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298865577671677e-05, + "grad_norm": 4.098931789398193, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.880953848361969, + "num_tokens": 635895585.0, + "step": 16670 + }, + { + "epoch": 2.120722554382394, + "ewc_loss": 0.008344575762748718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344575326191261e-05, + "grad_norm": 4.106423854827881, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8894188404083252, + "num_tokens": 635933591.0, + "step": 16671 + }, + { + "epoch": 2.1208497646609845, + "ewc_loss": 0.008326360955834389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326360693899915e-05, + "grad_norm": 4.102331161499023, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8788124322891235, + "num_tokens": 635972040.0, + "step": 16672 + }, + { + "epoch": 2.120976974939575, + "ewc_loss": 0.008325094357132912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.325094677275047e-05, + "grad_norm": 4.068871021270752, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8860765099525452, + "num_tokens": 636013287.0, + "step": 16673 + }, + { + "epoch": 2.1211041852181656, + "ewc_loss": 0.008297840133309364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.297840395243838e-05, + "grad_norm": 4.1487884521484375, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8787000179290771, + "num_tokens": 636046137.0, + "step": 16674 + }, + { + "epoch": 2.121231395496756, + "ewc_loss": 0.00837464164942503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.374641765840352e-05, + "grad_norm": 4.069782257080078, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8726509809494019, + 
"num_tokens": 636088870.0, + "step": 16675 + }, + { + "epoch": 2.1213586057753466, + "ewc_loss": 0.00828726775944233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.287267701234668e-05, + "grad_norm": 4.060975551605225, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8923966884613037, + "num_tokens": 636130519.0, + "step": 16676 + }, + { + "epoch": 2.121485816053937, + "ewc_loss": 0.008315441198647022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315440936712548e-05, + "grad_norm": 4.1234588623046875, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8849596977233887, + "num_tokens": 636165641.0, + "step": 16677 + }, + { + "epoch": 2.1216130263325277, + "ewc_loss": 0.008350152522325516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350152347702533e-05, + "grad_norm": 4.109402179718018, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8702449202537537, + "num_tokens": 636203811.0, + "step": 16678 + }, + { + "epoch": 2.121740236611118, + "ewc_loss": 0.008316476829349995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316477033076808e-05, + "grad_norm": 4.072081565856934, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8926992416381836, + "num_tokens": 636239503.0, + "step": 16679 + }, + { + "epoch": 2.1218674468897087, + "ewc_loss": 0.00831182673573494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.311826968565583e-05, + "grad_norm": 4.072201251983643, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8844749927520752, + "num_tokens": 636278994.0, + "step": 16680 + }, + { + "epoch": 2.1219946571682993, + "ewc_loss": 0.008318055421113968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318055188283324e-05, + "grad_norm": 4.107491493225098, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8782137632369995, + "num_tokens": 636318025.0, + "step": 16681 + }, + { + "epoch": 2.12212186744689, + "ewc_loss": 0.008333086036145687, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333085861522704e-05, + "grad_norm": 4.072937965393066, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8840723037719727, + "num_tokens": 636357289.0, + "step": 16682 + }, + { + "epoch": 2.1222490777254803, + "ewc_loss": 0.00829089805483818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.290898404084146e-05, + "grad_norm": 4.0645647048950195, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8687880635261536, + "num_tokens": 636399067.0, + "step": 16683 + }, + { + "epoch": 2.122376288004071, + "ewc_loss": 0.00832323171198368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323232032125816e-05, + "grad_norm": 4.061219215393066, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.887769877910614, + "num_tokens": 636442418.0, + "step": 16684 + }, + { + "epoch": 2.1225034982826614, + "ewc_loss": 0.008304937742650509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304938091896474e-05, + "grad_norm": 4.108883380889893, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8705363273620605, + "num_tokens": 636484273.0, + "step": 16685 + }, + { + "epoch": 2.122630708561252, + "ewc_loss": 0.008312559686601162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312559657497332e-05, + "grad_norm": 4.07235050201416, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8912400007247925, + "num_tokens": 636525675.0, + "step": 16686 + }, + { + "epoch": 2.1227579188398424, + "ewc_loss": 0.008282138034701347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.282138151116669e-05, + "grad_norm": 4.1125922203063965, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8791667222976685, + "num_tokens": 636559652.0, + "step": 16687 + }, + { + "epoch": 2.122885129118433, + "ewc_loss": 0.008326913230121136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326912939082831e-05, + "grad_norm": 4.09213924407959, + "learning_rate": 
1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8992727994918823, + "num_tokens": 636597900.0, + "step": 16688 + }, + { + "epoch": 2.1230123393970235, + "ewc_loss": 0.008283977396786213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.283977513201535e-05, + "grad_norm": 4.129804611206055, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8849515914916992, + "num_tokens": 636635822.0, + "step": 16689 + }, + { + "epoch": 2.1231395496756136, + "ewc_loss": 0.008298731409013271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298730972455814e-05, + "grad_norm": 4.195484161376953, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8726128339767456, + "num_tokens": 636667275.0, + "step": 16690 + }, + { + "epoch": 2.123266759954204, + "ewc_loss": 0.008300931192934513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.300931222038344e-05, + "grad_norm": 4.118021011352539, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8849629163742065, + "num_tokens": 636704723.0, + "step": 16691 + }, + { + "epoch": 2.1233939702327946, + "ewc_loss": 0.008261103183031082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.261103357654065e-05, + "grad_norm": 4.06313419342041, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8862104415893555, + "num_tokens": 636748209.0, + "step": 16692 + }, + { + "epoch": 2.123521180511385, + "ewc_loss": 0.008248843252658844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.248843369074166e-05, + "grad_norm": 4.096823215484619, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8864783048629761, + "num_tokens": 636785052.0, + "step": 16693 + }, + { + "epoch": 2.1236483907899757, + "ewc_loss": 0.008283437229692936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.283436909550801e-05, + "grad_norm": 4.128073215484619, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.8934448957443237, + "num_tokens": 636816073.0, + "step": 16694 + }, 
+ { + "epoch": 2.123775601068566, + "ewc_loss": 0.008304964751005173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304965012939647e-05, + "grad_norm": 4.139651775360107, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8893054723739624, + "num_tokens": 636848963.0, + "step": 16695 + }, + { + "epoch": 2.1239028113471567, + "ewc_loss": 0.008292952552437782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29295240691863e-05, + "grad_norm": 4.099120140075684, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8955219984054565, + "num_tokens": 636880632.0, + "step": 16696 + }, + { + "epoch": 2.1240300216257473, + "ewc_loss": 0.00827454961836338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.274550054920837e-05, + "grad_norm": 4.056978225708008, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8706767559051514, + "num_tokens": 636923597.0, + "step": 16697 + }, + { + "epoch": 2.124157231904338, + "ewc_loss": 0.008268868550658226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.268868987215683e-05, + "grad_norm": 4.090453147888184, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.876950204372406, + "num_tokens": 636962250.0, + "step": 16698 + }, + { + "epoch": 2.1242844421829283, + "ewc_loss": 0.008303913287818432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303913637064397e-05, + "grad_norm": 4.103182315826416, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8890052437782288, + "num_tokens": 636997973.0, + "step": 16699 + }, + { + "epoch": 2.124411652461519, + "ewc_loss": 0.008304831571877003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304831862915307e-05, + "grad_norm": 4.075998306274414, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8713868856430054, + "num_tokens": 637037320.0, + "step": 16700 + }, + { + "epoch": 2.1245388627401094, + "ewc_loss": 0.008269431069493294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.269431418739259e-05, + "grad_norm": 4.173771381378174, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8640855550765991, + "num_tokens": 637072267.0, + "step": 16701 + }, + { + "epoch": 2.1246660730187, + "ewc_loss": 0.00834259670227766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.342596265720204e-05, + "grad_norm": 4.065091133117676, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.8964710235595703, + "num_tokens": 637108782.0, + "step": 16702 + }, + { + "epoch": 2.1247932832972904, + "ewc_loss": 0.008255751803517342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.255751890828833e-05, + "grad_norm": 4.1293134689331055, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8833408951759338, + "num_tokens": 637147368.0, + "step": 16703 + }, + { + "epoch": 2.124920493575881, + "ewc_loss": 0.008313482627272606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.313482976518571e-05, + "grad_norm": 4.072704792022705, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8906466960906982, + "num_tokens": 637181909.0, + "step": 16704 + }, + { + "epoch": 2.1250477038544715, + "ewc_loss": 0.008268429897725582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.268430246971548e-05, + "grad_norm": 4.1273369789123535, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8835986852645874, + "num_tokens": 637218498.0, + "step": 16705 + }, + { + "epoch": 2.125174914133062, + "ewc_loss": 0.008321281522512436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321281347889453e-05, + "grad_norm": 4.133774280548096, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8843482732772827, + "num_tokens": 637254825.0, + "step": 16706 + }, + { + "epoch": 2.1253021244116526, + "ewc_loss": 0.008314945735037327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314945443999022e-05, + "grad_norm": 4.073172092437744, + "learning_rate": 1e-06, + "loss": 0.3665, + 
"mean_token_accuracy": 0.8721863627433777, + "num_tokens": 637301593.0, + "step": 16707 + }, + { + "epoch": 2.125429334690243, + "ewc_loss": 0.008280737325549126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.280737529275939e-05, + "grad_norm": 4.064458847045898, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8925220370292664, + "num_tokens": 637341676.0, + "step": 16708 + }, + { + "epoch": 2.1255565449688336, + "ewc_loss": 0.008291560225188732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.291559788631275e-05, + "grad_norm": 4.105381488800049, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.886753499507904, + "num_tokens": 637378840.0, + "step": 16709 + }, + { + "epoch": 2.125683755247424, + "ewc_loss": 0.008328351192176342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328351395903155e-05, + "grad_norm": 4.197523593902588, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8662711977958679, + "num_tokens": 637412711.0, + "step": 16710 + }, + { + "epoch": 2.1258109655260147, + "ewc_loss": 0.008354853838682175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354854071512818e-05, + "grad_norm": 4.088011741638184, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8818222284317017, + "num_tokens": 637448287.0, + "step": 16711 + }, + { + "epoch": 2.125938175804605, + "ewc_loss": 0.008270524442195892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.27052426757291e-05, + "grad_norm": 4.06535005569458, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.880407452583313, + "num_tokens": 637487899.0, + "step": 16712 + }, + { + "epoch": 2.1260653860831957, + "ewc_loss": 0.008302271366119385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.302271453430876e-05, + "grad_norm": 4.067233085632324, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8867257833480835, + "num_tokens": 637528161.0, + "step": 16713 + }, + { + "epoch": 
2.1261925963617863, + "ewc_loss": 0.008317218162119389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317218453157693e-05, + "grad_norm": 4.109079837799072, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8720325231552124, + "num_tokens": 637565692.0, + "step": 16714 + }, + { + "epoch": 2.1263198066403763, + "ewc_loss": 0.008324002847075462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324003283632919e-05, + "grad_norm": 4.124735355377197, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8705089092254639, + "num_tokens": 637600368.0, + "step": 16715 + }, + { + "epoch": 2.126447016918967, + "ewc_loss": 0.008336898870766163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.336898463312536e-05, + "grad_norm": 4.091408729553223, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8810126185417175, + "num_tokens": 637635732.0, + "step": 16716 + }, + { + "epoch": 2.1265742271975574, + "ewc_loss": 0.008331704884767532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331704884767532e-05, + "grad_norm": 4.093288898468018, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8777933716773987, + "num_tokens": 637677127.0, + "step": 16717 + }, + { + "epoch": 2.126701437476148, + "ewc_loss": 0.00834095012396574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.340950444107875e-05, + "grad_norm": 4.108577728271484, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.882373571395874, + "num_tokens": 637715278.0, + "step": 16718 + }, + { + "epoch": 2.1268286477547385, + "ewc_loss": 0.008355225436389446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355225145351142e-05, + "grad_norm": 4.192122936248779, + "learning_rate": 1e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8874918222427368, + "num_tokens": 637742937.0, + "step": 16719 + }, + { + "epoch": 2.126955858033329, + "ewc_loss": 0.008423876017332077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.423875988228247e-05, + "grad_norm": 4.13850736618042, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8837131261825562, + "num_tokens": 637776761.0, + "step": 16720 + }, + { + "epoch": 2.1270830683119195, + "ewc_loss": 0.00835674349218607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356743637705222e-05, + "grad_norm": 4.069477081298828, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8835675120353699, + "num_tokens": 637812197.0, + "step": 16721 + }, + { + "epoch": 2.12721027859051, + "ewc_loss": 0.008373208343982697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373208402190357e-05, + "grad_norm": 4.093574523925781, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8854820728302002, + "num_tokens": 637850442.0, + "step": 16722 + }, + { + "epoch": 2.1273374888691006, + "ewc_loss": 0.008409205824136734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409205474890769e-05, + "grad_norm": 4.0908894538879395, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8845987319946289, + "num_tokens": 637889352.0, + "step": 16723 + }, + { + "epoch": 2.127464699147691, + "ewc_loss": 0.008387467823922634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387467823922634e-05, + "grad_norm": 4.114404201507568, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8656789064407349, + "num_tokens": 637927195.0, + "step": 16724 + }, + { + "epoch": 2.1275919094262816, + "ewc_loss": 0.008407819084823132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407819404965267e-05, + "grad_norm": 4.117408752441406, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.873299241065979, + "num_tokens": 637969431.0, + "step": 16725 + }, + { + "epoch": 2.127719119704872, + "ewc_loss": 0.008400315418839455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400314982281998e-05, + "grad_norm": 4.0261430740356445, + "learning_rate": 1e-06, + "loss": 0.2921, + 
"mean_token_accuracy": 0.898453414440155, + "num_tokens": 638013910.0, + "step": 16726 + }, + { + "epoch": 2.1278463299834627, + "ewc_loss": 0.00836258102208376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362581138499081e-05, + "grad_norm": 4.10801887512207, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.874001681804657, + "num_tokens": 638051848.0, + "step": 16727 + }, + { + "epoch": 2.127973540262053, + "ewc_loss": 0.008433466777205467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433466427959502e-05, + "grad_norm": 4.062648296356201, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8821871876716614, + "num_tokens": 638092192.0, + "step": 16728 + }, + { + "epoch": 2.1281007505406437, + "ewc_loss": 0.008372952230274677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.372952288482338e-05, + "grad_norm": 4.0767107009887695, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8911349177360535, + "num_tokens": 638134306.0, + "step": 16729 + }, + { + "epoch": 2.1282279608192343, + "ewc_loss": 0.008400327526032925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400327351409942e-05, + "grad_norm": 4.0765767097473145, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8713663816452026, + "num_tokens": 638177988.0, + "step": 16730 + }, + { + "epoch": 2.128355171097825, + "ewc_loss": 0.008381127379834652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.381127554457635e-05, + "grad_norm": 4.108140468597412, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8931388854980469, + "num_tokens": 638212237.0, + "step": 16731 + }, + { + "epoch": 2.1284823813764153, + "ewc_loss": 0.008391297422349453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391297160414979e-05, + "grad_norm": 4.120241165161133, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8765354156494141, + "num_tokens": 638252492.0, + "step": 16732 + }, + { + "epoch": 
2.128609591655006, + "ewc_loss": 0.008384313434362411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.384312968701124e-05, + "grad_norm": 4.076655387878418, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8828102946281433, + "num_tokens": 638294308.0, + "step": 16733 + }, + { + "epoch": 2.1287368019335964, + "ewc_loss": 0.008339935913681984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.339936175616458e-05, + "grad_norm": 4.127964973449707, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8850241899490356, + "num_tokens": 638330178.0, + "step": 16734 + }, + { + "epoch": 2.128864012212187, + "ewc_loss": 0.008373935706913471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373935997951776e-05, + "grad_norm": 4.103695869445801, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8630303144454956, + "num_tokens": 638372731.0, + "step": 16735 + }, + { + "epoch": 2.1289912224907774, + "ewc_loss": 0.008334428071975708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.334428275702521e-05, + "grad_norm": 4.090794563293457, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.879338264465332, + "num_tokens": 638407522.0, + "step": 16736 + }, + { + "epoch": 2.129118432769368, + "ewc_loss": 0.008339528925716877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.339528721990064e-05, + "grad_norm": 4.11714506149292, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8854003548622131, + "num_tokens": 638443262.0, + "step": 16737 + }, + { + "epoch": 2.129245643047958, + "ewc_loss": 0.008363437838852406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363438246306032e-05, + "grad_norm": 4.071745872497559, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.881947934627533, + "num_tokens": 638482729.0, + "step": 16738 + }, + { + "epoch": 2.129372853326549, + "ewc_loss": 0.008312379010021687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3123792137485e-05, 
+ "grad_norm": 4.106593132019043, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8885983228683472, + "num_tokens": 638520034.0, + "step": 16739 + }, + { + "epoch": 2.129500063605139, + "ewc_loss": 0.0083623006939888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362301014130935e-05, + "grad_norm": 4.11306095123291, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8877153396606445, + "num_tokens": 638558631.0, + "step": 16740 + }, + { + "epoch": 2.1296272738837296, + "ewc_loss": 0.008348912931978703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348913252120838e-05, + "grad_norm": 4.0965046882629395, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.8936574459075928, + "num_tokens": 638593565.0, + "step": 16741 + }, + { + "epoch": 2.12975448416232, + "ewc_loss": 0.008323512971401215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323512884089723e-05, + "grad_norm": 4.126842975616455, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8856642246246338, + "num_tokens": 638625952.0, + "step": 16742 + }, + { + "epoch": 2.1298816944409107, + "ewc_loss": 0.00836103968322277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.361040090676397e-05, + "grad_norm": 4.12843656539917, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8792593479156494, + "num_tokens": 638662000.0, + "step": 16743 + }, + { + "epoch": 2.130008904719501, + "ewc_loss": 0.008356495760381222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356495527550578e-05, + "grad_norm": 4.075729846954346, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8693244457244873, + "num_tokens": 638706020.0, + "step": 16744 + }, + { + "epoch": 2.1301361149980917, + "ewc_loss": 0.008315741084516048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315740706166252e-05, + "grad_norm": 4.093358039855957, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8887526988983154, + 
"num_tokens": 638745354.0, + "step": 16745 + }, + { + "epoch": 2.1302633252766823, + "ewc_loss": 0.008339487947523594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.339487976627424e-05, + "grad_norm": 4.095545768737793, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8778395056724548, + "num_tokens": 638785497.0, + "step": 16746 + }, + { + "epoch": 2.130390535555273, + "ewc_loss": 0.008323844522237778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32384466775693e-05, + "grad_norm": 4.171264171600342, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.863330602645874, + "num_tokens": 638821650.0, + "step": 16747 + }, + { + "epoch": 2.1305177458338633, + "ewc_loss": 0.008381111547350883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.381111547350883e-05, + "grad_norm": 4.126628875732422, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8692210912704468, + "num_tokens": 638855589.0, + "step": 16748 + }, + { + "epoch": 2.130644956112454, + "ewc_loss": 0.008325006812810898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.325006638187915e-05, + "grad_norm": 4.11737060546875, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8823760747909546, + "num_tokens": 638895766.0, + "step": 16749 + }, + { + "epoch": 2.1307721663910444, + "ewc_loss": 0.008341377601027489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.341377542819828e-05, + "grad_norm": 4.129950046539307, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8742995262145996, + "num_tokens": 638932622.0, + "step": 16750 + }, + { + "epoch": 2.130899376669635, + "ewc_loss": 0.008358328603208065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358328341273591e-05, + "grad_norm": 4.137693881988525, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8735758066177368, + "num_tokens": 638971588.0, + "step": 16751 + }, + { + "epoch": 2.1310265869482254, + "ewc_loss": 0.008355582132935524, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355581667274237e-05, + "grad_norm": 4.093631744384766, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8883289694786072, + "num_tokens": 639009921.0, + "step": 16752 + }, + { + "epoch": 2.131153797226816, + "ewc_loss": 0.008323264308273792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323264046339318e-05, + "grad_norm": 4.104824542999268, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8820343017578125, + "num_tokens": 639048160.0, + "step": 16753 + }, + { + "epoch": 2.1312810075054065, + "ewc_loss": 0.008357536047697067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357535989489406e-05, + "grad_norm": 4.100546836853027, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.8948265910148621, + "num_tokens": 639080260.0, + "step": 16754 + }, + { + "epoch": 2.131408217783997, + "ewc_loss": 0.00835478026419878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354779856745154e-05, + "grad_norm": 4.151986122131348, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.87370765209198, + "num_tokens": 639118758.0, + "step": 16755 + }, + { + "epoch": 2.1315354280625876, + "ewc_loss": 0.008376252837479115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.376252662856132e-05, + "grad_norm": 4.121479034423828, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8821405172348022, + "num_tokens": 639159528.0, + "step": 16756 + }, + { + "epoch": 2.131662638341178, + "ewc_loss": 0.008320736698806286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.320737106259912e-05, + "grad_norm": 4.0644707679748535, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8860596418380737, + "num_tokens": 639200394.0, + "step": 16757 + }, + { + "epoch": 2.1317898486197686, + "ewc_loss": 0.008303025737404823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303025970235467e-05, + "grad_norm": 4.095730781555176, + "learning_rate": 
1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8740090727806091, + "num_tokens": 639240938.0, + "step": 16758 + }, + { + "epoch": 2.131917058898359, + "ewc_loss": 0.00836640503257513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.366404654225335e-05, + "grad_norm": 4.14106559753418, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8775479793548584, + "num_tokens": 639278097.0, + "step": 16759 + }, + { + "epoch": 2.1320442691769497, + "ewc_loss": 0.008352784439921379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352784789167345e-05, + "grad_norm": 4.027002811431885, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8807839155197144, + "num_tokens": 639324512.0, + "step": 16760 + }, + { + "epoch": 2.13217147945554, + "ewc_loss": 0.008259943686425686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.259943570010364e-05, + "grad_norm": 4.088777542114258, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8861435651779175, + "num_tokens": 639366833.0, + "step": 16761 + }, + { + "epoch": 2.1322986897341307, + "ewc_loss": 0.008333179168403149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333178993780166e-05, + "grad_norm": 4.1252899169921875, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8674913048744202, + "num_tokens": 639405765.0, + "step": 16762 + }, + { + "epoch": 2.132425900012721, + "ewc_loss": 0.008312876336276531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312876161653548e-05, + "grad_norm": 4.163819313049316, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8809286952018738, + "num_tokens": 639439622.0, + "step": 16763 + }, + { + "epoch": 2.1325531102913113, + "ewc_loss": 0.008322576060891151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.322575740749016e-05, + "grad_norm": 4.082380771636963, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8907957673072815, + "num_tokens": 639479943.0, + "step": 16764 + }, 
+ { + "epoch": 2.132680320569902, + "ewc_loss": 0.008259814232587814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.259814057964832e-05, + "grad_norm": 4.08338737487793, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8778728246688843, + "num_tokens": 639520487.0, + "step": 16765 + }, + { + "epoch": 2.1328075308484924, + "ewc_loss": 0.008286251686513424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286251977551728e-05, + "grad_norm": 4.133206844329834, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8813084363937378, + "num_tokens": 639558997.0, + "step": 16766 + }, + { + "epoch": 2.132934741127083, + "ewc_loss": 0.008291932754218578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.291933045256883e-05, + "grad_norm": 4.089107036590576, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8812994956970215, + "num_tokens": 639594569.0, + "step": 16767 + }, + { + "epoch": 2.1330619514056735, + "ewc_loss": 0.008243191055953503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.243191405199468e-05, + "grad_norm": 4.088054180145264, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8951922655105591, + "num_tokens": 639629827.0, + "step": 16768 + }, + { + "epoch": 2.133189161684264, + "ewc_loss": 0.008274082094430923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.274082210846245e-05, + "grad_norm": 4.121776103973389, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8763874769210815, + "num_tokens": 639667634.0, + "step": 16769 + }, + { + "epoch": 2.1333163719628545, + "ewc_loss": 0.008281276561319828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28127667773515e-05, + "grad_norm": 4.1087212562561035, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8819293975830078, + "num_tokens": 639700536.0, + "step": 16770 + }, + { + "epoch": 2.133443582241445, + "ewc_loss": 0.00828932598233223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.289326069643721e-05, + "grad_norm": 4.057417392730713, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.884249210357666, + "num_tokens": 639744006.0, + "step": 16771 + }, + { + "epoch": 2.1335707925200356, + "ewc_loss": 0.008248173631727695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.2481732533779e-05, + "grad_norm": 4.03724479675293, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8884771466255188, + "num_tokens": 639784950.0, + "step": 16772 + }, + { + "epoch": 2.133698002798626, + "ewc_loss": 0.008260499686002731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.260499453172088e-05, + "grad_norm": 4.065648078918457, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8943418264389038, + "num_tokens": 639823567.0, + "step": 16773 + }, + { + "epoch": 2.1338252130772166, + "ewc_loss": 0.008284990675747395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28499105409719e-05, + "grad_norm": 4.040131568908691, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8930022716522217, + "num_tokens": 639863386.0, + "step": 16774 + }, + { + "epoch": 2.133952423355807, + "ewc_loss": 0.008235222660005093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.235222776420414e-05, + "grad_norm": 4.094718933105469, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8709967136383057, + "num_tokens": 639901219.0, + "step": 16775 + }, + { + "epoch": 2.1340796336343977, + "ewc_loss": 0.008295364677906036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.295364386867732e-05, + "grad_norm": 4.192379474639893, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.865862250328064, + "num_tokens": 639935240.0, + "step": 16776 + }, + { + "epoch": 2.134206843912988, + "ewc_loss": 0.008333148434758186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333148434758186e-05, + "grad_norm": 4.094808101654053, + "learning_rate": 1e-06, + "loss": 0.3733, + 
"mean_token_accuracy": 0.8739099502563477, + "num_tokens": 639975370.0, + "step": 16777 + }, + { + "epoch": 2.1343340541915787, + "ewc_loss": 0.008244553580880165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.244553464464843e-05, + "grad_norm": 4.0976243019104, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8649376034736633, + "num_tokens": 640020463.0, + "step": 16778 + }, + { + "epoch": 2.1344612644701693, + "ewc_loss": 0.008281754329800606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28175398055464e-05, + "grad_norm": 4.126256942749023, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8762197494506836, + "num_tokens": 640057142.0, + "step": 16779 + }, + { + "epoch": 2.13458847474876, + "ewc_loss": 0.008302317000925541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.302317291963845e-05, + "grad_norm": 4.059068202972412, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8795292377471924, + "num_tokens": 640098879.0, + "step": 16780 + }, + { + "epoch": 2.1347156850273503, + "ewc_loss": 0.00825036782771349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.250367682194337e-05, + "grad_norm": 4.106524467468262, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8847255706787109, + "num_tokens": 640136077.0, + "step": 16781 + }, + { + "epoch": 2.134842895305941, + "ewc_loss": 0.008317902684211731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317902393173426e-05, + "grad_norm": 4.1177473068237305, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8682020306587219, + "num_tokens": 640175160.0, + "step": 16782 + }, + { + "epoch": 2.1349701055845314, + "ewc_loss": 0.008302057161927223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.302057540277019e-05, + "grad_norm": 3.9986844062805176, + "learning_rate": 1e-06, + "loss": 0.2765, + "mean_token_accuracy": 0.9046581983566284, + "num_tokens": 640216670.0, + "step": 16783 + }, + { + "epoch": 
2.135097315863122, + "ewc_loss": 0.008226984180510044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.226984209613875e-05, + "grad_norm": 4.088095188140869, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8779280781745911, + "num_tokens": 640254872.0, + "step": 16784 + }, + { + "epoch": 2.1352245261417124, + "ewc_loss": 0.008322122506797314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.322122448589653e-05, + "grad_norm": 4.135788440704346, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8808498978614807, + "num_tokens": 640289319.0, + "step": 16785 + }, + { + "epoch": 2.135351736420303, + "ewc_loss": 0.008305219933390617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.305220399051905e-05, + "grad_norm": 4.108250617980957, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8961721658706665, + "num_tokens": 640323897.0, + "step": 16786 + }, + { + "epoch": 2.1354789466988935, + "ewc_loss": 0.008288303390145302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.288303797598928e-05, + "grad_norm": 4.044377326965332, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8866657018661499, + "num_tokens": 640366203.0, + "step": 16787 + }, + { + "epoch": 2.1356061569774836, + "ewc_loss": 0.008265192620456219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.265192445833236e-05, + "grad_norm": 4.044867515563965, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8899204134941101, + "num_tokens": 640408089.0, + "step": 16788 + }, + { + "epoch": 2.135733367256074, + "ewc_loss": 0.00828053429722786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.280534530058503e-05, + "grad_norm": 4.083213806152344, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8927807211875916, + "num_tokens": 640443532.0, + "step": 16789 + }, + { + "epoch": 2.1358605775346646, + "ewc_loss": 0.008294457569718361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.294457802549005e-05, + "grad_norm": 4.138843059539795, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8763892650604248, + "num_tokens": 640480425.0, + "step": 16790 + }, + { + "epoch": 2.135987787813255, + "ewc_loss": 0.008321450091898441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321450150106102e-05, + "grad_norm": 4.064110279083252, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8843905925750732, + "num_tokens": 640525658.0, + "step": 16791 + }, + { + "epoch": 2.1361149980918457, + "ewc_loss": 0.008249918930232525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.249918755609542e-05, + "grad_norm": 4.135063171386719, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8919289708137512, + "num_tokens": 640563276.0, + "step": 16792 + }, + { + "epoch": 2.136242208370436, + "ewc_loss": 0.008317472413182259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317472384078428e-05, + "grad_norm": 4.103842258453369, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8763380646705627, + "num_tokens": 640608150.0, + "step": 16793 + }, + { + "epoch": 2.1363694186490267, + "ewc_loss": 0.00824512355029583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.245123171946034e-05, + "grad_norm": 4.083956718444824, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8838719725608826, + "num_tokens": 640646127.0, + "step": 16794 + }, + { + "epoch": 2.1364966289276173, + "ewc_loss": 0.008289748802781105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.289748802781105e-05, + "grad_norm": 4.139299392700195, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8879343867301941, + "num_tokens": 640683195.0, + "step": 16795 + }, + { + "epoch": 2.136623839206208, + "ewc_loss": 0.008300378918647766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.300378976855427e-05, + "grad_norm": 4.075796127319336, + "learning_rate": 1e-06, + "loss": 0.3003, + 
"mean_token_accuracy": 0.8942979574203491, + "num_tokens": 640724102.0, + "step": 16796 + }, + { + "epoch": 2.1367510494847983, + "ewc_loss": 0.008242759853601456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.242759940912947e-05, + "grad_norm": 4.071976661682129, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8773608207702637, + "num_tokens": 640762624.0, + "step": 16797 + }, + { + "epoch": 2.136878259763389, + "ewc_loss": 0.008252061903476715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.252061525126919e-05, + "grad_norm": 4.079023838043213, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8784484267234802, + "num_tokens": 640799437.0, + "step": 16798 + }, + { + "epoch": 2.1370054700419794, + "ewc_loss": 0.008256212808191776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.256212458945811e-05, + "grad_norm": 4.070120334625244, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8880288600921631, + "num_tokens": 640837115.0, + "step": 16799 + }, + { + "epoch": 2.13713268032057, + "ewc_loss": 0.008244558237493038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.244557830039412e-05, + "grad_norm": 4.023993015289307, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8814694881439209, + "num_tokens": 640882312.0, + "step": 16800 + }, + { + "epoch": 2.1372598905991604, + "ewc_loss": 0.008227446116507053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.227446232922375e-05, + "grad_norm": 4.082294940948486, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.881639301776886, + "num_tokens": 640921032.0, + "step": 16801 + }, + { + "epoch": 2.137387100877751, + "ewc_loss": 0.008277053944766521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.277053711935878e-05, + "grad_norm": 4.143989086151123, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8836263418197632, + "num_tokens": 640957424.0, + "step": 16802 + }, + { + "epoch": 
2.1375143111563415, + "ewc_loss": 0.008293593302369118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29359341878444e-05, + "grad_norm": 4.139868259429932, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8742753267288208, + "num_tokens": 640992985.0, + "step": 16803 + }, + { + "epoch": 2.137641521434932, + "ewc_loss": 0.008280174806714058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.280174370156601e-05, + "grad_norm": 4.072234153747559, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8900181651115417, + "num_tokens": 641030379.0, + "step": 16804 + }, + { + "epoch": 2.1377687317135226, + "ewc_loss": 0.008259191177785397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.259191235993057e-05, + "grad_norm": 4.119502067565918, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.87619549036026, + "num_tokens": 641068526.0, + "step": 16805 + }, + { + "epoch": 2.137895941992113, + "ewc_loss": 0.008297687396407127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29768760013394e-05, + "grad_norm": 4.108362197875977, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8765697479248047, + "num_tokens": 641103351.0, + "step": 16806 + }, + { + "epoch": 2.1380231522707036, + "ewc_loss": 0.008289458230137825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.289458492072299e-05, + "grad_norm": 4.099116325378418, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8839086294174194, + "num_tokens": 641143570.0, + "step": 16807 + }, + { + "epoch": 2.138150362549294, + "ewc_loss": 0.008280904032289982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.280904148705304e-05, + "grad_norm": 4.086338520050049, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.887259304523468, + "num_tokens": 641180625.0, + "step": 16808 + }, + { + "epoch": 2.1382775728278847, + "ewc_loss": 0.008287638425827026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.287638775072992e-05, + "grad_norm": 4.0956926345825195, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8629916310310364, + "num_tokens": 641226770.0, + "step": 16809 + }, + { + "epoch": 2.138404783106475, + "ewc_loss": 0.0083150090649724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315008744830266e-05, + "grad_norm": 4.067159652709961, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8932902812957764, + "num_tokens": 641269682.0, + "step": 16810 + }, + { + "epoch": 2.1385319933850653, + "ewc_loss": 0.008283436298370361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28343618195504e-05, + "grad_norm": 4.094183444976807, + "learning_rate": 1e-06, + "loss": 0.2982, + "mean_token_accuracy": 0.893919825553894, + "num_tokens": 641303803.0, + "step": 16811 + }, + { + "epoch": 2.1386592036636562, + "ewc_loss": 0.008303492330014706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303492359118536e-05, + "grad_norm": 4.113542556762695, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8776648044586182, + "num_tokens": 641343141.0, + "step": 16812 + }, + { + "epoch": 2.1387864139422463, + "ewc_loss": 0.008292959071695805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.292958955280483e-05, + "grad_norm": 4.0679755210876465, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8859599828720093, + "num_tokens": 641384006.0, + "step": 16813 + }, + { + "epoch": 2.138913624220837, + "ewc_loss": 0.008276369422674179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.276369044324383e-05, + "grad_norm": 4.14827299118042, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8616621494293213, + "num_tokens": 641423998.0, + "step": 16814 + }, + { + "epoch": 2.1390408344994274, + "ewc_loss": 0.008335373364388943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.335373422596604e-05, + "grad_norm": 4.126569747924805, + "learning_rate": 1e-06, + "loss": 0.3414, + 
"mean_token_accuracy": 0.882279634475708, + "num_tokens": 641457886.0, + "step": 16815 + }, + { + "epoch": 2.139168044778018, + "ewc_loss": 0.008299563080072403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299563342006877e-05, + "grad_norm": 4.063141345977783, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8830186128616333, + "num_tokens": 641503386.0, + "step": 16816 + }, + { + "epoch": 2.1392952550566084, + "ewc_loss": 0.008272317238152027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.272317063529044e-05, + "grad_norm": 4.084979057312012, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8923014402389526, + "num_tokens": 641539533.0, + "step": 16817 + }, + { + "epoch": 2.139422465335199, + "ewc_loss": 0.008303431794047356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303431968670338e-05, + "grad_norm": 4.145668983459473, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.877773642539978, + "num_tokens": 641574020.0, + "step": 16818 + }, + { + "epoch": 2.1395496756137895, + "ewc_loss": 0.008324855007231236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324854570673779e-05, + "grad_norm": 4.202808856964111, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8837352991104126, + "num_tokens": 641605206.0, + "step": 16819 + }, + { + "epoch": 2.13967688589238, + "ewc_loss": 0.008328726515173912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328726107720286e-05, + "grad_norm": 4.028290271759033, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8713244199752808, + "num_tokens": 641649857.0, + "step": 16820 + }, + { + "epoch": 2.1398040961709706, + "ewc_loss": 0.008203922770917416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.203922334359959e-05, + "grad_norm": 4.123063564300537, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8784245252609253, + "num_tokens": 641683360.0, + "step": 16821 + }, + { + "epoch": 
2.139931306449561, + "ewc_loss": 0.008343440480530262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343440276803449e-05, + "grad_norm": 4.113390922546387, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8828164339065552, + "num_tokens": 641720105.0, + "step": 16822 + }, + { + "epoch": 2.1400585167281516, + "ewc_loss": 0.008297961205244064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.297961176140234e-05, + "grad_norm": 4.132594585418701, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8852553367614746, + "num_tokens": 641750718.0, + "step": 16823 + }, + { + "epoch": 2.140185727006742, + "ewc_loss": 0.00830952450633049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.309524855576456e-05, + "grad_norm": 4.083221435546875, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8820602297782898, + "num_tokens": 641791620.0, + "step": 16824 + }, + { + "epoch": 2.1403129372853327, + "ewc_loss": 0.00828159973025322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28159973025322e-05, + "grad_norm": 4.073948383331299, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8713865280151367, + "num_tokens": 641833566.0, + "step": 16825 + }, + { + "epoch": 2.140440147563923, + "ewc_loss": 0.008318621665239334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318621985381469e-05, + "grad_norm": 4.114606857299805, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8875585794448853, + "num_tokens": 641869080.0, + "step": 16826 + }, + { + "epoch": 2.1405673578425137, + "ewc_loss": 0.00833135936409235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331359276780859e-05, + "grad_norm": 4.084357261657715, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8791702389717102, + "num_tokens": 641910195.0, + "step": 16827 + }, + { + "epoch": 2.1406945681211043, + "ewc_loss": 0.008303678594529629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.303678623633459e-05, + "grad_norm": 4.15335750579834, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8708184957504272, + "num_tokens": 641947270.0, + "step": 16828 + }, + { + "epoch": 2.140821778399695, + "ewc_loss": 0.008362851105630398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362851076526567e-05, + "grad_norm": 4.061899185180664, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8834543228149414, + "num_tokens": 641986472.0, + "step": 16829 + }, + { + "epoch": 2.1409489886782853, + "ewc_loss": 0.008286328054964542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286327647510916e-05, + "grad_norm": 4.113525867462158, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8674952983856201, + "num_tokens": 642028130.0, + "step": 16830 + }, + { + "epoch": 2.141076198956876, + "ewc_loss": 0.008358579128980637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35857936181128e-05, + "grad_norm": 4.121018886566162, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8759955763816833, + "num_tokens": 642067097.0, + "step": 16831 + }, + { + "epoch": 2.1412034092354664, + "ewc_loss": 0.008329778909683228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.329778938787058e-05, + "grad_norm": 4.096674919128418, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8925697803497314, + "num_tokens": 642099628.0, + "step": 16832 + }, + { + "epoch": 2.141330619514057, + "ewc_loss": 0.00833518523722887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.335184975294396e-05, + "grad_norm": 4.097814083099365, + "learning_rate": 1e-06, + "loss": 0.2776, + "mean_token_accuracy": 0.9025282859802246, + "num_tokens": 642135941.0, + "step": 16833 + }, + { + "epoch": 2.1414578297926474, + "ewc_loss": 0.00831638928502798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316388993989676e-05, + "grad_norm": 4.158566951751709, + "learning_rate": 1e-06, + "loss": 0.3618, + 
"mean_token_accuracy": 0.8752763271331787, + "num_tokens": 642169465.0, + "step": 16834 + }, + { + "epoch": 2.141585040071238, + "ewc_loss": 0.008363021537661552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363021333934739e-05, + "grad_norm": 4.080844402313232, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8849896788597107, + "num_tokens": 642208961.0, + "step": 16835 + }, + { + "epoch": 2.141712250349828, + "ewc_loss": 0.00828610174357891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286101365229115e-05, + "grad_norm": 4.061448097229004, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8810657262802124, + "num_tokens": 642249691.0, + "step": 16836 + }, + { + "epoch": 2.141839460628419, + "ewc_loss": 0.008322693407535553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.322693611262366e-05, + "grad_norm": 4.069287300109863, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8844808340072632, + "num_tokens": 642292425.0, + "step": 16837 + }, + { + "epoch": 2.141966670907009, + "ewc_loss": 0.008317453786730766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317453466588631e-05, + "grad_norm": 4.083527088165283, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.891548752784729, + "num_tokens": 642330982.0, + "step": 16838 + }, + { + "epoch": 2.1420938811855996, + "ewc_loss": 0.008315839804708958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315839659189805e-05, + "grad_norm": 4.114497661590576, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8839973211288452, + "num_tokens": 642365414.0, + "step": 16839 + }, + { + "epoch": 2.14222109146419, + "ewc_loss": 0.008352096192538738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352095755981281e-05, + "grad_norm": 4.175479412078857, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.876641571521759, + "num_tokens": 642398660.0, + "step": 16840 + }, + { + "epoch": 
2.1423483017427807, + "ewc_loss": 0.008367820642888546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.367820555577055e-05, + "grad_norm": 4.132858753204346, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8897022604942322, + "num_tokens": 642432605.0, + "step": 16841 + }, + { + "epoch": 2.142475512021371, + "ewc_loss": 0.00832601822912693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326017996296287e-05, + "grad_norm": 4.0716471672058105, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8907052278518677, + "num_tokens": 642472041.0, + "step": 16842 + }, + { + "epoch": 2.1426027222999617, + "ewc_loss": 0.008314365521073341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314365550177172e-05, + "grad_norm": 4.050397872924805, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.892900824546814, + "num_tokens": 642512399.0, + "step": 16843 + }, + { + "epoch": 2.1427299325785523, + "ewc_loss": 0.0083320876583457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.332087600138038e-05, + "grad_norm": 4.110964775085449, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8770631551742554, + "num_tokens": 642550829.0, + "step": 16844 + }, + { + "epoch": 2.142857142857143, + "ewc_loss": 0.008364059962332249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.364060340682045e-05, + "grad_norm": 4.0602216720581055, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.881623387336731, + "num_tokens": 642591255.0, + "step": 16845 + }, + { + "epoch": 2.1429843531357333, + "ewc_loss": 0.008312608115375042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312607678817585e-05, + "grad_norm": 4.160759449005127, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8848243355751038, + "num_tokens": 642625772.0, + "step": 16846 + }, + { + "epoch": 2.143111563414324, + "ewc_loss": 0.008374661207199097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.37466141092591e-05, + "grad_norm": 4.04208517074585, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.876982569694519, + "num_tokens": 642667596.0, + "step": 16847 + }, + { + "epoch": 2.1432387736929144, + "ewc_loss": 0.008272157981991768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.272158447653055e-05, + "grad_norm": 4.141687393188477, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8834434747695923, + "num_tokens": 642702159.0, + "step": 16848 + }, + { + "epoch": 2.143365983971505, + "ewc_loss": 0.00839280616492033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39280619402416e-05, + "grad_norm": 4.065169334411621, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8938716053962708, + "num_tokens": 642737307.0, + "step": 16849 + }, + { + "epoch": 2.1434931942500954, + "ewc_loss": 0.008314779959619045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314779552165419e-05, + "grad_norm": 4.136987209320068, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.873592734336853, + "num_tokens": 642776660.0, + "step": 16850 + }, + { + "epoch": 2.143620404528686, + "ewc_loss": 0.008389230817556381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389230788452551e-05, + "grad_norm": 4.149318218231201, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.875881552696228, + "num_tokens": 642814131.0, + "step": 16851 + }, + { + "epoch": 2.1437476148072765, + "ewc_loss": 0.008365807123482227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36580729810521e-05, + "grad_norm": 4.09510612487793, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8641928434371948, + "num_tokens": 642854138.0, + "step": 16852 + }, + { + "epoch": 2.143874825085867, + "ewc_loss": 0.008327143266797066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.327143586939201e-05, + "grad_norm": 4.064849853515625, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 
0.8902503252029419, + "num_tokens": 642893767.0, + "step": 16853 + }, + { + "epoch": 2.1440020353644575, + "ewc_loss": 0.008343217894434929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343218360096216e-05, + "grad_norm": 4.1139631271362305, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8944737911224365, + "num_tokens": 642928286.0, + "step": 16854 + }, + { + "epoch": 2.144129245643048, + "ewc_loss": 0.008381393738090992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.381393854506314e-05, + "grad_norm": 4.056637763977051, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8891221284866333, + "num_tokens": 642971220.0, + "step": 16855 + }, + { + "epoch": 2.1442564559216386, + "ewc_loss": 0.008304205723106861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304205402964726e-05, + "grad_norm": 4.055997371673584, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8859813213348389, + "num_tokens": 643013618.0, + "step": 16856 + }, + { + "epoch": 2.144383666200229, + "ewc_loss": 0.008330698125064373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33069789223373e-05, + "grad_norm": 4.075087547302246, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8860718011856079, + "num_tokens": 643058122.0, + "step": 16857 + }, + { + "epoch": 2.1445108764788197, + "ewc_loss": 0.00833040289580822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330403215950355e-05, + "grad_norm": 4.127554893493652, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8709791898727417, + "num_tokens": 643096662.0, + "step": 16858 + }, + { + "epoch": 2.14463808675741, + "ewc_loss": 0.00833633542060852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.336335304193199e-05, + "grad_norm": 4.075680255889893, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8948361873626709, + "num_tokens": 643137923.0, + "step": 16859 + }, + { + "epoch": 2.1447652970360007, + "ewc_loss": 
0.008280509151518345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.280509064206854e-05, + "grad_norm": 4.09081506729126, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8874461054801941, + "num_tokens": 643176918.0, + "step": 16860 + }, + { + "epoch": 2.144892507314591, + "ewc_loss": 0.008304595947265625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304596121888608e-05, + "grad_norm": 4.101759910583496, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8986318111419678, + "num_tokens": 643217525.0, + "step": 16861 + }, + { + "epoch": 2.1450197175931813, + "ewc_loss": 0.008281814865767956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.2818150985986e-05, + "grad_norm": 4.070446491241455, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.8974661827087402, + "num_tokens": 643255131.0, + "step": 16862 + }, + { + "epoch": 2.145146927871772, + "ewc_loss": 0.008243289776146412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.24328963062726e-05, + "grad_norm": 4.1020894050598145, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8907147645950317, + "num_tokens": 643292092.0, + "step": 16863 + }, + { + "epoch": 2.1452741381503624, + "ewc_loss": 0.008273384533822536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.273384446511045e-05, + "grad_norm": 4.0974650382995605, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8796210289001465, + "num_tokens": 643330510.0, + "step": 16864 + }, + { + "epoch": 2.145401348428953, + "ewc_loss": 0.008244485594332218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.244485798059031e-05, + "grad_norm": 4.098278522491455, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8786097764968872, + "num_tokens": 643372900.0, + "step": 16865 + }, + { + "epoch": 2.1455285587075434, + "ewc_loss": 0.008244727738201618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.244728087447584e-05, + "grad_norm": 
4.08192253112793, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8887820243835449, + "num_tokens": 643412866.0, + "step": 16866 + }, + { + "epoch": 2.145655768986134, + "ewc_loss": 0.008221481926739216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.221482130466029e-05, + "grad_norm": 4.101055145263672, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8666722178459167, + "num_tokens": 643452846.0, + "step": 16867 + }, + { + "epoch": 2.1457829792647245, + "ewc_loss": 0.008254210464656353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.254210843006149e-05, + "grad_norm": 4.129086017608643, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8868420720100403, + "num_tokens": 643488264.0, + "step": 16868 + }, + { + "epoch": 2.145910189543315, + "ewc_loss": 0.008243131451308727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.24313101475127e-05, + "grad_norm": 4.141770839691162, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8717060089111328, + "num_tokens": 643528131.0, + "step": 16869 + }, + { + "epoch": 2.1460373998219056, + "ewc_loss": 0.008242198266088963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.242198236985132e-05, + "grad_norm": 4.133141994476318, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8827918767929077, + "num_tokens": 643564041.0, + "step": 16870 + }, + { + "epoch": 2.146164610100496, + "ewc_loss": 0.008239485323429108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.239485032390803e-05, + "grad_norm": 4.103094100952148, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8715749382972717, + "num_tokens": 643601507.0, + "step": 16871 + }, + { + "epoch": 2.1462918203790866, + "ewc_loss": 0.008243811316788197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.243811316788197e-05, + "grad_norm": 4.068085670471191, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8681372404098511, + 
"num_tokens": 643647161.0, + "step": 16872 + }, + { + "epoch": 2.146419030657677, + "ewc_loss": 0.008245356380939484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.245356002589688e-05, + "grad_norm": 4.06802225112915, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8872101306915283, + "num_tokens": 643687114.0, + "step": 16873 + }, + { + "epoch": 2.1465462409362677, + "ewc_loss": 0.008247176185250282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.247176447184756e-05, + "grad_norm": 4.148251056671143, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8694912195205688, + "num_tokens": 643722110.0, + "step": 16874 + }, + { + "epoch": 2.146673451214858, + "ewc_loss": 0.008305124007165432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.305123628815636e-05, + "grad_norm": 4.113934516906738, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8606339693069458, + "num_tokens": 643765770.0, + "step": 16875 + }, + { + "epoch": 2.1468006614934487, + "ewc_loss": 0.008240980096161366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.240980241680518e-05, + "grad_norm": 4.066235542297363, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8745708465576172, + "num_tokens": 643810097.0, + "step": 16876 + }, + { + "epoch": 2.1469278717720393, + "ewc_loss": 0.008248910307884216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.248910307884216e-05, + "grad_norm": 4.147310733795166, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8589582443237305, + "num_tokens": 643847904.0, + "step": 16877 + }, + { + "epoch": 2.14705508205063, + "ewc_loss": 0.00833410769701004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.334108133567497e-05, + "grad_norm": 4.079079627990723, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8903361558914185, + "num_tokens": 643889157.0, + "step": 16878 + }, + { + "epoch": 2.1471822923292203, + "ewc_loss": 0.00823337584733963, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.233376138377935e-05, + "grad_norm": 4.025495529174805, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.885399341583252, + "num_tokens": 643931181.0, + "step": 16879 + }, + { + "epoch": 2.147309502607811, + "ewc_loss": 0.00824460107833147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.244601485785097e-05, + "grad_norm": 4.094902038574219, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8808979988098145, + "num_tokens": 643965415.0, + "step": 16880 + }, + { + "epoch": 2.1474367128864014, + "ewc_loss": 0.008327700197696686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.327700197696686e-05, + "grad_norm": 4.091559886932373, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8693723678588867, + "num_tokens": 644010647.0, + "step": 16881 + }, + { + "epoch": 2.147563923164992, + "ewc_loss": 0.008274301886558533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.274301944766194e-05, + "grad_norm": 4.106393337249756, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8972561359405518, + "num_tokens": 644044347.0, + "step": 16882 + }, + { + "epoch": 2.1476911334435824, + "ewc_loss": 0.008297843858599663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.297844033222646e-05, + "grad_norm": 4.0753560066223145, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8883085250854492, + "num_tokens": 644083456.0, + "step": 16883 + }, + { + "epoch": 2.147818343722173, + "ewc_loss": 0.008272537961602211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.272538252640516e-05, + "grad_norm": 4.089314937591553, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8729126453399658, + "num_tokens": 644123851.0, + "step": 16884 + }, + { + "epoch": 2.1479455540007635, + "ewc_loss": 0.00830589048564434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.305890514748171e-05, + "grad_norm": 4.053586959838867, + "learning_rate": 
1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8902969360351562, + "num_tokens": 644168880.0, + "step": 16885 + }, + { + "epoch": 2.1480727642793536, + "ewc_loss": 0.008241118863224983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.241118484875187e-05, + "grad_norm": 4.097298622131348, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8859343528747559, + "num_tokens": 644204532.0, + "step": 16886 + }, + { + "epoch": 2.148199974557944, + "ewc_loss": 0.008296298794448376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.296298619825393e-05, + "grad_norm": 4.073235511779785, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8994014859199524, + "num_tokens": 644246160.0, + "step": 16887 + }, + { + "epoch": 2.1483271848365346, + "ewc_loss": 0.00825063232332468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.250632527051494e-05, + "grad_norm": 4.049556255340576, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8934195041656494, + "num_tokens": 644284881.0, + "step": 16888 + }, + { + "epoch": 2.148454395115125, + "ewc_loss": 0.008244287222623825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.244287164416164e-05, + "grad_norm": 4.107571601867676, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8764777183532715, + "num_tokens": 644326661.0, + "step": 16889 + }, + { + "epoch": 2.1485816053937157, + "ewc_loss": 0.008279687725007534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.279687608592212e-05, + "grad_norm": 4.080073356628418, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8872384428977966, + "num_tokens": 644364420.0, + "step": 16890 + }, + { + "epoch": 2.148708815672306, + "ewc_loss": 0.008229208178818226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.22920846985653e-05, + "grad_norm": 4.134765625, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8658934235572815, + "num_tokens": 644398626.0, + "step": 16891 + }, + { + 
"epoch": 2.1488360259508967, + "ewc_loss": 0.008287541568279266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.287541277240962e-05, + "grad_norm": 4.122429370880127, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8838895559310913, + "num_tokens": 644434772.0, + "step": 16892 + }, + { + "epoch": 2.1489632362294873, + "ewc_loss": 0.008271640166640282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.271640399470925e-05, + "grad_norm": 4.061745643615723, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8796486258506775, + "num_tokens": 644474425.0, + "step": 16893 + }, + { + "epoch": 2.149090446508078, + "ewc_loss": 0.008248850703239441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.24885064503178e-05, + "grad_norm": 4.178435325622559, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8770719766616821, + "num_tokens": 644507796.0, + "step": 16894 + }, + { + "epoch": 2.1492176567866683, + "ewc_loss": 0.008341684937477112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.341684588231146e-05, + "grad_norm": 4.135855674743652, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8789846897125244, + "num_tokens": 644544172.0, + "step": 16895 + }, + { + "epoch": 2.149344867065259, + "ewc_loss": 0.008284933865070343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.284933574032038e-05, + "grad_norm": 4.107920169830322, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8806396722793579, + "num_tokens": 644582217.0, + "step": 16896 + }, + { + "epoch": 2.1494720773438494, + "ewc_loss": 0.008288934826850891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.288934623124078e-05, + "grad_norm": 4.143385887145996, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8874037861824036, + "num_tokens": 644620028.0, + "step": 16897 + }, + { + "epoch": 2.14959928762244, + "ewc_loss": 0.008341236971318722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.341237116837874e-05, + "grad_norm": 4.11210298538208, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8681967854499817, + "num_tokens": 644663743.0, + "step": 16898 + }, + { + "epoch": 2.1497264979010304, + "ewc_loss": 0.008293834514915943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.293834252981469e-05, + "grad_norm": 4.096409797668457, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8840919733047485, + "num_tokens": 644700914.0, + "step": 16899 + }, + { + "epoch": 2.149853708179621, + "ewc_loss": 0.008305597119033337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30559729365632e-05, + "grad_norm": 4.083623886108398, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8794480562210083, + "num_tokens": 644744232.0, + "step": 16900 + }, + { + "epoch": 2.1499809184582115, + "ewc_loss": 0.008312586694955826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312586578540504e-05, + "grad_norm": 4.118711948394775, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8767281770706177, + "num_tokens": 644780851.0, + "step": 16901 + }, + { + "epoch": 2.150108128736802, + "ewc_loss": 0.008330894634127617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330894343089312e-05, + "grad_norm": 4.112098217010498, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8896914124488831, + "num_tokens": 644822103.0, + "step": 16902 + }, + { + "epoch": 2.1502353390153925, + "ewc_loss": 0.008324426598846912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324426744366065e-05, + "grad_norm": 4.1051225662231445, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8793043494224548, + "num_tokens": 644861209.0, + "step": 16903 + }, + { + "epoch": 2.150362549293983, + "ewc_loss": 0.008333542384207249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333542064065114e-05, + "grad_norm": 4.161767959594727, + "learning_rate": 1e-06, + "loss": 0.3855, + 
"mean_token_accuracy": 0.8708534240722656, + "num_tokens": 644899860.0, + "step": 16904 + }, + { + "epoch": 2.1504897595725736, + "ewc_loss": 0.008360461331903934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360460924450308e-05, + "grad_norm": 4.043794631958008, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.878885805606842, + "num_tokens": 644943101.0, + "step": 16905 + }, + { + "epoch": 2.150616969851164, + "ewc_loss": 0.008270283229649067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.27028343337588e-05, + "grad_norm": 4.140242099761963, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8720474243164062, + "num_tokens": 644981786.0, + "step": 16906 + }, + { + "epoch": 2.1507441801297547, + "ewc_loss": 0.008386515080928802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.386514673475176e-05, + "grad_norm": 4.110246658325195, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8832972049713135, + "num_tokens": 645017949.0, + "step": 16907 + }, + { + "epoch": 2.150871390408345, + "ewc_loss": 0.008321714587509632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321714267367497e-05, + "grad_norm": 4.122546195983887, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8811630010604858, + "num_tokens": 645053443.0, + "step": 16908 + }, + { + "epoch": 2.1509986006869353, + "ewc_loss": 0.008340095169842243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.340095519088209e-05, + "grad_norm": 4.103209018707275, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8824522495269775, + "num_tokens": 645089916.0, + "step": 16909 + }, + { + "epoch": 2.1511258109655262, + "ewc_loss": 0.008347243070602417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.347242692252621e-05, + "grad_norm": 4.1090569496154785, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8666000366210938, + "num_tokens": 645134531.0, + "step": 16910 + }, + { + "epoch": 
2.1512530212441163, + "ewc_loss": 0.008355235680937767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355235331691802e-05, + "grad_norm": 4.132842063903809, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8766852021217346, + "num_tokens": 645172608.0, + "step": 16911 + }, + { + "epoch": 2.151380231522707, + "ewc_loss": 0.00836226437240839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362264634342864e-05, + "grad_norm": 4.092466831207275, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8777288198471069, + "num_tokens": 645209895.0, + "step": 16912 + }, + { + "epoch": 2.1515074418012974, + "ewc_loss": 0.008333563804626465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333563891937956e-05, + "grad_norm": 4.109499931335449, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8892364501953125, + "num_tokens": 645245259.0, + "step": 16913 + }, + { + "epoch": 2.151634652079888, + "ewc_loss": 0.008354690857231617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35469109006226e-05, + "grad_norm": 4.082642555236816, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8763192892074585, + "num_tokens": 645284962.0, + "step": 16914 + }, + { + "epoch": 2.1517618623584784, + "ewc_loss": 0.008326638489961624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326638635480776e-05, + "grad_norm": 4.094080448150635, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8960684537887573, + "num_tokens": 645322352.0, + "step": 16915 + }, + { + "epoch": 2.151889072637069, + "ewc_loss": 0.00834614410996437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34614402265288e-05, + "grad_norm": 4.0869646072387695, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8813483715057373, + "num_tokens": 645358401.0, + "step": 16916 + }, + { + "epoch": 2.1520162829156595, + "ewc_loss": 0.008334165439009666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.334165613632649e-05, + "grad_norm": 4.0703325271606445, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8935878276824951, + "num_tokens": 645399147.0, + "step": 16917 + }, + { + "epoch": 2.15214349319425, + "ewc_loss": 0.008322417736053467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.322417852468789e-05, + "grad_norm": 4.1418657302856445, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8648748993873596, + "num_tokens": 645437286.0, + "step": 16918 + }, + { + "epoch": 2.1522707034728406, + "ewc_loss": 0.008370003663003445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37000334286131e-05, + "grad_norm": 4.087445259094238, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8757413029670715, + "num_tokens": 645477853.0, + "step": 16919 + }, + { + "epoch": 2.152397913751431, + "ewc_loss": 0.008317590691149235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317590254591778e-05, + "grad_norm": 4.107675075531006, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8734380602836609, + "num_tokens": 645520476.0, + "step": 16920 + }, + { + "epoch": 2.1525251240300216, + "ewc_loss": 0.008348550647497177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34855018183589e-05, + "grad_norm": 4.121034145355225, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8695684671401978, + "num_tokens": 645561539.0, + "step": 16921 + }, + { + "epoch": 2.152652334308612, + "ewc_loss": 0.008328234776854515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328234980581328e-05, + "grad_norm": 4.073258399963379, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.87800133228302, + "num_tokens": 645602950.0, + "step": 16922 + }, + { + "epoch": 2.1527795445872027, + "ewc_loss": 0.008293394930660725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.293394785141572e-05, + "grad_norm": 4.165457248687744, + "learning_rate": 1e-06, + "loss": 0.3641, + 
"mean_token_accuracy": 0.8716945648193359, + "num_tokens": 645639215.0, + "step": 16923 + }, + { + "epoch": 2.152906754865793, + "ewc_loss": 0.008369547314941883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3695471403189e-05, + "grad_norm": 4.113162517547607, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8788699507713318, + "num_tokens": 645677287.0, + "step": 16924 + }, + { + "epoch": 2.1530339651443837, + "ewc_loss": 0.008299661800265312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299661567434669e-05, + "grad_norm": 4.137285232543945, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8856472373008728, + "num_tokens": 645710784.0, + "step": 16925 + }, + { + "epoch": 2.1531611754229742, + "ewc_loss": 0.008335985243320465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.335985330631956e-05, + "grad_norm": 4.126683712005615, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8774994015693665, + "num_tokens": 645750568.0, + "step": 16926 + }, + { + "epoch": 2.1532883857015648, + "ewc_loss": 0.008325496688485146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.325497037731111e-05, + "grad_norm": 4.135910511016846, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8870060443878174, + "num_tokens": 645783911.0, + "step": 16927 + }, + { + "epoch": 2.1534155959801553, + "ewc_loss": 0.00833083875477314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330839045811445e-05, + "grad_norm": 4.082802772521973, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8959139585494995, + "num_tokens": 645822524.0, + "step": 16928 + }, + { + "epoch": 2.153542806258746, + "ewc_loss": 0.008298497647047043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298497414216399e-05, + "grad_norm": 4.180243492126465, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8789383172988892, + "num_tokens": 645856924.0, + "step": 16929 + }, + { + "epoch": 
2.1536700165373364, + "ewc_loss": 0.008382975123822689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382974920095876e-05, + "grad_norm": 4.125139236450195, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8746073246002197, + "num_tokens": 645896995.0, + "step": 16930 + }, + { + "epoch": 2.153797226815927, + "ewc_loss": 0.008310840465128422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3108403487131e-05, + "grad_norm": 4.013238906860352, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.888527512550354, + "num_tokens": 645941768.0, + "step": 16931 + }, + { + "epoch": 2.1539244370945174, + "ewc_loss": 0.008273986168205738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.273986168205738e-05, + "grad_norm": 4.1288604736328125, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8817842602729797, + "num_tokens": 645981909.0, + "step": 16932 + }, + { + "epoch": 2.154051647373108, + "ewc_loss": 0.008366446942090988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.366446854779497e-05, + "grad_norm": 4.106929779052734, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8730106353759766, + "num_tokens": 646022034.0, + "step": 16933 + }, + { + "epoch": 2.154178857651698, + "ewc_loss": 0.008324600756168365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324600639753044e-05, + "grad_norm": 4.12611722946167, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8812763690948486, + "num_tokens": 646056714.0, + "step": 16934 + }, + { + "epoch": 2.1543060679302886, + "ewc_loss": 0.008344575762748718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344576053787023e-05, + "grad_norm": 4.0941572189331055, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8746023178100586, + "num_tokens": 646095668.0, + "step": 16935 + }, + { + "epoch": 2.154433278208879, + "ewc_loss": 0.008321925066411495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.321925270138308e-05, + "grad_norm": 4.097869396209717, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.86622154712677, + "num_tokens": 646136745.0, + "step": 16936 + }, + { + "epoch": 2.1545604884874696, + "ewc_loss": 0.0083306347951293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330634591402486e-05, + "grad_norm": 4.089331150054932, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8791393637657166, + "num_tokens": 646180314.0, + "step": 16937 + }, + { + "epoch": 2.15468769876606, + "ewc_loss": 0.008329730480909348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.329730917466804e-05, + "grad_norm": 4.086804389953613, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8843995332717896, + "num_tokens": 646218191.0, + "step": 16938 + }, + { + "epoch": 2.1548149090446507, + "ewc_loss": 0.008314862847328186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31486249808222e-05, + "grad_norm": 4.069016456604004, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8947547674179077, + "num_tokens": 646256350.0, + "step": 16939 + }, + { + "epoch": 2.154942119323241, + "ewc_loss": 0.008308184333145618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.308183896588162e-05, + "grad_norm": 4.068668842315674, + "learning_rate": 1e-06, + "loss": 0.2687, + "mean_token_accuracy": 0.9027114510536194, + "num_tokens": 646290915.0, + "step": 16940 + }, + { + "epoch": 2.1550693296018317, + "ewc_loss": 0.008295778185129166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.295778388855979e-05, + "grad_norm": 4.153054714202881, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8736096620559692, + "num_tokens": 646325838.0, + "step": 16941 + }, + { + "epoch": 2.1551965398804223, + "ewc_loss": 0.008346634916961193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346635149791837e-05, + "grad_norm": 4.130361557006836, + "learning_rate": 1e-06, + "loss": 0.3965, + 
"mean_token_accuracy": 0.8632701635360718, + "num_tokens": 646362742.0, + "step": 16942 + }, + { + "epoch": 2.155323750159013, + "ewc_loss": 0.008317800238728523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317799802171066e-05, + "grad_norm": 4.0941596031188965, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8696972131729126, + "num_tokens": 646403958.0, + "step": 16943 + }, + { + "epoch": 2.1554509604376033, + "ewc_loss": 0.008293655700981617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.293655992019922e-05, + "grad_norm": 4.118322849273682, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8752099275588989, + "num_tokens": 646442872.0, + "step": 16944 + }, + { + "epoch": 2.155578170716194, + "ewc_loss": 0.008339250460267067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33925005281344e-05, + "grad_norm": 4.077005386352539, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8889148235321045, + "num_tokens": 646484642.0, + "step": 16945 + }, + { + "epoch": 2.1557053809947844, + "ewc_loss": 0.008288426324725151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.288426033686846e-05, + "grad_norm": 4.11942195892334, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8896012306213379, + "num_tokens": 646521688.0, + "step": 16946 + }, + { + "epoch": 2.155832591273375, + "ewc_loss": 0.008331314660608768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331314893439412e-05, + "grad_norm": 4.111536502838135, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8881601095199585, + "num_tokens": 646560182.0, + "step": 16947 + }, + { + "epoch": 2.1559598015519654, + "ewc_loss": 0.008304388262331486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304388029500842e-05, + "grad_norm": 4.154595851898193, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8851513266563416, + "num_tokens": 646593252.0, + "step": 16948 + }, + { + "epoch": 
2.156087011830556, + "ewc_loss": 0.008353964425623417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.353964221896604e-05, + "grad_norm": 4.072922229766846, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8786378502845764, + "num_tokens": 646635514.0, + "step": 16949 + }, + { + "epoch": 2.1562142221091465, + "ewc_loss": 0.00827751774340868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.277517918031663e-05, + "grad_norm": 4.1430511474609375, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8682675957679749, + "num_tokens": 646674785.0, + "step": 16950 + }, + { + "epoch": 2.156341432387737, + "ewc_loss": 0.008357737213373184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357736805919558e-05, + "grad_norm": 4.079062461853027, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8810877799987793, + "num_tokens": 646713832.0, + "step": 16951 + }, + { + "epoch": 2.1564686426663275, + "ewc_loss": 0.008304670453071594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304670336656272e-05, + "grad_norm": 4.097191333770752, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8843939304351807, + "num_tokens": 646752218.0, + "step": 16952 + }, + { + "epoch": 2.156595852944918, + "ewc_loss": 0.008342163637280464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.342164073837921e-05, + "grad_norm": 4.14192008972168, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.880253255367279, + "num_tokens": 646786196.0, + "step": 16953 + }, + { + "epoch": 2.1567230632235086, + "ewc_loss": 0.008366784080862999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.366783731617033e-05, + "grad_norm": 4.143409729003906, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8770152926445007, + "num_tokens": 646821462.0, + "step": 16954 + }, + { + "epoch": 2.156850273502099, + "ewc_loss": 0.008336744271218777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.336744213011116e-05, + "grad_norm": 4.103118896484375, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8887113332748413, + "num_tokens": 646856613.0, + "step": 16955 + }, + { + "epoch": 2.1569774837806897, + "ewc_loss": 0.008329934440553188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.329934644280002e-05, + "grad_norm": 4.083576679229736, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8925648927688599, + "num_tokens": 646893013.0, + "step": 16956 + }, + { + "epoch": 2.15710469405928, + "ewc_loss": 0.008337287232279778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.337286999449134e-05, + "grad_norm": 4.059152126312256, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.886272668838501, + "num_tokens": 646936180.0, + "step": 16957 + }, + { + "epoch": 2.1572319043378707, + "ewc_loss": 0.008317189291119576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317189349327236e-05, + "grad_norm": 4.0933027267456055, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8813349604606628, + "num_tokens": 646979314.0, + "step": 16958 + }, + { + "epoch": 2.157359114616461, + "ewc_loss": 0.008347599767148495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.347599941771477e-05, + "grad_norm": 4.078430652618408, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8912827968597412, + "num_tokens": 647020136.0, + "step": 16959 + }, + { + "epoch": 2.1574863248950513, + "ewc_loss": 0.008314681239426136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314681326737627e-05, + "grad_norm": 4.159451484680176, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8862180709838867, + "num_tokens": 647052970.0, + "step": 16960 + }, + { + "epoch": 2.157613535173642, + "ewc_loss": 0.008362213149666786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362212975043803e-05, + "grad_norm": 4.088564395904541, + "learning_rate": 1e-06, + "loss": 0.3541, + 
"mean_token_accuracy": 0.8766695261001587, + "num_tokens": 647093195.0, + "step": 16961 + }, + { + "epoch": 2.1577407454522324, + "ewc_loss": 0.008293877355754375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.293877181131393e-05, + "grad_norm": 4.120858669281006, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8815689086914062, + "num_tokens": 647133800.0, + "step": 16962 + }, + { + "epoch": 2.157867955730823, + "ewc_loss": 0.008338156156241894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.338156476384029e-05, + "grad_norm": 4.18585205078125, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8642331957817078, + "num_tokens": 647167704.0, + "step": 16963 + }, + { + "epoch": 2.1579951660094134, + "ewc_loss": 0.008365812711417675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36581239127554e-05, + "grad_norm": 4.095936298370361, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8699035048484802, + "num_tokens": 647208474.0, + "step": 16964 + }, + { + "epoch": 2.158122376288004, + "ewc_loss": 0.008281955495476723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.281955524580553e-05, + "grad_norm": 4.109887599945068, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8945665955543518, + "num_tokens": 647242131.0, + "step": 16965 + }, + { + "epoch": 2.1582495865665945, + "ewc_loss": 0.00832931138575077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.329311822308227e-05, + "grad_norm": 4.13951301574707, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8649534583091736, + "num_tokens": 647280952.0, + "step": 16966 + }, + { + "epoch": 2.158376796845185, + "ewc_loss": 0.008326019160449505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32601945148781e-05, + "grad_norm": 4.12709903717041, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8760325908660889, + "num_tokens": 647318411.0, + "step": 16967 + }, + { + "epoch": 
2.1585040071237755, + "ewc_loss": 0.008304049260914326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304048969876021e-05, + "grad_norm": 4.096182823181152, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8702072501182556, + "num_tokens": 647361002.0, + "step": 16968 + }, + { + "epoch": 2.158631217402366, + "ewc_loss": 0.00830180011689663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.301799971377477e-05, + "grad_norm": 4.065310001373291, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8841348886489868, + "num_tokens": 647402995.0, + "step": 16969 + }, + { + "epoch": 2.1587584276809566, + "ewc_loss": 0.008312453515827656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312453428516164e-05, + "grad_norm": 4.085243225097656, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.883132815361023, + "num_tokens": 647441220.0, + "step": 16970 + }, + { + "epoch": 2.158885637959547, + "ewc_loss": 0.00833483599126339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.334835729328915e-05, + "grad_norm": 4.122193813323975, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8753001093864441, + "num_tokens": 647476832.0, + "step": 16971 + }, + { + "epoch": 2.1590128482381377, + "ewc_loss": 0.00835525244474411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355252066394314e-05, + "grad_norm": 4.0830841064453125, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8823610544204712, + "num_tokens": 647516855.0, + "step": 16972 + }, + { + "epoch": 2.159140058516728, + "ewc_loss": 0.008321486413478851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321486529894173e-05, + "grad_norm": 4.115167617797852, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8782020807266235, + "num_tokens": 647556913.0, + "step": 16973 + }, + { + "epoch": 2.1592672687953187, + "ewc_loss": 0.008347975090146065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.347974653588608e-05, + "grad_norm": 4.112595081329346, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8971271514892578, + "num_tokens": 647591242.0, + "step": 16974 + }, + { + "epoch": 2.1593944790739092, + "ewc_loss": 0.008336962200701237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33696176414378e-05, + "grad_norm": 4.121099472045898, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.884972333908081, + "num_tokens": 647632156.0, + "step": 16975 + }, + { + "epoch": 2.1595216893524998, + "ewc_loss": 0.008334960788488388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33496087579988e-05, + "grad_norm": 4.116212368011475, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8886390924453735, + "num_tokens": 647669857.0, + "step": 16976 + }, + { + "epoch": 2.1596488996310903, + "ewc_loss": 0.008314263075590134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314262959174812e-05, + "grad_norm": 4.093087196350098, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8689426183700562, + "num_tokens": 647715580.0, + "step": 16977 + }, + { + "epoch": 2.159776109909681, + "ewc_loss": 0.008300850167870522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.300849731313065e-05, + "grad_norm": 4.134384632110596, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8728313446044922, + "num_tokens": 647755917.0, + "step": 16978 + }, + { + "epoch": 2.1599033201882714, + "ewc_loss": 0.008330216631293297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330216951435432e-05, + "grad_norm": 4.030066013336182, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8972433805465698, + "num_tokens": 647802205.0, + "step": 16979 + }, + { + "epoch": 2.160030530466862, + "ewc_loss": 0.008236350491642952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.236350549850613e-05, + "grad_norm": 4.095211029052734, + "learning_rate": 1e-06, + "loss": 0.3927, + 
"mean_token_accuracy": 0.8669142127037048, + "num_tokens": 647846627.0, + "step": 16980 + }, + { + "epoch": 2.1601577407454524, + "ewc_loss": 0.008304175920784473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304175571538508e-05, + "grad_norm": 4.180433750152588, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8726488947868347, + "num_tokens": 647885033.0, + "step": 16981 + }, + { + "epoch": 2.160284951024043, + "ewc_loss": 0.008318808861076832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318808977492154e-05, + "grad_norm": 4.120593547821045, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8711561560630798, + "num_tokens": 647923370.0, + "step": 16982 + }, + { + "epoch": 2.1604121613026335, + "ewc_loss": 0.008253606036305428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.25360621092841e-05, + "grad_norm": 4.0599517822265625, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8862745761871338, + "num_tokens": 647963431.0, + "step": 16983 + }, + { + "epoch": 2.1605393715812236, + "ewc_loss": 0.008235602639615536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.235602581407875e-05, + "grad_norm": 4.146976947784424, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8732351660728455, + "num_tokens": 648002854.0, + "step": 16984 + }, + { + "epoch": 2.160666581859814, + "ewc_loss": 0.008316720835864544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316720777656883e-05, + "grad_norm": 4.1333537101745605, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8706016540527344, + "num_tokens": 648041082.0, + "step": 16985 + }, + { + "epoch": 2.1607937921384046, + "ewc_loss": 0.008264342322945595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.2643426139839e-05, + "grad_norm": 4.140860080718994, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8752629160881042, + "num_tokens": 648082109.0, + "step": 16986 + }, + { + "epoch": 
2.160921002416995, + "ewc_loss": 0.00826244056224823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.262440678663552e-05, + "grad_norm": 4.0898518562316895, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8942563533782959, + "num_tokens": 648117398.0, + "step": 16987 + }, + { + "epoch": 2.1610482126955857, + "ewc_loss": 0.00824459083378315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.244591299444437e-05, + "grad_norm": 4.084259033203125, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8891018629074097, + "num_tokens": 648158997.0, + "step": 16988 + }, + { + "epoch": 2.161175422974176, + "ewc_loss": 0.008252903819084167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.252904081018642e-05, + "grad_norm": 4.132092475891113, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8788135051727295, + "num_tokens": 648194334.0, + "step": 16989 + }, + { + "epoch": 2.1613026332527667, + "ewc_loss": 0.008295364677906036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.295364386867732e-05, + "grad_norm": 4.128665924072266, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8787322044372559, + "num_tokens": 648230601.0, + "step": 16990 + }, + { + "epoch": 2.1614298435313573, + "ewc_loss": 0.00825666543096304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.256665023509413e-05, + "grad_norm": 4.067140102386475, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8753035664558411, + "num_tokens": 648274979.0, + "step": 16991 + }, + { + "epoch": 2.161557053809948, + "ewc_loss": 0.008255302906036377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.255302964244038e-05, + "grad_norm": 4.109475135803223, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8671730160713196, + "num_tokens": 648319229.0, + "step": 16992 + }, + { + "epoch": 2.1616842640885383, + "ewc_loss": 0.00827521551400423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.275215077446774e-05, + "grad_norm": 4.136996746063232, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8922458291053772, + "num_tokens": 648354550.0, + "step": 16993 + }, + { + "epoch": 2.161811474367129, + "ewc_loss": 0.008299555629491806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299556066049263e-05, + "grad_norm": 4.107174873352051, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8926823139190674, + "num_tokens": 648389820.0, + "step": 16994 + }, + { + "epoch": 2.1619386846457194, + "ewc_loss": 0.008261086419224739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.261086622951552e-05, + "grad_norm": 4.137940883636475, + "learning_rate": 1e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.893082320690155, + "num_tokens": 648423841.0, + "step": 16995 + }, + { + "epoch": 2.16206589492431, + "ewc_loss": 0.008283291943371296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.283292117994279e-05, + "grad_norm": 4.168405532836914, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8683786988258362, + "num_tokens": 648459865.0, + "step": 16996 + }, + { + "epoch": 2.1621931052029004, + "ewc_loss": 0.008305233903229237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.305234223371372e-05, + "grad_norm": 4.125226020812988, + "learning_rate": 1e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.898524284362793, + "num_tokens": 648492639.0, + "step": 16997 + }, + { + "epoch": 2.162320315481491, + "ewc_loss": 0.008264110423624516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.264110510936007e-05, + "grad_norm": 4.085876941680908, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8863385915756226, + "num_tokens": 648533456.0, + "step": 16998 + }, + { + "epoch": 2.1624475257600815, + "ewc_loss": 0.008259636349976063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.259636524599046e-05, + "grad_norm": 4.09871244430542, + "learning_rate": 1e-06, + "loss": 0.3053, + 
"mean_token_accuracy": 0.8943403363227844, + "num_tokens": 648571955.0, + "step": 16999 + }, + { + "epoch": 2.162574736038672, + "ewc_loss": 0.008305735886096954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.305735536850989e-05, + "grad_norm": 4.0356292724609375, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8970297574996948, + "num_tokens": 648614284.0, + "step": 17000 + }, + { + "epoch": 2.1627019463172625, + "ewc_loss": 0.008256671018898487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.256670844275504e-05, + "grad_norm": 4.112377166748047, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8891319036483765, + "num_tokens": 648654399.0, + "step": 17001 + }, + { + "epoch": 2.162829156595853, + "ewc_loss": 0.008324972353875637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324972441187128e-05, + "grad_norm": 4.090027332305908, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8843262195587158, + "num_tokens": 648694490.0, + "step": 17002 + }, + { + "epoch": 2.1629563668744436, + "ewc_loss": 0.008259003050625324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.25900278869085e-05, + "grad_norm": 4.121665954589844, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8864430785179138, + "num_tokens": 648730665.0, + "step": 17003 + }, + { + "epoch": 2.163083577153034, + "ewc_loss": 0.008288329467177391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.288329263450578e-05, + "grad_norm": 4.16927433013916, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8864283561706543, + "num_tokens": 648765289.0, + "step": 17004 + }, + { + "epoch": 2.1632107874316246, + "ewc_loss": 0.008312606252729893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312606223626062e-05, + "grad_norm": 4.102747440338135, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8819795846939087, + "num_tokens": 648803982.0, + "step": 17005 + }, + { + "epoch": 
2.163337997710215, + "ewc_loss": 0.008264903910458088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.264904317911714e-05, + "grad_norm": 4.1141462326049805, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.885776937007904, + "num_tokens": 648840992.0, + "step": 17006 + }, + { + "epoch": 2.1634652079888053, + "ewc_loss": 0.008285870775580406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.285870717372745e-05, + "grad_norm": 4.070855140686035, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8765881061553955, + "num_tokens": 648883621.0, + "step": 17007 + }, + { + "epoch": 2.1635924182673962, + "ewc_loss": 0.008270172402262688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.270172838820145e-05, + "grad_norm": 4.153075695037842, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8675456643104553, + "num_tokens": 648922258.0, + "step": 17008 + }, + { + "epoch": 2.1637196285459863, + "ewc_loss": 0.008307547308504581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30754725029692e-05, + "grad_norm": 4.100719928741455, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.881523847579956, + "num_tokens": 648965010.0, + "step": 17009 + }, + { + "epoch": 2.163846838824577, + "ewc_loss": 0.008255858905613422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.255858847405761e-05, + "grad_norm": 4.063801288604736, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8764110207557678, + "num_tokens": 649008124.0, + "step": 17010 + }, + { + "epoch": 2.1639740491031674, + "ewc_loss": 0.008259383961558342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.259384048869833e-05, + "grad_norm": 4.1529622077941895, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8825279474258423, + "num_tokens": 649044743.0, + "step": 17011 + }, + { + "epoch": 2.164101259381758, + "ewc_loss": 0.00832139141857624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.321391214849427e-05, + "grad_norm": 4.103357791900635, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.893151044845581, + "num_tokens": 649078410.0, + "step": 17012 + }, + { + "epoch": 2.1642284696603484, + "ewc_loss": 0.008266677148640156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.266677468782291e-05, + "grad_norm": 4.1436076164245605, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8917321562767029, + "num_tokens": 649115415.0, + "step": 17013 + }, + { + "epoch": 2.164355679938939, + "ewc_loss": 0.00830035749822855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.300357148982584e-05, + "grad_norm": 4.106635093688965, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.87987220287323, + "num_tokens": 649156266.0, + "step": 17014 + }, + { + "epoch": 2.1644828902175295, + "ewc_loss": 0.008251027204096317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.251026883954182e-05, + "grad_norm": 4.136937618255615, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8886061310768127, + "num_tokens": 649192552.0, + "step": 17015 + }, + { + "epoch": 2.16461010049612, + "ewc_loss": 0.008306018076837063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.306017844006419e-05, + "grad_norm": 4.223290920257568, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8831521272659302, + "num_tokens": 649224163.0, + "step": 17016 + }, + { + "epoch": 2.1647373107747105, + "ewc_loss": 0.008327898569405079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.327898831339553e-05, + "grad_norm": 4.09627628326416, + "learning_rate": 1e-06, + "loss": 0.2797, + "mean_token_accuracy": 0.8997830152511597, + "num_tokens": 649259348.0, + "step": 17017 + }, + { + "epoch": 2.164864521053301, + "ewc_loss": 0.008227808400988579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.227808575611562e-05, + "grad_norm": 4.064281463623047, + "learning_rate": 1e-06, + "loss": 0.2971, + 
"mean_token_accuracy": 0.8958478569984436, + "num_tokens": 649296875.0, + "step": 17018 + }, + { + "epoch": 2.1649917313318916, + "ewc_loss": 0.008267528377473354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.26752875582315e-05, + "grad_norm": 4.166995525360107, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8723121285438538, + "num_tokens": 649333455.0, + "step": 17019 + }, + { + "epoch": 2.165118941610482, + "ewc_loss": 0.008339542895555496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.339543273905292e-05, + "grad_norm": 4.1617841720581055, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8849004507064819, + "num_tokens": 649364567.0, + "step": 17020 + }, + { + "epoch": 2.1652461518890727, + "ewc_loss": 0.008281110785901546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.281110785901546e-05, + "grad_norm": 4.14857816696167, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8663698434829712, + "num_tokens": 649400579.0, + "step": 17021 + }, + { + "epoch": 2.165373362167663, + "ewc_loss": 0.008295658975839615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.295659063151106e-05, + "grad_norm": 4.098645210266113, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.893589973449707, + "num_tokens": 649432940.0, + "step": 17022 + }, + { + "epoch": 2.1655005724462537, + "ewc_loss": 0.008296959102153778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.296959276776761e-05, + "grad_norm": 4.120798110961914, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8845374584197998, + "num_tokens": 649466695.0, + "step": 17023 + }, + { + "epoch": 2.1656277827248442, + "ewc_loss": 0.008325113914906979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.325113594764844e-05, + "grad_norm": 4.08091402053833, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8809623122215271, + "num_tokens": 649504578.0, + "step": 17024 + }, + { + "epoch": 
2.1657549930034348, + "ewc_loss": 0.00830975454300642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.309754775837064e-05, + "grad_norm": 4.10684871673584, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8727395534515381, + "num_tokens": 649541536.0, + "step": 17025 + }, + { + "epoch": 2.1658822032820253, + "ewc_loss": 0.008369319140911102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369319402845576e-05, + "grad_norm": 4.1460137367248535, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8792346715927124, + "num_tokens": 649581280.0, + "step": 17026 + }, + { + "epoch": 2.166009413560616, + "ewc_loss": 0.008367307484149933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.367307600565255e-05, + "grad_norm": 4.069802761077881, + "learning_rate": 1e-06, + "loss": 0.2825, + "mean_token_accuracy": 0.902800440788269, + "num_tokens": 649619837.0, + "step": 17027 + }, + { + "epoch": 2.1661366238392064, + "ewc_loss": 0.008325297385454178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.325296948896721e-05, + "grad_norm": 4.148105144500732, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8916248083114624, + "num_tokens": 649654708.0, + "step": 17028 + }, + { + "epoch": 2.166263834117797, + "ewc_loss": 0.008392930030822754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.392929885303602e-05, + "grad_norm": 4.147033214569092, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8627698421478271, + "num_tokens": 649691709.0, + "step": 17029 + }, + { + "epoch": 2.1663910443963874, + "ewc_loss": 0.008349248208105564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34924794617109e-05, + "grad_norm": 4.0868120193481445, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8766009211540222, + "num_tokens": 649734876.0, + "step": 17030 + }, + { + "epoch": 2.166518254674978, + "ewc_loss": 0.0083327516913414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.332751895068213e-05, + "grad_norm": 4.0974931716918945, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8893406987190247, + "num_tokens": 649775477.0, + "step": 17031 + }, + { + "epoch": 2.166645464953568, + "ewc_loss": 0.008360798470675945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360798528883606e-05, + "grad_norm": 4.089478015899658, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8912326097488403, + "num_tokens": 649816136.0, + "step": 17032 + }, + { + "epoch": 2.1667726752321586, + "ewc_loss": 0.00833976361900568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.339763735421002e-05, + "grad_norm": 4.185608863830566, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.889266848564148, + "num_tokens": 649847942.0, + "step": 17033 + }, + { + "epoch": 2.166899885510749, + "ewc_loss": 0.008396010845899582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396010525757447e-05, + "grad_norm": 4.138064861297607, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8784117102622986, + "num_tokens": 649887875.0, + "step": 17034 + }, + { + "epoch": 2.1670270957893396, + "ewc_loss": 0.008333363570272923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333363803103566e-05, + "grad_norm": 4.148365497589111, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8717398047447205, + "num_tokens": 649920557.0, + "step": 17035 + }, + { + "epoch": 2.16715430606793, + "ewc_loss": 0.00836278311908245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362782682524994e-05, + "grad_norm": 4.12516975402832, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8913623094558716, + "num_tokens": 649959704.0, + "step": 17036 + }, + { + "epoch": 2.1672815163465207, + "ewc_loss": 0.008356625214219093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35662503959611e-05, + "grad_norm": 4.080235958099365, + "learning_rate": 1e-06, + "loss": 0.3188, + 
"mean_token_accuracy": 0.8860235214233398, + "num_tokens": 650000433.0, + "step": 17037 + }, + { + "epoch": 2.167408726625111, + "ewc_loss": 0.00832731369882822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.327313844347373e-05, + "grad_norm": 4.094974994659424, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.892767071723938, + "num_tokens": 650038793.0, + "step": 17038 + }, + { + "epoch": 2.1675359369037017, + "ewc_loss": 0.008364212699234486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.364212408196181e-05, + "grad_norm": 4.158817768096924, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8896164894104004, + "num_tokens": 650072877.0, + "step": 17039 + }, + { + "epoch": 2.1676631471822922, + "ewc_loss": 0.00837072916328907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.370729483431205e-05, + "grad_norm": 4.143587589263916, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8701529502868652, + "num_tokens": 650109245.0, + "step": 17040 + }, + { + "epoch": 2.1677903574608828, + "ewc_loss": 0.008347823284566402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.347823313670233e-05, + "grad_norm": 4.0570759773254395, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8942909836769104, + "num_tokens": 650150254.0, + "step": 17041 + }, + { + "epoch": 2.1679175677394733, + "ewc_loss": 0.00832385290414095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323852671310306e-05, + "grad_norm": 4.14136266708374, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8636931777000427, + "num_tokens": 650189587.0, + "step": 17042 + }, + { + "epoch": 2.168044778018064, + "ewc_loss": 0.00837487168610096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37487168610096e-05, + "grad_norm": 4.135434627532959, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8772664666175842, + "num_tokens": 650225443.0, + "step": 17043 + }, + { + "epoch": 
2.1681719882966544, + "ewc_loss": 0.008336336351931095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.336336759384722e-05, + "grad_norm": 4.120677947998047, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8775336742401123, + "num_tokens": 650262319.0, + "step": 17044 + }, + { + "epoch": 2.168299198575245, + "ewc_loss": 0.008318292908370495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318293112097308e-05, + "grad_norm": 4.115418434143066, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8676199913024902, + "num_tokens": 650297137.0, + "step": 17045 + }, + { + "epoch": 2.1684264088538354, + "ewc_loss": 0.008344000205397606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34399979794398e-05, + "grad_norm": 4.113077163696289, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8811398148536682, + "num_tokens": 650335521.0, + "step": 17046 + }, + { + "epoch": 2.168553619132426, + "ewc_loss": 0.008345356211066246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.345356036443263e-05, + "grad_norm": 4.1190385818481445, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8763343691825867, + "num_tokens": 650373050.0, + "step": 17047 + }, + { + "epoch": 2.1686808294110165, + "ewc_loss": 0.00836038775742054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360387437278405e-05, + "grad_norm": 4.113261699676514, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8881656527519226, + "num_tokens": 650407438.0, + "step": 17048 + }, + { + "epoch": 2.168808039689607, + "ewc_loss": 0.008349873125553131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.349872950930148e-05, + "grad_norm": 4.060360431671143, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8777174949645996, + "num_tokens": 650447793.0, + "step": 17049 + }, + { + "epoch": 2.1689352499681975, + "ewc_loss": 0.008322129026055336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.322128996951506e-05, + "grad_norm": 4.161706447601318, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8776888847351074, + "num_tokens": 650487167.0, + "step": 17050 + }, + { + "epoch": 2.169062460246788, + "ewc_loss": 0.008397041819989681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.397042256547138e-05, + "grad_norm": 4.118338584899902, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8709255456924438, + "num_tokens": 650522960.0, + "step": 17051 + }, + { + "epoch": 2.1691896705253786, + "ewc_loss": 0.00834741536527872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.347415132448077e-05, + "grad_norm": 4.098995685577393, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8766985535621643, + "num_tokens": 650564161.0, + "step": 17052 + }, + { + "epoch": 2.169316880803969, + "ewc_loss": 0.008346780203282833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346779941348359e-05, + "grad_norm": 4.135389804840088, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8876430988311768, + "num_tokens": 650599201.0, + "step": 17053 + }, + { + "epoch": 2.1694440910825596, + "ewc_loss": 0.008373129181563854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373129094252363e-05, + "grad_norm": 4.183149337768555, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8636738657951355, + "num_tokens": 650637385.0, + "step": 17054 + }, + { + "epoch": 2.16957130136115, + "ewc_loss": 0.008385973051190376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38597261463292e-05, + "grad_norm": 4.102015972137451, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8816252946853638, + "num_tokens": 650671954.0, + "step": 17055 + }, + { + "epoch": 2.1696985116397407, + "ewc_loss": 0.008310994133353233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31099459901452e-05, + "grad_norm": 4.075557708740234, + "learning_rate": 1e-06, + "loss": 0.2891, + 
"mean_token_accuracy": 0.89876389503479, + "num_tokens": 650708475.0, + "step": 17056 + }, + { + "epoch": 2.169825721918331, + "ewc_loss": 0.008353717625141144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.353717566933483e-05, + "grad_norm": 4.1027727127075195, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8857420682907104, + "num_tokens": 650748089.0, + "step": 17057 + }, + { + "epoch": 2.1699529321969213, + "ewc_loss": 0.00836480874568224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.364808309124783e-05, + "grad_norm": 4.054805278778076, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8836679458618164, + "num_tokens": 650790869.0, + "step": 17058 + }, + { + "epoch": 2.170080142475512, + "ewc_loss": 0.008344821631908417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344821981154382e-05, + "grad_norm": 4.140550136566162, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8704455494880676, + "num_tokens": 650830697.0, + "step": 17059 + }, + { + "epoch": 2.1702073527541024, + "ewc_loss": 0.008401547558605671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40154752950184e-05, + "grad_norm": 4.091899871826172, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8912140727043152, + "num_tokens": 650868755.0, + "step": 17060 + }, + { + "epoch": 2.170334563032693, + "ewc_loss": 0.00833046156913042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330461423611268e-05, + "grad_norm": 4.100261211395264, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8911239504814148, + "num_tokens": 650908019.0, + "step": 17061 + }, + { + "epoch": 2.1704617733112834, + "ewc_loss": 0.00835065171122551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350651478394866e-05, + "grad_norm": 4.080860137939453, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.880976140499115, + "num_tokens": 650954195.0, + "step": 17062 + }, + { + "epoch": 
2.170588983589874, + "ewc_loss": 0.008326095528900623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326095121446997e-05, + "grad_norm": 4.088885307312012, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8946742415428162, + "num_tokens": 650992740.0, + "step": 17063 + }, + { + "epoch": 2.1707161938684645, + "ewc_loss": 0.008327443152666092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.327443356392905e-05, + "grad_norm": 4.0759124755859375, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8945698738098145, + "num_tokens": 651031524.0, + "step": 17064 + }, + { + "epoch": 2.170843404147055, + "ewc_loss": 0.008283017203211784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.283017086796463e-05, + "grad_norm": 4.139449596405029, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8795347213745117, + "num_tokens": 651070004.0, + "step": 17065 + }, + { + "epoch": 2.1709706144256455, + "ewc_loss": 0.00832856260240078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328562398673967e-05, + "grad_norm": 4.144901275634766, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8699573278427124, + "num_tokens": 651104474.0, + "step": 17066 + }, + { + "epoch": 2.171097824704236, + "ewc_loss": 0.008293405175209045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.293404971482232e-05, + "grad_norm": 4.065654277801514, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8772743940353394, + "num_tokens": 651148913.0, + "step": 17067 + }, + { + "epoch": 2.1712250349828266, + "ewc_loss": 0.008260569535195827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.260569302365184e-05, + "grad_norm": 4.05621337890625, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.889006495475769, + "num_tokens": 651192072.0, + "step": 17068 + }, + { + "epoch": 2.171352245261417, + "ewc_loss": 0.008269794285297394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.269794489024207e-05, + "grad_norm": 4.182556629180908, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8754421472549438, + "num_tokens": 651226132.0, + "step": 17069 + }, + { + "epoch": 2.1714794555400077, + "ewc_loss": 0.008346322923898697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346323011210188e-05, + "grad_norm": 4.12200927734375, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.884264349937439, + "num_tokens": 651263467.0, + "step": 17070 + }, + { + "epoch": 2.171606665818598, + "ewc_loss": 0.00825018621981144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.250186510849744e-05, + "grad_norm": 4.055276870727539, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8730567693710327, + "num_tokens": 651309521.0, + "step": 17071 + }, + { + "epoch": 2.1717338760971887, + "ewc_loss": 0.008243322372436523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.243322372436523e-05, + "grad_norm": 4.138171672821045, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8851146697998047, + "num_tokens": 651347026.0, + "step": 17072 + }, + { + "epoch": 2.1718610863757792, + "ewc_loss": 0.008321098051965237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321097993757576e-05, + "grad_norm": 4.15296745300293, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8905613422393799, + "num_tokens": 651380545.0, + "step": 17073 + }, + { + "epoch": 2.1719882966543698, + "ewc_loss": 0.008278245106339455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.278245513793081e-05, + "grad_norm": 4.226541996002197, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8874571323394775, + "num_tokens": 651409428.0, + "step": 17074 + }, + { + "epoch": 2.1721155069329603, + "ewc_loss": 0.008319243788719177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.319244079757482e-05, + "grad_norm": 4.117395877838135, + "learning_rate": 1e-06, + "loss": 0.3772, + 
"mean_token_accuracy": 0.8694542050361633, + "num_tokens": 651447993.0, + "step": 17075 + }, + { + "epoch": 2.172242717211551, + "ewc_loss": 0.008254426531493664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.25442693894729e-05, + "grad_norm": 4.108057498931885, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8763250112533569, + "num_tokens": 651488210.0, + "step": 17076 + }, + { + "epoch": 2.1723699274901414, + "ewc_loss": 0.008298325352370739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298325701616704e-05, + "grad_norm": 4.113228797912598, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8779109716415405, + "num_tokens": 651525742.0, + "step": 17077 + }, + { + "epoch": 2.172497137768732, + "ewc_loss": 0.00830115657299757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.301156776724383e-05, + "grad_norm": 4.087019920349121, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8883311152458191, + "num_tokens": 651565569.0, + "step": 17078 + }, + { + "epoch": 2.1726243480473224, + "ewc_loss": 0.008291629143059254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29162891022861e-05, + "grad_norm": 4.082559108734131, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8774818778038025, + "num_tokens": 651605291.0, + "step": 17079 + }, + { + "epoch": 2.172751558325913, + "ewc_loss": 0.008291284553706646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.291284757433459e-05, + "grad_norm": 4.095811367034912, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.882043719291687, + "num_tokens": 651641998.0, + "step": 17080 + }, + { + "epoch": 2.1728787686045035, + "ewc_loss": 0.008318072184920311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318072650581598e-05, + "grad_norm": 4.094854831695557, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8800649642944336, + "num_tokens": 651683040.0, + "step": 17081 + }, + { + "epoch": 
2.1730059788830935, + "ewc_loss": 0.00832020491361618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.320205233758315e-05, + "grad_norm": 4.132322311401367, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8906745910644531, + "num_tokens": 651718067.0, + "step": 17082 + }, + { + "epoch": 2.173133189161684, + "ewc_loss": 0.008352951146662235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35295140859671e-05, + "grad_norm": 4.051039695739746, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8942930698394775, + "num_tokens": 651761995.0, + "step": 17083 + }, + { + "epoch": 2.1732603994402746, + "ewc_loss": 0.00828817393630743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.288174285553396e-05, + "grad_norm": 4.1287922859191895, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8954828381538391, + "num_tokens": 651797780.0, + "step": 17084 + }, + { + "epoch": 2.173387609718865, + "ewc_loss": 0.00834644865244627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346448885276914e-05, + "grad_norm": 4.158527851104736, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8723128437995911, + "num_tokens": 651832931.0, + "step": 17085 + }, + { + "epoch": 2.1735148199974557, + "ewc_loss": 0.008327255956828594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32725636428222e-05, + "grad_norm": 4.177905082702637, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8770936727523804, + "num_tokens": 651868440.0, + "step": 17086 + }, + { + "epoch": 2.173642030276046, + "ewc_loss": 0.008328640833497047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3286409790162e-05, + "grad_norm": 4.187582492828369, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8849906325340271, + "num_tokens": 651900029.0, + "step": 17087 + }, + { + "epoch": 2.1737692405546367, + "ewc_loss": 0.00833936408162117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.339364285347983e-05, 
+ "grad_norm": 4.0588555335998535, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8786858320236206, + "num_tokens": 651944892.0, + "step": 17088 + }, + { + "epoch": 2.1738964508332272, + "ewc_loss": 0.008275030180811882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.275030268123373e-05, + "grad_norm": 4.1908793449401855, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8893606662750244, + "num_tokens": 651979330.0, + "step": 17089 + }, + { + "epoch": 2.1740236611118178, + "ewc_loss": 0.00838384311646223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.383842941839248e-05, + "grad_norm": 4.097275257110596, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.875532329082489, + "num_tokens": 652017389.0, + "step": 17090 + }, + { + "epoch": 2.1741508713904083, + "ewc_loss": 0.008280366659164429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.280366455437616e-05, + "grad_norm": 4.094485759735107, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8772729635238647, + "num_tokens": 652052260.0, + "step": 17091 + }, + { + "epoch": 2.174278081668999, + "ewc_loss": 0.008346992544829845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346992399310693e-05, + "grad_norm": 4.153986930847168, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8692325353622437, + "num_tokens": 652089917.0, + "step": 17092 + }, + { + "epoch": 2.1744052919475894, + "ewc_loss": 0.008368098177015781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.368098497157916e-05, + "grad_norm": 4.104121685028076, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8839043378829956, + "num_tokens": 652129849.0, + "step": 17093 + }, + { + "epoch": 2.17453250222618, + "ewc_loss": 0.008315474726259708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315474406117573e-05, + "grad_norm": 4.143527030944824, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 
0.8755751252174377, + "num_tokens": 652163178.0, + "step": 17094 + }, + { + "epoch": 2.1746597125047704, + "ewc_loss": 0.008387465961277485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387466368731111e-05, + "grad_norm": 4.13726282119751, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8879749774932861, + "num_tokens": 652201521.0, + "step": 17095 + }, + { + "epoch": 2.174786922783361, + "ewc_loss": 0.008352926932275295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352926670340821e-05, + "grad_norm": 4.178485870361328, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.883203387260437, + "num_tokens": 652234820.0, + "step": 17096 + }, + { + "epoch": 2.1749141330619515, + "ewc_loss": 0.008385823108255863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.385823457501829e-05, + "grad_norm": 4.086989879608154, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8787053823471069, + "num_tokens": 652275686.0, + "step": 17097 + }, + { + "epoch": 2.175041343340542, + "ewc_loss": 0.008350202813744545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350203279405832e-05, + "grad_norm": 4.1190361976623535, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.873837947845459, + "num_tokens": 652313243.0, + "step": 17098 + }, + { + "epoch": 2.1751685536191325, + "ewc_loss": 0.008385336957871914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38533669593744e-05, + "grad_norm": 4.071998119354248, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8945292830467224, + "num_tokens": 652351550.0, + "step": 17099 + }, + { + "epoch": 2.175295763897723, + "ewc_loss": 0.008353478275239468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.353478187927976e-05, + "grad_norm": 4.0563530921936035, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8800711631774902, + "num_tokens": 652399834.0, + "step": 17100 + }, + { + "epoch": 2.1754229741763136, + "ewc_loss": 
0.0083566689863801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356669422937557e-05, + "grad_norm": 4.180783271789551, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8686478734016418, + "num_tokens": 652435839.0, + "step": 17101 + }, + { + "epoch": 2.175550184454904, + "ewc_loss": 0.008434613235294819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.434613118879497e-05, + "grad_norm": 4.0681843757629395, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8795871734619141, + "num_tokens": 652475747.0, + "step": 17102 + }, + { + "epoch": 2.1756773947334946, + "ewc_loss": 0.008334570564329624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.334570156875998e-05, + "grad_norm": 4.089717864990234, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8916407823562622, + "num_tokens": 652517147.0, + "step": 17103 + }, + { + "epoch": 2.175804605012085, + "ewc_loss": 0.00838258396834135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382584201171994e-05, + "grad_norm": 4.0934295654296875, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8942769765853882, + "num_tokens": 652552811.0, + "step": 17104 + }, + { + "epoch": 2.1759318152906753, + "ewc_loss": 0.008363778702914715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363778761122376e-05, + "grad_norm": 4.169430255889893, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8893703818321228, + "num_tokens": 652587709.0, + "step": 17105 + }, + { + "epoch": 2.1760590255692662, + "ewc_loss": 0.008385746739804745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.385746332351118e-05, + "grad_norm": 4.1249237060546875, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8827102780342102, + "num_tokens": 652626213.0, + "step": 17106 + }, + { + "epoch": 2.1761862358478563, + "ewc_loss": 0.008339951746165752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.339952182723209e-05, + "grad_norm": 
4.183871746063232, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8556170463562012, + "num_tokens": 652665692.0, + "step": 17107 + }, + { + "epoch": 2.176313446126447, + "ewc_loss": 0.008389122784137726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3891223766841e-05, + "grad_norm": 4.139663219451904, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8737617135047913, + "num_tokens": 652705515.0, + "step": 17108 + }, + { + "epoch": 2.1764406564050374, + "ewc_loss": 0.00831427052617073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314270962728187e-05, + "grad_norm": 4.097794055938721, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.892687201499939, + "num_tokens": 652743343.0, + "step": 17109 + }, + { + "epoch": 2.176567866683628, + "ewc_loss": 0.008310847915709019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.310847624670714e-05, + "grad_norm": 4.088244915008545, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8761516809463501, + "num_tokens": 652784587.0, + "step": 17110 + }, + { + "epoch": 2.1766950769622184, + "ewc_loss": 0.008317696861922741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317697211168706e-05, + "grad_norm": 4.14796781539917, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8721900582313538, + "num_tokens": 652821985.0, + "step": 17111 + }, + { + "epoch": 2.176822287240809, + "ewc_loss": 0.00833806674927473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.338066982105374e-05, + "grad_norm": 4.108264446258545, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8914865255355835, + "num_tokens": 652855772.0, + "step": 17112 + }, + { + "epoch": 2.1769494975193995, + "ewc_loss": 0.00832110270857811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321103086927906e-05, + "grad_norm": 4.306990623474121, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8792091012001038, + "num_tokens": 
652890497.0, + "step": 17113 + }, + { + "epoch": 2.17707670779799, + "ewc_loss": 0.008454829454421997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454829367110506e-05, + "grad_norm": 4.0375800132751465, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8918944597244263, + "num_tokens": 652931611.0, + "step": 17114 + }, + { + "epoch": 2.1772039180765805, + "ewc_loss": 0.00825390126556158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.253901614807546e-05, + "grad_norm": 4.071556568145752, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8826537132263184, + "num_tokens": 652972129.0, + "step": 17115 + }, + { + "epoch": 2.177331128355171, + "ewc_loss": 0.00839569978415966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395699842367321e-05, + "grad_norm": 4.1665496826171875, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8758397102355957, + "num_tokens": 653009848.0, + "step": 17116 + }, + { + "epoch": 2.1774583386337616, + "ewc_loss": 0.008421442471444607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421442908002064e-05, + "grad_norm": 4.150926113128662, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8792372941970825, + "num_tokens": 653044614.0, + "step": 17117 + }, + { + "epoch": 2.177585548912352, + "ewc_loss": 0.008381801657378674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38180203572847e-05, + "grad_norm": 4.0314741134643555, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8949354887008667, + "num_tokens": 653087614.0, + "step": 17118 + }, + { + "epoch": 2.1777127591909426, + "ewc_loss": 0.008330277167260647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33027734188363e-05, + "grad_norm": 4.141819000244141, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8930856585502625, + "num_tokens": 653120434.0, + "step": 17119 + }, + { + "epoch": 2.177839969469533, + "ewc_loss": 0.008408105000853539, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.408105350099504e-05, + "grad_norm": 4.051453113555908, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8938792943954468, + "num_tokens": 653159390.0, + "step": 17120 + }, + { + "epoch": 2.1779671797481237, + "ewc_loss": 0.008341286331415176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.341285865753889e-05, + "grad_norm": 4.1088457107543945, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8749316334724426, + "num_tokens": 653201045.0, + "step": 17121 + }, + { + "epoch": 2.1780943900267142, + "ewc_loss": 0.008416307158768177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.416307537117973e-05, + "grad_norm": 4.07747220993042, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8839900493621826, + "num_tokens": 653244968.0, + "step": 17122 + }, + { + "epoch": 2.1782216003053048, + "ewc_loss": 0.008356274105608463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356274338439107e-05, + "grad_norm": 4.151094436645508, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8763335347175598, + "num_tokens": 653281157.0, + "step": 17123 + }, + { + "epoch": 2.1783488105838953, + "ewc_loss": 0.008410662412643433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410662121605128e-05, + "grad_norm": 4.077125549316406, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8947221636772156, + "num_tokens": 653321055.0, + "step": 17124 + }, + { + "epoch": 2.178476020862486, + "ewc_loss": 0.008330791257321835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330791024491191e-05, + "grad_norm": 4.1478047370910645, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.870306670665741, + "num_tokens": 653359320.0, + "step": 17125 + }, + { + "epoch": 2.1786032311410763, + "ewc_loss": 0.008396035991609097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396035991609097e-05, + "grad_norm": 4.0381550788879395, + 
"learning_rate": 1e-06, + "loss": 0.282, + "mean_token_accuracy": 0.8992829322814941, + "num_tokens": 653404186.0, + "step": 17126 + }, + { + "epoch": 2.178730441419667, + "ewc_loss": 0.008302497677505016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.302497735712677e-05, + "grad_norm": 4.122831344604492, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8922669887542725, + "num_tokens": 653438122.0, + "step": 17127 + }, + { + "epoch": 2.1788576516982574, + "ewc_loss": 0.008397717028856277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.397716737817973e-05, + "grad_norm": 4.14833402633667, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8825534582138062, + "num_tokens": 653474727.0, + "step": 17128 + }, + { + "epoch": 2.178984861976848, + "ewc_loss": 0.00835253857076168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352538134204224e-05, + "grad_norm": 4.081045150756836, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8867391347885132, + "num_tokens": 653513531.0, + "step": 17129 + }, + { + "epoch": 2.179112072255438, + "ewc_loss": 0.00830721016973257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.307210373459384e-05, + "grad_norm": 4.100624084472656, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8921618461608887, + "num_tokens": 653555871.0, + "step": 17130 + }, + { + "epoch": 2.1792392825340285, + "ewc_loss": 0.008326906710863113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32690711831674e-05, + "grad_norm": 4.065733909606934, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8841631412506104, + "num_tokens": 653598825.0, + "step": 17131 + }, + { + "epoch": 2.179366492812619, + "ewc_loss": 0.008299178443849087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299178443849087e-05, + "grad_norm": 4.0693440437316895, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8910608887672424, + "num_tokens": 653639838.0, + 
"step": 17132 + }, + { + "epoch": 2.1794937030912096, + "ewc_loss": 0.008309571072459221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.309570694109425e-05, + "grad_norm": 4.144776821136475, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8735279440879822, + "num_tokens": 653678958.0, + "step": 17133 + }, + { + "epoch": 2.1796209133698, + "ewc_loss": 0.00832302588969469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323026122525334e-05, + "grad_norm": 4.115993499755859, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8907297253608704, + "num_tokens": 653718043.0, + "step": 17134 + }, + { + "epoch": 2.1797481236483907, + "ewc_loss": 0.0082769263535738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.27692638267763e-05, + "grad_norm": 4.092884063720703, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8976094126701355, + "num_tokens": 653756308.0, + "step": 17135 + }, + { + "epoch": 2.179875333926981, + "ewc_loss": 0.008281508460640907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.281508053187281e-05, + "grad_norm": 4.131006717681885, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.891154944896698, + "num_tokens": 653793140.0, + "step": 17136 + }, + { + "epoch": 2.1800025442055717, + "ewc_loss": 0.008289827965199947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.2898281107191e-05, + "grad_norm": 4.181272983551025, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8879455327987671, + "num_tokens": 653823441.0, + "step": 17137 + }, + { + "epoch": 2.1801297544841622, + "ewc_loss": 0.008314409293234348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314409205922857e-05, + "grad_norm": 4.158853054046631, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8731937408447266, + "num_tokens": 653860743.0, + "step": 17138 + }, + { + "epoch": 2.1802569647627528, + "ewc_loss": 0.00829099491238594, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.290995174320415e-05, + "grad_norm": 4.116574764251709, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8942545652389526, + "num_tokens": 653894785.0, + "step": 17139 + }, + { + "epoch": 2.1803841750413433, + "ewc_loss": 0.008274875581264496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.274876017821953e-05, + "grad_norm": 4.099569797515869, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8804460763931274, + "num_tokens": 653932208.0, + "step": 17140 + }, + { + "epoch": 2.180511385319934, + "ewc_loss": 0.008293913677334785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.293913560919464e-05, + "grad_norm": 4.139988899230957, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8716956377029419, + "num_tokens": 653968818.0, + "step": 17141 + }, + { + "epoch": 2.1806385955985244, + "ewc_loss": 0.008321537636220455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321537461597472e-05, + "grad_norm": 4.12498664855957, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8780289888381958, + "num_tokens": 654010696.0, + "step": 17142 + }, + { + "epoch": 2.180765805877115, + "ewc_loss": 0.008293942548334599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29394266474992e-05, + "grad_norm": 4.123482704162598, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8862402439117432, + "num_tokens": 654047632.0, + "step": 17143 + }, + { + "epoch": 2.1808930161557054, + "ewc_loss": 0.00832444429397583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324444206664339e-05, + "grad_norm": 4.110869884490967, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8866472244262695, + "num_tokens": 654088200.0, + "step": 17144 + }, + { + "epoch": 2.181020226434296, + "ewc_loss": 0.008300218731164932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.300218905787915e-05, + "grad_norm": 4.128513336181641, + "learning_rate": 1e-06, + "loss": 0.3387, 
+ "mean_token_accuracy": 0.8805949687957764, + "num_tokens": 654124225.0, + "step": 17145 + }, + { + "epoch": 2.1811474367128865, + "ewc_loss": 0.008332332596182823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.332332799909636e-05, + "grad_norm": 4.107976913452148, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8843012452125549, + "num_tokens": 654163317.0, + "step": 17146 + }, + { + "epoch": 2.181274646991477, + "ewc_loss": 0.00831160694360733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.311607234645635e-05, + "grad_norm": 4.118163585662842, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8762561082839966, + "num_tokens": 654199208.0, + "step": 17147 + }, + { + "epoch": 2.1814018572700675, + "ewc_loss": 0.008322110399603844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32211080705747e-05, + "grad_norm": 4.105706691741943, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8718552589416504, + "num_tokens": 654239988.0, + "step": 17148 + }, + { + "epoch": 2.181529067548658, + "ewc_loss": 0.00829903595149517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299035835079849e-05, + "grad_norm": 4.138265609741211, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8772273659706116, + "num_tokens": 654276361.0, + "step": 17149 + }, + { + "epoch": 2.1816562778272486, + "ewc_loss": 0.008348651230335236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348651317646727e-05, + "grad_norm": 4.083129405975342, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.886928915977478, + "num_tokens": 654316690.0, + "step": 17150 + }, + { + "epoch": 2.181783488105839, + "ewc_loss": 0.00831629429012537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316294406540692e-05, + "grad_norm": 4.081027507781982, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.890814483165741, + "num_tokens": 654354112.0, + "step": 17151 + }, + { + "epoch": 
2.1819106983844296, + "ewc_loss": 0.008333384990692139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333384903380647e-05, + "grad_norm": 4.063605785369873, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8925893902778625, + "num_tokens": 654398896.0, + "step": 17152 + }, + { + "epoch": 2.18203790866302, + "ewc_loss": 0.008317403495311737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317403262481093e-05, + "grad_norm": 4.157649993896484, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8942718505859375, + "num_tokens": 654435180.0, + "step": 17153 + }, + { + "epoch": 2.1821651189416107, + "ewc_loss": 0.00836213305592537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362132939510047e-05, + "grad_norm": 4.1575822830200195, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.871656596660614, + "num_tokens": 654473958.0, + "step": 17154 + }, + { + "epoch": 2.1822923292202008, + "ewc_loss": 0.008345125243067741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.345125388586894e-05, + "grad_norm": 4.1584553718566895, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8707767128944397, + "num_tokens": 654512918.0, + "step": 17155 + }, + { + "epoch": 2.1824195394987913, + "ewc_loss": 0.008339395746588707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.339395571965724e-05, + "grad_norm": 4.17881441116333, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.890291154384613, + "num_tokens": 654552296.0, + "step": 17156 + }, + { + "epoch": 2.182546749777382, + "ewc_loss": 0.008347442373633385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.347442781087011e-05, + "grad_norm": 4.094112873077393, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8899000287055969, + "num_tokens": 654592088.0, + "step": 17157 + }, + { + "epoch": 2.1826739600559724, + "ewc_loss": 0.008311290293931961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.311290730489418e-05, + "grad_norm": 4.1044158935546875, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8763198256492615, + "num_tokens": 654635119.0, + "step": 17158 + }, + { + "epoch": 2.182801170334563, + "ewc_loss": 0.008337324485182762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.337324106832966e-05, + "grad_norm": 4.170379638671875, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8704919815063477, + "num_tokens": 654671204.0, + "step": 17159 + }, + { + "epoch": 2.1829283806131534, + "ewc_loss": 0.008377590216696262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377589983865619e-05, + "grad_norm": 4.190279960632324, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8800761699676514, + "num_tokens": 654702986.0, + "step": 17160 + }, + { + "epoch": 2.183055590891744, + "ewc_loss": 0.008366757072508335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.366756810573861e-05, + "grad_norm": 4.096965789794922, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8612538576126099, + "num_tokens": 654745813.0, + "step": 17161 + }, + { + "epoch": 2.1831828011703345, + "ewc_loss": 0.008321860805153847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321861241711304e-05, + "grad_norm": 4.137352466583252, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8821783065795898, + "num_tokens": 654783305.0, + "step": 17162 + }, + { + "epoch": 2.183310011448925, + "ewc_loss": 0.008379369974136353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.379369683098048e-05, + "grad_norm": 4.073229789733887, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8903212547302246, + "num_tokens": 654823427.0, + "step": 17163 + }, + { + "epoch": 2.1834372217275155, + "ewc_loss": 0.008326258510351181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326258102897555e-05, + "grad_norm": 4.063952922821045, + "learning_rate": 1e-06, + "loss": 0.3133, + 
"mean_token_accuracy": 0.8937849402427673, + "num_tokens": 654862400.0, + "step": 17164 + }, + { + "epoch": 2.183564432006106, + "ewc_loss": 0.008363827131688595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36382678244263e-05, + "grad_norm": 4.11169958114624, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8866791725158691, + "num_tokens": 654902484.0, + "step": 17165 + }, + { + "epoch": 2.1836916422846966, + "ewc_loss": 0.008358033373951912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358032937394455e-05, + "grad_norm": 4.06949520111084, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8889880180358887, + "num_tokens": 654946882.0, + "step": 17166 + }, + { + "epoch": 2.183818852563287, + "ewc_loss": 0.008320565335452557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.320565393660218e-05, + "grad_norm": 4.21414852142334, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8745920062065125, + "num_tokens": 654977850.0, + "step": 17167 + }, + { + "epoch": 2.1839460628418776, + "ewc_loss": 0.008424322120845318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.424322004429996e-05, + "grad_norm": 4.166797161102295, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8690906763076782, + "num_tokens": 655015352.0, + "step": 17168 + }, + { + "epoch": 2.184073273120468, + "ewc_loss": 0.008349468931555748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.349469135282561e-05, + "grad_norm": 4.142636299133301, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8837441205978394, + "num_tokens": 655047488.0, + "step": 17169 + }, + { + "epoch": 2.1842004833990587, + "ewc_loss": 0.008358514867722988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358514605788514e-05, + "grad_norm": 4.165140151977539, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8801931142807007, + "num_tokens": 655083410.0, + "step": 17170 + }, + { + "epoch": 
2.1843276936776492, + "ewc_loss": 0.008372412994503975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.372413140023127e-05, + "grad_norm": 4.090301990509033, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8912725448608398, + "num_tokens": 655120594.0, + "step": 17171 + }, + { + "epoch": 2.1844549039562398, + "ewc_loss": 0.008331270888447762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331271237693727e-05, + "grad_norm": 4.1410956382751465, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8796040415763855, + "num_tokens": 655157310.0, + "step": 17172 + }, + { + "epoch": 2.1845821142348303, + "ewc_loss": 0.00837050098925829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37050101836212e-05, + "grad_norm": 4.113526344299316, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8908432722091675, + "num_tokens": 655193749.0, + "step": 17173 + }, + { + "epoch": 2.184709324513421, + "ewc_loss": 0.008344012312591076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344012167071924e-05, + "grad_norm": 4.113365173339844, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8942170739173889, + "num_tokens": 655235532.0, + "step": 17174 + }, + { + "epoch": 2.1848365347920113, + "ewc_loss": 0.008338069543242455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.338069164892659e-05, + "grad_norm": 4.07081937789917, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8899109363555908, + "num_tokens": 655282154.0, + "step": 17175 + }, + { + "epoch": 2.184963745070602, + "ewc_loss": 0.00830367673188448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303676440846175e-05, + "grad_norm": 4.1299848556518555, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8768343925476074, + "num_tokens": 655321342.0, + "step": 17176 + }, + { + "epoch": 2.1850909553491924, + "ewc_loss": 0.008363651111721992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.363651431864128e-05, + "grad_norm": 4.101269245147705, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8983839750289917, + "num_tokens": 655358107.0, + "step": 17177 + }, + { + "epoch": 2.185218165627783, + "ewc_loss": 0.008316420018672943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316419553011656e-05, + "grad_norm": 4.154036045074463, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.87913978099823, + "num_tokens": 655393462.0, + "step": 17178 + }, + { + "epoch": 2.1853453759063735, + "ewc_loss": 0.008357600308954716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357600745512173e-05, + "grad_norm": 4.102607250213623, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8854839205741882, + "num_tokens": 655436114.0, + "step": 17179 + }, + { + "epoch": 2.1854725861849635, + "ewc_loss": 0.008303023874759674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303023787448183e-05, + "grad_norm": 4.153820037841797, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.885320782661438, + "num_tokens": 655473119.0, + "step": 17180 + }, + { + "epoch": 2.185599796463554, + "ewc_loss": 0.008367752656340599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.367752889171243e-05, + "grad_norm": 4.114985466003418, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8778518438339233, + "num_tokens": 655515537.0, + "step": 17181 + }, + { + "epoch": 2.1857270067421446, + "ewc_loss": 0.008318061009049416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318061009049416e-05, + "grad_norm": 4.124610424041748, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8767269849777222, + "num_tokens": 655555944.0, + "step": 17182 + }, + { + "epoch": 2.185854217020735, + "ewc_loss": 0.008342158049345016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34215825307183e-05, + "grad_norm": 4.21646785736084, + "learning_rate": 1e-06, + "loss": 0.3962, + 
"mean_token_accuracy": 0.8673180937767029, + "num_tokens": 655589952.0, + "step": 17183 + }, + { + "epoch": 2.1859814272993257, + "ewc_loss": 0.008381073363125324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.381072984775528e-05, + "grad_norm": 4.101241588592529, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8946433067321777, + "num_tokens": 655628269.0, + "step": 17184 + }, + { + "epoch": 2.186108637577916, + "ewc_loss": 0.008276545442640781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.276545850094408e-05, + "grad_norm": 4.208343982696533, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8868964314460754, + "num_tokens": 655657315.0, + "step": 17185 + }, + { + "epoch": 2.1862358478565067, + "ewc_loss": 0.008389672264456749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389672439079732e-05, + "grad_norm": 4.132174968719482, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8821440935134888, + "num_tokens": 655696117.0, + "step": 17186 + }, + { + "epoch": 2.1863630581350972, + "ewc_loss": 0.008305826224386692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.305826486321166e-05, + "grad_norm": 4.10270881652832, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8671997785568237, + "num_tokens": 655738823.0, + "step": 17187 + }, + { + "epoch": 2.1864902684136878, + "ewc_loss": 0.008312642574310303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312642603414133e-05, + "grad_norm": 4.055406093597412, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8747531175613403, + "num_tokens": 655781457.0, + "step": 17188 + }, + { + "epoch": 2.1866174786922783, + "ewc_loss": 0.00830422155559063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.304221410071477e-05, + "grad_norm": 4.10049295425415, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8779397010803223, + "num_tokens": 655824704.0, + "step": 17189 + }, + { + "epoch": 
2.186744688970869, + "ewc_loss": 0.008326747454702854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326747774844989e-05, + "grad_norm": 4.105760097503662, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8919023275375366, + "num_tokens": 655860013.0, + "step": 17190 + }, + { + "epoch": 2.1868718992494594, + "ewc_loss": 0.008310701698064804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31070210551843e-05, + "grad_norm": 4.14142370223999, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8801685571670532, + "num_tokens": 655897146.0, + "step": 17191 + }, + { + "epoch": 2.18699910952805, + "ewc_loss": 0.008348526433110237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348526898771524e-05, + "grad_norm": 4.098276615142822, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8871290683746338, + "num_tokens": 655934299.0, + "step": 17192 + }, + { + "epoch": 2.1871263198066404, + "ewc_loss": 0.008280284702777863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.280284237116575e-05, + "grad_norm": 4.130917072296143, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8866048455238342, + "num_tokens": 655976554.0, + "step": 17193 + }, + { + "epoch": 2.187253530085231, + "ewc_loss": 0.008333459496498108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333459118148312e-05, + "grad_norm": 4.130250930786133, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8943683505058289, + "num_tokens": 656011756.0, + "step": 17194 + }, + { + "epoch": 2.1873807403638215, + "ewc_loss": 0.00830324087291956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303240610985085e-05, + "grad_norm": 4.108272075653076, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8797169923782349, + "num_tokens": 656054405.0, + "step": 17195 + }, + { + "epoch": 2.187507950642412, + "ewc_loss": 0.008291756734251976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29175696708262e-05, 
+ "grad_norm": 4.109123706817627, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8871172070503235, + "num_tokens": 656094668.0, + "step": 17196 + }, + { + "epoch": 2.1876351609210025, + "ewc_loss": 0.008291566744446754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.291566336993128e-05, + "grad_norm": 4.091023921966553, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8831067085266113, + "num_tokens": 656137664.0, + "step": 17197 + }, + { + "epoch": 2.187762371199593, + "ewc_loss": 0.008286623284220695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286623051390052e-05, + "grad_norm": 4.126049041748047, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.87489914894104, + "num_tokens": 656180611.0, + "step": 17198 + }, + { + "epoch": 2.1878895814781836, + "ewc_loss": 0.008297166787087917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.297166641568765e-05, + "grad_norm": 4.173893451690674, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.874082088470459, + "num_tokens": 656217789.0, + "step": 17199 + }, + { + "epoch": 2.188016791756774, + "ewc_loss": 0.008307980373501778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.307980169774964e-05, + "grad_norm": 4.102055549621582, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8800882697105408, + "num_tokens": 656261940.0, + "step": 17200 + }, + { + "epoch": 2.1881440020353646, + "ewc_loss": 0.008235498331487179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.235498535213992e-05, + "grad_norm": 4.11186408996582, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8920974731445312, + "num_tokens": 656296000.0, + "step": 17201 + }, + { + "epoch": 2.188271212313955, + "ewc_loss": 0.008289746940135956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.289747347589582e-05, + "grad_norm": 4.084782600402832, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8933242559432983, 
+ "num_tokens": 656340862.0, + "step": 17202 + }, + { + "epoch": 2.1883984225925452, + "ewc_loss": 0.008240330964326859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.240330498665571e-05, + "grad_norm": 4.115060806274414, + "learning_rate": 1e-06, + "loss": 0.275, + "mean_token_accuracy": 0.9040290117263794, + "num_tokens": 656376923.0, + "step": 17203 + }, + { + "epoch": 2.188525632871136, + "ewc_loss": 0.008267662487924099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.267662633443251e-05, + "grad_norm": 4.143218040466309, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8799166083335876, + "num_tokens": 656414058.0, + "step": 17204 + }, + { + "epoch": 2.1886528431497263, + "ewc_loss": 0.008289378136396408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.289377728942782e-05, + "grad_norm": 4.13820219039917, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8758402466773987, + "num_tokens": 656450077.0, + "step": 17205 + }, + { + "epoch": 2.188780053428317, + "ewc_loss": 0.00825969222933054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.259692549472675e-05, + "grad_norm": 4.129848003387451, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8738439679145813, + "num_tokens": 656487258.0, + "step": 17206 + }, + { + "epoch": 2.1889072637069074, + "ewc_loss": 0.008272526785731316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.272526611108333e-05, + "grad_norm": 4.094624996185303, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8966900110244751, + "num_tokens": 656525141.0, + "step": 17207 + }, + { + "epoch": 2.189034473985498, + "ewc_loss": 0.008264478296041489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.264478674391285e-05, + "grad_norm": 4.127786636352539, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8854318857192993, + "num_tokens": 656561517.0, + "step": 17208 + }, + { + "epoch": 2.1891616842640884, + "ewc_loss": 0.008303423412144184, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303423237521201e-05, + "grad_norm": 4.148066997528076, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8897140026092529, + "num_tokens": 656595859.0, + "step": 17209 + }, + { + "epoch": 2.189288894542679, + "ewc_loss": 0.008266670629382133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.266670192824677e-05, + "grad_norm": 4.134247303009033, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8785382509231567, + "num_tokens": 656627652.0, + "step": 17210 + }, + { + "epoch": 2.1894161048212695, + "ewc_loss": 0.008287276141345501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.287275704788044e-05, + "grad_norm": 4.108968734741211, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8885657787322998, + "num_tokens": 656669951.0, + "step": 17211 + }, + { + "epoch": 2.18954331509986, + "ewc_loss": 0.008277041837573051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.277042070403695e-05, + "grad_norm": 4.160841941833496, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8948932886123657, + "num_tokens": 656702216.0, + "step": 17212 + }, + { + "epoch": 2.1896705253784505, + "ewc_loss": 0.008319473825395107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31947400001809e-05, + "grad_norm": 4.074294090270996, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8725796341896057, + "num_tokens": 656744816.0, + "step": 17213 + }, + { + "epoch": 2.189797735657041, + "ewc_loss": 0.00824340246617794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.24340240797028e-05, + "grad_norm": 4.114647388458252, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.877590537071228, + "num_tokens": 656785533.0, + "step": 17214 + }, + { + "epoch": 2.1899249459356316, + "ewc_loss": 0.008311756886541843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.311757119372487e-05, + "grad_norm": 4.095571041107178, + "learning_rate": 
1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8796015977859497, + "num_tokens": 656825375.0, + "step": 17215 + }, + { + "epoch": 2.190052156214222, + "ewc_loss": 0.008285476826131344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.285477088065818e-05, + "grad_norm": 4.19098424911499, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8916281461715698, + "num_tokens": 656857951.0, + "step": 17216 + }, + { + "epoch": 2.1901793664928126, + "ewc_loss": 0.008359878323972225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.359878120245412e-05, + "grad_norm": 4.10153341293335, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8925871849060059, + "num_tokens": 656896911.0, + "step": 17217 + }, + { + "epoch": 2.190306576771403, + "ewc_loss": 0.008274160325527191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.274160791188478e-05, + "grad_norm": 4.093553066253662, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8905156850814819, + "num_tokens": 656935723.0, + "step": 17218 + }, + { + "epoch": 2.1904337870499937, + "ewc_loss": 0.008299598470330238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299598266603425e-05, + "grad_norm": 4.098301410675049, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.893169641494751, + "num_tokens": 656970881.0, + "step": 17219 + }, + { + "epoch": 2.1905609973285842, + "ewc_loss": 0.008314327336847782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314326987601817e-05, + "grad_norm": 4.148624420166016, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8775631785392761, + "num_tokens": 657007721.0, + "step": 17220 + }, + { + "epoch": 2.1906882076071748, + "ewc_loss": 0.008342307060956955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34230741020292e-05, + "grad_norm": 4.149754524230957, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8944705724716187, + "num_tokens": 657042359.0, + "step": 17221 + }, + 
{ + "epoch": 2.1908154178857653, + "ewc_loss": 0.008318117819726467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318117761518806e-05, + "grad_norm": 4.07430362701416, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8866443037986755, + "num_tokens": 657084662.0, + "step": 17222 + }, + { + "epoch": 2.190942628164356, + "ewc_loss": 0.008289321325719357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.289320976473391e-05, + "grad_norm": 4.101931095123291, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8843020796775818, + "num_tokens": 657122002.0, + "step": 17223 + }, + { + "epoch": 2.1910698384429463, + "ewc_loss": 0.008312927559018135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312927820952609e-05, + "grad_norm": 4.123355388641357, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8853445053100586, + "num_tokens": 657158768.0, + "step": 17224 + }, + { + "epoch": 2.191197048721537, + "ewc_loss": 0.008316587656736374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316587627632543e-05, + "grad_norm": 4.159107208251953, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8729453086853027, + "num_tokens": 657193445.0, + "step": 17225 + }, + { + "epoch": 2.1913242590001274, + "ewc_loss": 0.008338219486176968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.338219777215272e-05, + "grad_norm": 4.082556247711182, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8821884393692017, + "num_tokens": 657234922.0, + "step": 17226 + }, + { + "epoch": 2.191451469278718, + "ewc_loss": 0.008286294527351856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286294178105891e-05, + "grad_norm": 4.220882415771484, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8754726648330688, + "num_tokens": 657270774.0, + "step": 17227 + }, + { + "epoch": 2.191578679557308, + "ewc_loss": 0.008390865288674831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.390865696128458e-05, + "grad_norm": 4.074371337890625, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8963041305541992, + "num_tokens": 657307046.0, + "step": 17228 + }, + { + "epoch": 2.1917058898358985, + "ewc_loss": 0.008271371945738792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.271371916634962e-05, + "grad_norm": 4.083410739898682, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8743951320648193, + "num_tokens": 657352865.0, + "step": 17229 + }, + { + "epoch": 2.191833100114489, + "ewc_loss": 0.008354755118489265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354755118489265e-05, + "grad_norm": 4.155463218688965, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8647058606147766, + "num_tokens": 657393372.0, + "step": 17230 + }, + { + "epoch": 2.1919603103930796, + "ewc_loss": 0.008366125635802746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.366125985048711e-05, + "grad_norm": 4.1336989402771, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8621776103973389, + "num_tokens": 657432445.0, + "step": 17231 + }, + { + "epoch": 2.19208752067167, + "ewc_loss": 0.008339066058397293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.339065971085802e-05, + "grad_norm": 4.131322383880615, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.880858302116394, + "num_tokens": 657469145.0, + "step": 17232 + }, + { + "epoch": 2.1922147309502606, + "ewc_loss": 0.008358861319720745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35886166896671e-05, + "grad_norm": 4.110872745513916, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8979413509368896, + "num_tokens": 657510369.0, + "step": 17233 + }, + { + "epoch": 2.192341941228851, + "ewc_loss": 0.008333797566592693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333797450177372e-05, + "grad_norm": 4.073056697845459, + "learning_rate": 1e-06, + "loss": 0.321, + 
"mean_token_accuracy": 0.887717604637146, + "num_tokens": 657550257.0, + "step": 17234 + }, + { + "epoch": 2.1924691515074417, + "ewc_loss": 0.00831750687211752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317506581079215e-05, + "grad_norm": 4.10526180267334, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8862063884735107, + "num_tokens": 657587095.0, + "step": 17235 + }, + { + "epoch": 2.1925963617860322, + "ewc_loss": 0.008344086818397045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344086381839588e-05, + "grad_norm": 4.168979644775391, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8868530988693237, + "num_tokens": 657619710.0, + "step": 17236 + }, + { + "epoch": 2.1927235720646228, + "ewc_loss": 0.008383995853364468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.383995736949146e-05, + "grad_norm": 4.169686317443848, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8784183263778687, + "num_tokens": 657653316.0, + "step": 17237 + }, + { + "epoch": 2.1928507823432133, + "ewc_loss": 0.008349653333425522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3496532170102e-05, + "grad_norm": 4.1059417724609375, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8787197470664978, + "num_tokens": 657693570.0, + "step": 17238 + }, + { + "epoch": 2.192977992621804, + "ewc_loss": 0.008318498730659485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31849902169779e-05, + "grad_norm": 4.06018590927124, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8829107880592346, + "num_tokens": 657736781.0, + "step": 17239 + }, + { + "epoch": 2.1931052029003943, + "ewc_loss": 0.00832865759730339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328657713718712e-05, + "grad_norm": 4.143962383270264, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8746923208236694, + "num_tokens": 657775603.0, + "step": 17240 + }, + { + "epoch": 
2.193232413178985, + "ewc_loss": 0.008379637263715267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37963743833825e-05, + "grad_norm": 4.18609094619751, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8800950646400452, + "num_tokens": 657809065.0, + "step": 17241 + }, + { + "epoch": 2.1933596234575754, + "ewc_loss": 0.008384401910007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.384401735384017e-05, + "grad_norm": 4.121882438659668, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8800532817840576, + "num_tokens": 657851369.0, + "step": 17242 + }, + { + "epoch": 2.193486833736166, + "ewc_loss": 0.008327513933181763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.327513933181763e-05, + "grad_norm": 4.094146728515625, + "learning_rate": 1e-06, + "loss": 0.2841, + "mean_token_accuracy": 0.9023655652999878, + "num_tokens": 657889008.0, + "step": 17243 + }, + { + "epoch": 2.1936140440147565, + "ewc_loss": 0.00833830051124096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33830054034479e-05, + "grad_norm": 4.174784183502197, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.880679726600647, + "num_tokens": 657925867.0, + "step": 17244 + }, + { + "epoch": 2.193741254293347, + "ewc_loss": 0.008390519767999649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390519360546023e-05, + "grad_norm": 4.178518295288086, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8725250959396362, + "num_tokens": 657959502.0, + "step": 17245 + }, + { + "epoch": 2.1938684645719375, + "ewc_loss": 0.008333047851920128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33304802654311e-05, + "grad_norm": 4.0690016746521, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8864362239837646, + "num_tokens": 657999063.0, + "step": 17246 + }, + { + "epoch": 2.193995674850528, + "ewc_loss": 0.00830769818276167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.307698590215296e-05, + 
"grad_norm": 4.193645000457764, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8725497722625732, + "num_tokens": 658033016.0, + "step": 17247 + }, + { + "epoch": 2.1941228851291186, + "ewc_loss": 0.008388695307075977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388695277972147e-05, + "grad_norm": 4.126373291015625, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8895483613014221, + "num_tokens": 658071259.0, + "step": 17248 + }, + { + "epoch": 2.194250095407709, + "ewc_loss": 0.008313755504786968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.313755097333342e-05, + "grad_norm": 4.188477516174316, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8847764730453491, + "num_tokens": 658103190.0, + "step": 17249 + }, + { + "epoch": 2.1943773056862996, + "ewc_loss": 0.008391707204282284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391706796828657e-05, + "grad_norm": 4.104530334472656, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8757572174072266, + "num_tokens": 658143346.0, + "step": 17250 + }, + { + "epoch": 2.19450451596489, + "ewc_loss": 0.008328412659466267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328412513947114e-05, + "grad_norm": 4.139011383056641, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8901006579399109, + "num_tokens": 658181747.0, + "step": 17251 + }, + { + "epoch": 2.1946317262434807, + "ewc_loss": 0.008363759145140648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363759116036817e-05, + "grad_norm": 4.090551376342773, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8843545913696289, + "num_tokens": 658221770.0, + "step": 17252 + }, + { + "epoch": 2.1947589365220708, + "ewc_loss": 0.008315663784742355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315663581015542e-05, + "grad_norm": 4.115503311157227, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 
0.8871371150016785, + "num_tokens": 658260157.0, + "step": 17253 + }, + { + "epoch": 2.1948861468006613, + "ewc_loss": 0.008357184939086437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357184560736641e-05, + "grad_norm": 4.1074018478393555, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8733792901039124, + "num_tokens": 658301815.0, + "step": 17254 + }, + { + "epoch": 2.195013357079252, + "ewc_loss": 0.008342218585312366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.342218643520027e-05, + "grad_norm": 4.104825019836426, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.885002613067627, + "num_tokens": 658340732.0, + "step": 17255 + }, + { + "epoch": 2.1951405673578424, + "ewc_loss": 0.00834443885833025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344438538188115e-05, + "grad_norm": 4.137930870056152, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8779953122138977, + "num_tokens": 658380569.0, + "step": 17256 + }, + { + "epoch": 2.195267777636433, + "ewc_loss": 0.00835658609867096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356585749424994e-05, + "grad_norm": 4.132826328277588, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8944907188415527, + "num_tokens": 658415842.0, + "step": 17257 + }, + { + "epoch": 2.1953949879150234, + "ewc_loss": 0.008344518952071667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344519301317632e-05, + "grad_norm": 4.129972457885742, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8879294395446777, + "num_tokens": 658452861.0, + "step": 17258 + }, + { + "epoch": 2.195522198193614, + "ewc_loss": 0.008327232673764229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.327233081217855e-05, + "grad_norm": 4.155200481414795, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.883958101272583, + "num_tokens": 658487019.0, + "step": 17259 + }, + { + "epoch": 2.1956494084722045, + "ewc_loss": 
0.008350475691258907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350476127816364e-05, + "grad_norm": 4.108796119689941, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.882333517074585, + "num_tokens": 658529290.0, + "step": 17260 + }, + { + "epoch": 2.195776618750795, + "ewc_loss": 0.008290756493806839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29075652291067e-05, + "grad_norm": 4.088573455810547, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8787715435028076, + "num_tokens": 658572118.0, + "step": 17261 + }, + { + "epoch": 2.1959038290293855, + "ewc_loss": 0.008289027959108353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28902775538154e-05, + "grad_norm": 4.100765228271484, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.878809928894043, + "num_tokens": 658615972.0, + "step": 17262 + }, + { + "epoch": 2.196031039307976, + "ewc_loss": 0.00831439346075058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314393198816106e-05, + "grad_norm": 4.123205184936523, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.889299750328064, + "num_tokens": 658653848.0, + "step": 17263 + }, + { + "epoch": 2.1961582495865666, + "ewc_loss": 0.008298814296722412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298814645968378e-05, + "grad_norm": 4.1717848777771, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8826808929443359, + "num_tokens": 658686247.0, + "step": 17264 + }, + { + "epoch": 2.196285459865157, + "ewc_loss": 0.008342241868376732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.342241926584393e-05, + "grad_norm": 4.118022441864014, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8801358938217163, + "num_tokens": 658725787.0, + "step": 17265 + }, + { + "epoch": 2.1964126701437476, + "ewc_loss": 0.008276971988379955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.276972221210599e-05, + "grad_norm": 4.07765531539917, + 
"learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8792139291763306, + "num_tokens": 658765957.0, + "step": 17266 + }, + { + "epoch": 2.196539880422338, + "ewc_loss": 0.00829248782247305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.292487473227084e-05, + "grad_norm": 4.167502403259277, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8925712704658508, + "num_tokens": 658800996.0, + "step": 17267 + }, + { + "epoch": 2.1966670907009287, + "ewc_loss": 0.008354013785719872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35401369840838e-05, + "grad_norm": 4.174586772918701, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8806691765785217, + "num_tokens": 658833779.0, + "step": 17268 + }, + { + "epoch": 2.196794300979519, + "ewc_loss": 0.008306681178510189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.306681411340833e-05, + "grad_norm": 4.087698936462402, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.886343240737915, + "num_tokens": 658874425.0, + "step": 17269 + }, + { + "epoch": 2.1969215112581097, + "ewc_loss": 0.008268779143691063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.268778765341267e-05, + "grad_norm": 4.093293190002441, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8906859159469604, + "num_tokens": 658912007.0, + "step": 17270 + }, + { + "epoch": 2.1970487215367003, + "ewc_loss": 0.00829765759408474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.297657768707722e-05, + "grad_norm": 4.072147369384766, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8749783635139465, + "num_tokens": 658956775.0, + "step": 17271 + }, + { + "epoch": 2.197175931815291, + "ewc_loss": 0.008289244957268238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.289244578918442e-05, + "grad_norm": 4.1095290184021, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8949288725852966, + "num_tokens": 658997132.0, + "step": 
17272 + }, + { + "epoch": 2.1973031420938813, + "ewc_loss": 0.008311552926898003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31155339255929e-05, + "grad_norm": 4.086600303649902, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8893313407897949, + "num_tokens": 659040437.0, + "step": 17273 + }, + { + "epoch": 2.197430352372472, + "ewc_loss": 0.008253098465502262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.25309834908694e-05, + "grad_norm": 4.116837501525879, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8966119885444641, + "num_tokens": 659074461.0, + "step": 17274 + }, + { + "epoch": 2.1975575626510624, + "ewc_loss": 0.008289815858006477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.289815741591156e-05, + "grad_norm": 4.117183685302734, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8798221945762634, + "num_tokens": 659113818.0, + "step": 17275 + }, + { + "epoch": 2.197684772929653, + "ewc_loss": 0.008271460421383381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.271460683317855e-05, + "grad_norm": 4.124731540679932, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8712983131408691, + "num_tokens": 659149297.0, + "step": 17276 + }, + { + "epoch": 2.1978119832082434, + "ewc_loss": 0.008271471597254276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.271471597254276e-05, + "grad_norm": 4.130109786987305, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8858641386032104, + "num_tokens": 659187500.0, + "step": 17277 + }, + { + "epoch": 2.1979391934868335, + "ewc_loss": 0.008273179642856121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.273179992102087e-05, + "grad_norm": 4.130692005157471, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8594878315925598, + "num_tokens": 659228624.0, + "step": 17278 + }, + { + "epoch": 2.198066403765424, + "ewc_loss": 0.008277391083538532, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.277391316369176e-05, + "grad_norm": 4.091174602508545, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.890409529209137, + "num_tokens": 659270317.0, + "step": 17279 + }, + { + "epoch": 2.1981936140440146, + "ewc_loss": 0.008255338296294212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.255337888840586e-05, + "grad_norm": 4.162357330322266, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8833111524581909, + "num_tokens": 659302133.0, + "step": 17280 + }, + { + "epoch": 2.198320824322605, + "ewc_loss": 0.00833332072943449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333320874953642e-05, + "grad_norm": 4.106656074523926, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8884631395339966, + "num_tokens": 659342469.0, + "step": 17281 + }, + { + "epoch": 2.1984480346011956, + "ewc_loss": 0.008267790079116821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.2677899627015e-05, + "grad_norm": 4.149753570556641, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8935650587081909, + "num_tokens": 659373786.0, + "step": 17282 + }, + { + "epoch": 2.198575244879786, + "ewc_loss": 0.008324774913489819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324775262735784e-05, + "grad_norm": 4.153261184692383, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8768002986907959, + "num_tokens": 659412498.0, + "step": 17283 + }, + { + "epoch": 2.1987024551583767, + "ewc_loss": 0.008301209658384323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.301209891214967e-05, + "grad_norm": 4.147346496582031, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8684712648391724, + "num_tokens": 659446415.0, + "step": 17284 + }, + { + "epoch": 2.1988296654369672, + "ewc_loss": 0.008321810513734818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321810310008004e-05, + "grad_norm": 4.085756301879883, + "learning_rate": 1e-06, + "loss": 0.3614, + 
"mean_token_accuracy": 0.8752753138542175, + "num_tokens": 659488824.0, + "step": 17285 + }, + { + "epoch": 2.1989568757155578, + "ewc_loss": 0.008314833045005798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314833394251764e-05, + "grad_norm": 4.154545307159424, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8860052227973938, + "num_tokens": 659523036.0, + "step": 17286 + }, + { + "epoch": 2.1990840859941483, + "ewc_loss": 0.008366182446479797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36618200992234e-05, + "grad_norm": 4.071096897125244, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8767862319946289, + "num_tokens": 659562780.0, + "step": 17287 + }, + { + "epoch": 2.199211296272739, + "ewc_loss": 0.008306704461574554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.306704694405198e-05, + "grad_norm": 4.0974836349487305, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8898528814315796, + "num_tokens": 659599364.0, + "step": 17288 + }, + { + "epoch": 2.1993385065513293, + "ewc_loss": 0.008355480618774891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3554805314634e-05, + "grad_norm": 4.148244857788086, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8761320114135742, + "num_tokens": 659636275.0, + "step": 17289 + }, + { + "epoch": 2.19946571682992, + "ewc_loss": 0.008371297270059586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371297008125111e-05, + "grad_norm": 4.126308917999268, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.882899284362793, + "num_tokens": 659670315.0, + "step": 17290 + }, + { + "epoch": 2.1995929271085104, + "ewc_loss": 0.008352577686309814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352578151971102e-05, + "grad_norm": 4.12087345123291, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8780818581581116, + "num_tokens": 659710968.0, + "step": 17291 + }, + { + "epoch": 
2.199720137387101, + "ewc_loss": 0.008357254788279533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357255137525499e-05, + "grad_norm": 4.107614994049072, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8912277817726135, + "num_tokens": 659751334.0, + "step": 17292 + }, + { + "epoch": 2.1998473476656915, + "ewc_loss": 0.008366531692445278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.366531983483583e-05, + "grad_norm": 4.100090503692627, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8839313983917236, + "num_tokens": 659793867.0, + "step": 17293 + }, + { + "epoch": 2.199974557944282, + "ewc_loss": 0.008342393673956394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.342393266502768e-05, + "grad_norm": 4.059106349945068, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8840795159339905, + "num_tokens": 659838724.0, + "step": 17294 + }, + { + "epoch": 2.2001017682228725, + "ewc_loss": 0.008326170966029167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326170791406184e-05, + "grad_norm": 4.146429538726807, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8809289336204529, + "num_tokens": 659876589.0, + "step": 17295 + }, + { + "epoch": 2.200228978501463, + "ewc_loss": 0.008378027938306332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378027996513993e-05, + "grad_norm": 4.1875152587890625, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8749880790710449, + "num_tokens": 659914864.0, + "step": 17296 + }, + { + "epoch": 2.2003561887800536, + "ewc_loss": 0.008351434953510761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.351435099029914e-05, + "grad_norm": 4.113891124725342, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8892383575439453, + "num_tokens": 659952482.0, + "step": 17297 + }, + { + "epoch": 2.200483399058644, + "ewc_loss": 0.008288328535854816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.288328535854816e-05, + "grad_norm": 4.197973251342773, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8825777173042297, + "num_tokens": 659985192.0, + "step": 17298 + }, + { + "epoch": 2.2006106093372346, + "ewc_loss": 0.008365529589354992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.365529356524348e-05, + "grad_norm": 4.1507720947265625, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8897614479064941, + "num_tokens": 660019505.0, + "step": 17299 + }, + { + "epoch": 2.200737819615825, + "ewc_loss": 0.008312657475471497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312657882925123e-05, + "grad_norm": 4.124188423156738, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8776481747627258, + "num_tokens": 660054942.0, + "step": 17300 + }, + { + "epoch": 2.2008650298944152, + "ewc_loss": 0.008326264098286629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326263923663646e-05, + "grad_norm": 4.119088172912598, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8817095756530762, + "num_tokens": 660092836.0, + "step": 17301 + }, + { + "epoch": 2.200992240173006, + "ewc_loss": 0.008331380784511566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331381104653701e-05, + "grad_norm": 4.129604816436768, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8766058087348938, + "num_tokens": 660129419.0, + "step": 17302 + }, + { + "epoch": 2.2011194504515963, + "ewc_loss": 0.00833782646805048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.337826875504106e-05, + "grad_norm": 4.065798282623291, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.884762704372406, + "num_tokens": 660168116.0, + "step": 17303 + }, + { + "epoch": 2.201246660730187, + "ewc_loss": 0.008300868794322014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.300868648802862e-05, + "grad_norm": 4.139322757720947, + "learning_rate": 1e-06, + "loss": 0.3531, + 
"mean_token_accuracy": 0.8776991367340088, + "num_tokens": 660208006.0, + "step": 17304 + }, + { + "epoch": 2.2013738710087773, + "ewc_loss": 0.008354314602911472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354314195457846e-05, + "grad_norm": 4.07838773727417, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8819609880447388, + "num_tokens": 660254046.0, + "step": 17305 + }, + { + "epoch": 2.201501081287368, + "ewc_loss": 0.008307092823088169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.307093230541795e-05, + "grad_norm": 4.158614158630371, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8813718557357788, + "num_tokens": 660289321.0, + "step": 17306 + }, + { + "epoch": 2.2016282915659584, + "ewc_loss": 0.008366793394088745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.366793190361932e-05, + "grad_norm": 4.156451225280762, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.881040096282959, + "num_tokens": 660325052.0, + "step": 17307 + }, + { + "epoch": 2.201755501844549, + "ewc_loss": 0.00834495760500431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344958041561767e-05, + "grad_norm": 4.194110870361328, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8758086562156677, + "num_tokens": 660357942.0, + "step": 17308 + }, + { + "epoch": 2.2018827121231395, + "ewc_loss": 0.00837723445147276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377234189538285e-05, + "grad_norm": 4.211253643035889, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8584792613983154, + "num_tokens": 660390654.0, + "step": 17309 + }, + { + "epoch": 2.20200992240173, + "ewc_loss": 0.008386150002479553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.386150147998706e-05, + "grad_norm": 4.104414463043213, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8836255073547363, + "num_tokens": 660432020.0, + "step": 17310 + }, + { + "epoch": 
2.2021371326803205, + "ewc_loss": 0.008321378380060196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321378118125722e-05, + "grad_norm": 4.125251293182373, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8836177587509155, + "num_tokens": 660465051.0, + "step": 17311 + }, + { + "epoch": 2.202264342958911, + "ewc_loss": 0.00838275533169508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382755186175928e-05, + "grad_norm": 4.137396812438965, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8796483278274536, + "num_tokens": 660504967.0, + "step": 17312 + }, + { + "epoch": 2.2023915532375016, + "ewc_loss": 0.00835230853408575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352308213943616e-05, + "grad_norm": 4.139078617095947, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8738694190979004, + "num_tokens": 660543826.0, + "step": 17313 + }, + { + "epoch": 2.202518763516092, + "ewc_loss": 0.008372829295694828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.372829324798658e-05, + "grad_norm": 4.118195533752441, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.880131721496582, + "num_tokens": 660583074.0, + "step": 17314 + }, + { + "epoch": 2.2026459737946826, + "ewc_loss": 0.008372106589376926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37210682220757e-05, + "grad_norm": 4.116952896118164, + "learning_rate": 1e-06, + "loss": 0.2909, + "mean_token_accuracy": 0.8972119092941284, + "num_tokens": 660619974.0, + "step": 17315 + }, + { + "epoch": 2.202773184073273, + "ewc_loss": 0.008380645886063576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.380645886063576e-05, + "grad_norm": 4.10393762588501, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8875781893730164, + "num_tokens": 660654005.0, + "step": 17316 + }, + { + "epoch": 2.2029003943518637, + "ewc_loss": 0.008354060351848602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.354060264537111e-05, + "grad_norm": 4.1538310050964355, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8725227117538452, + "num_tokens": 660689179.0, + "step": 17317 + }, + { + "epoch": 2.203027604630454, + "ewc_loss": 0.008399919606745243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399919897783548e-05, + "grad_norm": 4.152159690856934, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8715552091598511, + "num_tokens": 660724923.0, + "step": 17318 + }, + { + "epoch": 2.2031548149090447, + "ewc_loss": 0.008384077809751034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.384077955270186e-05, + "grad_norm": 4.138691425323486, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8868767619132996, + "num_tokens": 660760763.0, + "step": 17319 + }, + { + "epoch": 2.2032820251876353, + "ewc_loss": 0.008376186713576317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.376186451641843e-05, + "grad_norm": 4.134852409362793, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8827359676361084, + "num_tokens": 660795520.0, + "step": 17320 + }, + { + "epoch": 2.203409235466226, + "ewc_loss": 0.00839963462203741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399634680245072e-05, + "grad_norm": 4.080246925354004, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8928220868110657, + "num_tokens": 660834858.0, + "step": 17321 + }, + { + "epoch": 2.2035364457448163, + "ewc_loss": 0.00834763515740633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.347634866368026e-05, + "grad_norm": 4.130886554718018, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8923941850662231, + "num_tokens": 660870319.0, + "step": 17322 + }, + { + "epoch": 2.203663656023407, + "ewc_loss": 0.008400154300034046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400154183618724e-05, + "grad_norm": 4.1306071281433105, + "learning_rate": 1e-06, + "loss": 0.3751, + 
"mean_token_accuracy": 0.870489776134491, + "num_tokens": 660907313.0, + "step": 17323 + }, + { + "epoch": 2.2037908663019974, + "ewc_loss": 0.008380144834518433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38014530017972e-05, + "grad_norm": 4.073849201202393, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8978873491287231, + "num_tokens": 660948087.0, + "step": 17324 + }, + { + "epoch": 2.203918076580588, + "ewc_loss": 0.008356114849448204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356114994967356e-05, + "grad_norm": 4.140072822570801, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.888646125793457, + "num_tokens": 660984984.0, + "step": 17325 + }, + { + "epoch": 2.204045286859178, + "ewc_loss": 0.008426195941865444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.426195563515648e-05, + "grad_norm": 4.132367134094238, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8807032108306885, + "num_tokens": 661022898.0, + "step": 17326 + }, + { + "epoch": 2.2041724971377685, + "ewc_loss": 0.0083764074370265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.376407640753314e-05, + "grad_norm": 4.110993385314941, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8916045427322388, + "num_tokens": 661058645.0, + "step": 17327 + }, + { + "epoch": 2.204299707416359, + "ewc_loss": 0.008375865407288074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375865581911057e-05, + "grad_norm": 4.088422775268555, + "learning_rate": 1e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.9026811718940735, + "num_tokens": 661096954.0, + "step": 17328 + }, + { + "epoch": 2.2044269176949496, + "ewc_loss": 0.008382044732570648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382044325117022e-05, + "grad_norm": 4.137482643127441, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8720578551292419, + "num_tokens": 661136947.0, + "step": 17329 + }, + { + "epoch": 2.20455412797354, 
+ "ewc_loss": 0.008398751728236675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39875137899071e-05, + "grad_norm": 4.087808609008789, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8847101926803589, + "num_tokens": 661175376.0, + "step": 17330 + }, + { + "epoch": 2.2046813382521306, + "ewc_loss": 0.008358472026884556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35847167763859e-05, + "grad_norm": 4.1250715255737305, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8756752014160156, + "num_tokens": 661213238.0, + "step": 17331 + }, + { + "epoch": 2.204808548530721, + "ewc_loss": 0.008401013910770416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401014201808721e-05, + "grad_norm": 4.092345714569092, + "learning_rate": 1e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.8970323801040649, + "num_tokens": 661246512.0, + "step": 17332 + }, + { + "epoch": 2.2049357588093117, + "ewc_loss": 0.008376263082027435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.376262849196792e-05, + "grad_norm": 4.084589004516602, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8926127552986145, + "num_tokens": 661288272.0, + "step": 17333 + }, + { + "epoch": 2.2050629690879022, + "ewc_loss": 0.008359256200492382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.359256025869399e-05, + "grad_norm": 4.081551551818848, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8882275223731995, + "num_tokens": 661328705.0, + "step": 17334 + }, + { + "epoch": 2.2051901793664928, + "ewc_loss": 0.008366013877093792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.366013935301453e-05, + "grad_norm": 4.106616973876953, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8868363499641418, + "num_tokens": 661369877.0, + "step": 17335 + }, + { + "epoch": 2.2053173896450833, + "ewc_loss": 0.00835525244474411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355252066394314e-05, + 
"grad_norm": 4.110174179077148, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8864455819129944, + "num_tokens": 661405209.0, + "step": 17336 + }, + { + "epoch": 2.205444599923674, + "ewc_loss": 0.008341139182448387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.341138891410083e-05, + "grad_norm": 4.116909027099609, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8881716728210449, + "num_tokens": 661441083.0, + "step": 17337 + }, + { + "epoch": 2.2055718102022643, + "ewc_loss": 0.008352963253855705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352963050128892e-05, + "grad_norm": 4.137645244598389, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.886867344379425, + "num_tokens": 661476938.0, + "step": 17338 + }, + { + "epoch": 2.205699020480855, + "ewc_loss": 0.00834747590124607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.347476250492036e-05, + "grad_norm": 4.082818508148193, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8757280111312866, + "num_tokens": 661519281.0, + "step": 17339 + }, + { + "epoch": 2.2058262307594454, + "ewc_loss": 0.00832472462207079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324724331032485e-05, + "grad_norm": 4.0932936668396, + "learning_rate": 1e-06, + "loss": 0.292, + "mean_token_accuracy": 0.8968942761421204, + "num_tokens": 661555880.0, + "step": 17340 + }, + { + "epoch": 2.205953441038036, + "ewc_loss": 0.008334096521139145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.334096492035314e-05, + "grad_norm": 4.14316987991333, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8758548498153687, + "num_tokens": 661595937.0, + "step": 17341 + }, + { + "epoch": 2.2060806513166265, + "ewc_loss": 0.008345725014805794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.345724927494302e-05, + "grad_norm": 4.122854232788086, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8701326847076416, + 
"num_tokens": 661636868.0, + "step": 17342 + }, + { + "epoch": 2.206207861595217, + "ewc_loss": 0.008322346955537796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32234654808417e-05, + "grad_norm": 4.0739030838012695, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8760851621627808, + "num_tokens": 661681503.0, + "step": 17343 + }, + { + "epoch": 2.2063350718738075, + "ewc_loss": 0.008290369063615799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.290369441965595e-05, + "grad_norm": 4.12508487701416, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8855500221252441, + "num_tokens": 661718438.0, + "step": 17344 + }, + { + "epoch": 2.206462282152398, + "ewc_loss": 0.008350718766450882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350719144800678e-05, + "grad_norm": 4.146524429321289, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8639663457870483, + "num_tokens": 661759048.0, + "step": 17345 + }, + { + "epoch": 2.2065894924309886, + "ewc_loss": 0.008324571885168552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324572263518348e-05, + "grad_norm": 4.076822280883789, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8921922445297241, + "num_tokens": 661803131.0, + "step": 17346 + }, + { + "epoch": 2.206716702709579, + "ewc_loss": 0.008278310298919678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.278310269815847e-05, + "grad_norm": 4.166574001312256, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8820381164550781, + "num_tokens": 661838139.0, + "step": 17347 + }, + { + "epoch": 2.2068439129881696, + "ewc_loss": 0.008360903710126877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36090330267325e-05, + "grad_norm": 4.127034664154053, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8764668703079224, + "num_tokens": 661878568.0, + "step": 17348 + }, + { + "epoch": 2.20697112326676, + "ewc_loss": 0.008290576748549938, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.290576806757599e-05, + "grad_norm": 4.206869125366211, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8752225637435913, + "num_tokens": 661909110.0, + "step": 17349 + }, + { + "epoch": 2.2070983335453507, + "ewc_loss": 0.008346646092832088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346646063728258e-05, + "grad_norm": 4.1098103523254395, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8946851491928101, + "num_tokens": 661944472.0, + "step": 17350 + }, + { + "epoch": 2.2072255438239408, + "ewc_loss": 0.008282668888568878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.282669296022505e-05, + "grad_norm": 4.0538177490234375, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8942102789878845, + "num_tokens": 661985865.0, + "step": 17351 + }, + { + "epoch": 2.2073527541025313, + "ewc_loss": 0.008289910852909088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.289911056635901e-05, + "grad_norm": 4.2207207679748535, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8777012825012207, + "num_tokens": 662019460.0, + "step": 17352 + }, + { + "epoch": 2.207479964381122, + "ewc_loss": 0.008402097970247269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.402097591897473e-05, + "grad_norm": 4.107128143310547, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8904687762260437, + "num_tokens": 662059109.0, + "step": 17353 + }, + { + "epoch": 2.2076071746597123, + "ewc_loss": 0.008288837037980556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.288837125292048e-05, + "grad_norm": 4.141180992126465, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8773660659790039, + "num_tokens": 662096040.0, + "step": 17354 + }, + { + "epoch": 2.207734384938303, + "ewc_loss": 0.00835796445608139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357964543392882e-05, + "grad_norm": 4.074803829193115, + 
"learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8863999247550964, + "num_tokens": 662139351.0, + "step": 17355 + }, + { + "epoch": 2.2078615952168934, + "ewc_loss": 0.008300156332552433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.300156332552433e-05, + "grad_norm": 4.130237102508545, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8853470087051392, + "num_tokens": 662177353.0, + "step": 17356 + }, + { + "epoch": 2.207988805495484, + "ewc_loss": 0.008373006246984005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373006130568683e-05, + "grad_norm": 4.1609907150268555, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8724897503852844, + "num_tokens": 662209519.0, + "step": 17357 + }, + { + "epoch": 2.2081160157740745, + "ewc_loss": 0.008367172442376614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.367172267753631e-05, + "grad_norm": 4.160293102264404, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8648912906646729, + "num_tokens": 662251311.0, + "step": 17358 + }, + { + "epoch": 2.208243226052665, + "ewc_loss": 0.008345951326191425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.345951209776103e-05, + "grad_norm": 4.183697700500488, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8890665173530579, + "num_tokens": 662282770.0, + "step": 17359 + }, + { + "epoch": 2.2083704363312555, + "ewc_loss": 0.00836899597197771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.368995622731745e-05, + "grad_norm": 4.05399227142334, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8792495727539062, + "num_tokens": 662326770.0, + "step": 17360 + }, + { + "epoch": 2.208497646609846, + "ewc_loss": 0.008279395289719105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.279395115096122e-05, + "grad_norm": 4.104433059692383, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8828413486480713, + "num_tokens": 662366811.0, + 
"step": 17361 + }, + { + "epoch": 2.2086248568884366, + "ewc_loss": 0.008361976593732834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.361976506421342e-05, + "grad_norm": 4.087127685546875, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8784770965576172, + "num_tokens": 662404517.0, + "step": 17362 + }, + { + "epoch": 2.208752067167027, + "ewc_loss": 0.00835206639021635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352066652150825e-05, + "grad_norm": 4.14035701751709, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8890175223350525, + "num_tokens": 662442091.0, + "step": 17363 + }, + { + "epoch": 2.2088792774456176, + "ewc_loss": 0.008357121609151363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357121987501159e-05, + "grad_norm": 4.108770370483398, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8596445918083191, + "num_tokens": 662483343.0, + "step": 17364 + }, + { + "epoch": 2.209006487724208, + "ewc_loss": 0.00833132490515709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331325079780072e-05, + "grad_norm": 4.150907039642334, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8773140907287598, + "num_tokens": 662521994.0, + "step": 17365 + }, + { + "epoch": 2.2091336980027987, + "ewc_loss": 0.008363766595721245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363766391994432e-05, + "grad_norm": 4.065210342407227, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8784067034721375, + "num_tokens": 662562737.0, + "step": 17366 + }, + { + "epoch": 2.209260908281389, + "ewc_loss": 0.008318043313920498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318043546751142e-05, + "grad_norm": 4.190421104431152, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8809163570404053, + "num_tokens": 662596927.0, + "step": 17367 + }, + { + "epoch": 2.2093881185599797, + "ewc_loss": 0.008397640660405159, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.397640340263024e-05, + "grad_norm": 4.112540245056152, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8906953930854797, + "num_tokens": 662632831.0, + "step": 17368 + }, + { + "epoch": 2.2095153288385703, + "ewc_loss": 0.008326425217092037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326425449922681e-05, + "grad_norm": 4.087528705596924, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.881203293800354, + "num_tokens": 662675554.0, + "step": 17369 + }, + { + "epoch": 2.209642539117161, + "ewc_loss": 0.008326847106218338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326847455464303e-05, + "grad_norm": 4.046947956085205, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.893204391002655, + "num_tokens": 662720248.0, + "step": 17370 + }, + { + "epoch": 2.2097697493957513, + "ewc_loss": 0.008320317603647709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.320317283505574e-05, + "grad_norm": 4.120415210723877, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8840367794036865, + "num_tokens": 662761388.0, + "step": 17371 + }, + { + "epoch": 2.209896959674342, + "ewc_loss": 0.008351577445864677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35157698020339e-05, + "grad_norm": 4.132652759552002, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8824403882026672, + "num_tokens": 662798007.0, + "step": 17372 + }, + { + "epoch": 2.2100241699529324, + "ewc_loss": 0.008321324363350868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321324276039377e-05, + "grad_norm": 4.122021198272705, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8806759715080261, + "num_tokens": 662836042.0, + "step": 17373 + }, + { + "epoch": 2.210151380231523, + "ewc_loss": 0.008314034901559353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314034494105726e-05, + "grad_norm": 4.193950176239014, + "learning_rate": 1e-06, + "loss": 0.3643, 
+ "mean_token_accuracy": 0.8723608255386353, + "num_tokens": 662869465.0, + "step": 17374 + }, + { + "epoch": 2.2102785905101134, + "ewc_loss": 0.008353821001946926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.353820885531604e-05, + "grad_norm": 4.087367534637451, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8753273487091064, + "num_tokens": 662909440.0, + "step": 17375 + }, + { + "epoch": 2.2104058007887035, + "ewc_loss": 0.008272441104054451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.272441482404247e-05, + "grad_norm": 4.12105655670166, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8787854909896851, + "num_tokens": 662949637.0, + "step": 17376 + }, + { + "epoch": 2.210533011067294, + "ewc_loss": 0.008346357382833958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346357208210975e-05, + "grad_norm": 4.128911972045898, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8819137811660767, + "num_tokens": 662987679.0, + "step": 17377 + }, + { + "epoch": 2.2106602213458846, + "ewc_loss": 0.008326028473675251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326028910232708e-05, + "grad_norm": 4.162198066711426, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8634635210037231, + "num_tokens": 663027651.0, + "step": 17378 + }, + { + "epoch": 2.210787431624475, + "ewc_loss": 0.008350237272679806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350237476406619e-05, + "grad_norm": 4.12076473236084, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8952541947364807, + "num_tokens": 663063708.0, + "step": 17379 + }, + { + "epoch": 2.2109146419030656, + "ewc_loss": 0.008313389495015144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31338984426111e-05, + "grad_norm": 4.190181255340576, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.874250054359436, + "num_tokens": 663097255.0, + "step": 17380 + }, + { + "epoch": 
2.211041852181656, + "ewc_loss": 0.008352619595825672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352619624929503e-05, + "grad_norm": 4.101571083068848, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8791149854660034, + "num_tokens": 663134496.0, + "step": 17381 + }, + { + "epoch": 2.2111690624602467, + "ewc_loss": 0.008299210108816624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299209730466828e-05, + "grad_norm": 4.150148868560791, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.889212429523468, + "num_tokens": 663170278.0, + "step": 17382 + }, + { + "epoch": 2.211296272738837, + "ewc_loss": 0.008350910618901253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350910502485931e-05, + "grad_norm": 4.180766582489014, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8803548812866211, + "num_tokens": 663205407.0, + "step": 17383 + }, + { + "epoch": 2.2114234830174277, + "ewc_loss": 0.00835622288286686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356222679140046e-05, + "grad_norm": 4.099906921386719, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8860530257225037, + "num_tokens": 663243166.0, + "step": 17384 + }, + { + "epoch": 2.2115506932960183, + "ewc_loss": 0.008308902382850647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.308902033604681e-05, + "grad_norm": 4.096140384674072, + "learning_rate": 1e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.901939868927002, + "num_tokens": 663282267.0, + "step": 17385 + }, + { + "epoch": 2.211677903574609, + "ewc_loss": 0.008336147293448448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.336147584486753e-05, + "grad_norm": 4.158740520477295, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8754752278327942, + "num_tokens": 663318620.0, + "step": 17386 + }, + { + "epoch": 2.2118051138531993, + "ewc_loss": 0.008370669558644295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.370669092983007e-05, + "grad_norm": 4.1420488357543945, + "learning_rate": 1e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.8988120555877686, + "num_tokens": 663353256.0, + "step": 17387 + }, + { + "epoch": 2.21193232413179, + "ewc_loss": 0.008312961086630821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312961290357634e-05, + "grad_norm": 4.14699125289917, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8852534890174866, + "num_tokens": 663386691.0, + "step": 17388 + }, + { + "epoch": 2.2120595344103804, + "ewc_loss": 0.008335769176483154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.335769234690815e-05, + "grad_norm": 4.184908390045166, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8792957067489624, + "num_tokens": 663416550.0, + "step": 17389 + }, + { + "epoch": 2.212186744688971, + "ewc_loss": 0.008371972478926182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371972216991708e-05, + "grad_norm": 4.048142433166504, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8831143379211426, + "num_tokens": 663464912.0, + "step": 17390 + }, + { + "epoch": 2.2123139549675614, + "ewc_loss": 0.008276754058897495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.276753942482173e-05, + "grad_norm": 4.124967098236084, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8742895126342773, + "num_tokens": 663503786.0, + "step": 17391 + }, + { + "epoch": 2.212441165246152, + "ewc_loss": 0.00839006807655096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390068251173943e-05, + "grad_norm": 4.1439127922058105, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8889488577842712, + "num_tokens": 663540579.0, + "step": 17392 + }, + { + "epoch": 2.2125683755247425, + "ewc_loss": 0.008354061283171177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354061719728634e-05, + "grad_norm": 4.109660625457764, + "learning_rate": 1e-06, + "loss": 0.3239, + 
"mean_token_accuracy": 0.8851586580276489, + "num_tokens": 663577284.0, + "step": 17393 + }, + { + "epoch": 2.212695585803333, + "ewc_loss": 0.008344817906618118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344818343175575e-05, + "grad_norm": 4.1505351066589355, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8782811164855957, + "num_tokens": 663613118.0, + "step": 17394 + }, + { + "epoch": 2.2128227960819236, + "ewc_loss": 0.008378060534596443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378060738323256e-05, + "grad_norm": 4.121620178222656, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8864244818687439, + "num_tokens": 663651548.0, + "step": 17395 + }, + { + "epoch": 2.212950006360514, + "ewc_loss": 0.008333299309015274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3332990470808e-05, + "grad_norm": 4.139211654663086, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8668340444564819, + "num_tokens": 663694381.0, + "step": 17396 + }, + { + "epoch": 2.2130772166391046, + "ewc_loss": 0.00834870059043169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348700794158503e-05, + "grad_norm": 4.086093902587891, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.882748007774353, + "num_tokens": 663734686.0, + "step": 17397 + }, + { + "epoch": 2.213204426917695, + "ewc_loss": 0.00833179708570242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331797289429232e-05, + "grad_norm": 4.145513534545898, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8747928142547607, + "num_tokens": 663768764.0, + "step": 17398 + }, + { + "epoch": 2.2133316371962852, + "ewc_loss": 0.008379796519875526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.379796054214239e-05, + "grad_norm": 4.068244457244873, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.900098979473114, + "num_tokens": 663807650.0, + "step": 17399 + }, + { + "epoch": 
2.213458847474876, + "ewc_loss": 0.008313259109854698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.313258877024055e-05, + "grad_norm": 4.10540246963501, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8791567087173462, + "num_tokens": 663849308.0, + "step": 17400 + }, + { + "epoch": 2.2135860577534663, + "ewc_loss": 0.008371628820896149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371628791792318e-05, + "grad_norm": 4.159471035003662, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8681706190109253, + "num_tokens": 663888917.0, + "step": 17401 + }, + { + "epoch": 2.213713268032057, + "ewc_loss": 0.008371185511350632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371185685973614e-05, + "grad_norm": 4.210212707519531, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8826165199279785, + "num_tokens": 663921048.0, + "step": 17402 + }, + { + "epoch": 2.2138404783106473, + "ewc_loss": 0.008382901549339294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382901432923973e-05, + "grad_norm": 4.084560871124268, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8870487213134766, + "num_tokens": 663963433.0, + "step": 17403 + }, + { + "epoch": 2.213967688589238, + "ewc_loss": 0.008289615623652935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.289615652756765e-05, + "grad_norm": 4.129580497741699, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8665487766265869, + "num_tokens": 664003294.0, + "step": 17404 + }, + { + "epoch": 2.2140948988678284, + "ewc_loss": 0.008396918885409832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396918565267697e-05, + "grad_norm": 4.140429496765137, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8711392879486084, + "num_tokens": 664043161.0, + "step": 17405 + }, + { + "epoch": 2.214222109146419, + "ewc_loss": 0.008350202813744545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.350203279405832e-05, + "grad_norm": 4.084990501403809, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8916009664535522, + "num_tokens": 664082457.0, + "step": 17406 + }, + { + "epoch": 2.2143493194250095, + "ewc_loss": 0.008309910073876381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.309909753734246e-05, + "grad_norm": 4.0690436363220215, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.8918150663375854, + "num_tokens": 664125999.0, + "step": 17407 + }, + { + "epoch": 2.2144765297036, + "ewc_loss": 0.008351468481123447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.351468568434939e-05, + "grad_norm": 4.125904560089111, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8877254128456116, + "num_tokens": 664167256.0, + "step": 17408 + }, + { + "epoch": 2.2146037399821905, + "ewc_loss": 0.00834088958799839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.340889326063916e-05, + "grad_norm": 4.133434295654297, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8785229325294495, + "num_tokens": 664201887.0, + "step": 17409 + }, + { + "epoch": 2.214730950260781, + "ewc_loss": 0.008330042473971844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330042328452691e-05, + "grad_norm": 4.1146321296691895, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8878244161605835, + "num_tokens": 664238986.0, + "step": 17410 + }, + { + "epoch": 2.2148581605393716, + "ewc_loss": 0.008318129926919937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31813013064675e-05, + "grad_norm": 4.112537860870361, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8800790309906006, + "num_tokens": 664283436.0, + "step": 17411 + }, + { + "epoch": 2.214985370817962, + "ewc_loss": 0.008326366543769836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326366514666006e-05, + "grad_norm": 4.1000823974609375, + "learning_rate": 1e-06, + "loss": 0.3663, + 
"mean_token_accuracy": 0.8776248097419739, + "num_tokens": 664325196.0, + "step": 17412 + }, + { + "epoch": 2.2151125810965526, + "ewc_loss": 0.008318884298205376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318884647451341e-05, + "grad_norm": 4.122893333435059, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.873531699180603, + "num_tokens": 664365722.0, + "step": 17413 + }, + { + "epoch": 2.215239791375143, + "ewc_loss": 0.008324844762682915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324844384333119e-05, + "grad_norm": 4.064154624938965, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8740330934524536, + "num_tokens": 664411570.0, + "step": 17414 + }, + { + "epoch": 2.2153670016537337, + "ewc_loss": 0.008279499597847462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.279499888885766e-05, + "grad_norm": 4.132351398468018, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8855351805686951, + "num_tokens": 664453545.0, + "step": 17415 + }, + { + "epoch": 2.215494211932324, + "ewc_loss": 0.008324168622493744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324168447870761e-05, + "grad_norm": 4.118940353393555, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8839883208274841, + "num_tokens": 664489882.0, + "step": 17416 + }, + { + "epoch": 2.2156214222109147, + "ewc_loss": 0.008310572244226933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.310572593472898e-05, + "grad_norm": 4.149984836578369, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8794686198234558, + "num_tokens": 664528720.0, + "step": 17417 + }, + { + "epoch": 2.2157486324895053, + "ewc_loss": 0.008314980193972588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31498036859557e-05, + "grad_norm": 4.1405792236328125, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8739606142044067, + "num_tokens": 664569645.0, + "step": 17418 + }, + { + "epoch": 
2.215875842768096, + "ewc_loss": 0.00828508473932743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.285084913950413e-05, + "grad_norm": 4.145752429962158, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8762431144714355, + "num_tokens": 664612033.0, + "step": 17419 + }, + { + "epoch": 2.2160030530466863, + "ewc_loss": 0.00831072498112917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.310724660987034e-05, + "grad_norm": 4.132392883300781, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8902608156204224, + "num_tokens": 664651408.0, + "step": 17420 + }, + { + "epoch": 2.216130263325277, + "ewc_loss": 0.008286058902740479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.286058437079191e-05, + "grad_norm": 4.099851131439209, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8679172992706299, + "num_tokens": 664695764.0, + "step": 17421 + }, + { + "epoch": 2.2162574736038674, + "ewc_loss": 0.008271955884993076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.271956176031381e-05, + "grad_norm": 4.123885154724121, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8848568201065063, + "num_tokens": 664735528.0, + "step": 17422 + }, + { + "epoch": 2.216384683882458, + "ewc_loss": 0.008277914486825466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.277914457721636e-05, + "grad_norm": 4.108835220336914, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8748422265052795, + "num_tokens": 664775612.0, + "step": 17423 + }, + { + "epoch": 2.216511894161048, + "ewc_loss": 0.008273965679109097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.273965795524418e-05, + "grad_norm": 4.142817974090576, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8780096769332886, + "num_tokens": 664812485.0, + "step": 17424 + }, + { + "epoch": 2.2166391044396385, + "ewc_loss": 0.008307913318276405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.307913230964914e-05, + "grad_norm": 4.117574214935303, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8690677285194397, + "num_tokens": 664853166.0, + "step": 17425 + }, + { + "epoch": 2.216766314718229, + "ewc_loss": 0.008270288817584515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.270289254141971e-05, + "grad_norm": 4.145462989807129, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8727808594703674, + "num_tokens": 664888778.0, + "step": 17426 + }, + { + "epoch": 2.2168935249968196, + "ewc_loss": 0.008311450481414795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.311450073961169e-05, + "grad_norm": 4.063583850860596, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8824887275695801, + "num_tokens": 664933783.0, + "step": 17427 + }, + { + "epoch": 2.21702073527541, + "ewc_loss": 0.008245284669101238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.245284698205069e-05, + "grad_norm": 4.189066410064697, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8658832907676697, + "num_tokens": 664965228.0, + "step": 17428 + }, + { + "epoch": 2.2171479455540006, + "ewc_loss": 0.008357061073184013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3570608694572e-05, + "grad_norm": 4.188971042633057, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8798328042030334, + "num_tokens": 664996691.0, + "step": 17429 + }, + { + "epoch": 2.217275155832591, + "ewc_loss": 0.008334344252943993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.334344602189958e-05, + "grad_norm": 4.099209785461426, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8912241458892822, + "num_tokens": 665032006.0, + "step": 17430 + }, + { + "epoch": 2.2174023661111817, + "ewc_loss": 0.008284222334623337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.284221985377371e-05, + "grad_norm": 4.1181488037109375, + "learning_rate": 1e-06, + "loss": 0.364, + 
"mean_token_accuracy": 0.8718768358230591, + "num_tokens": 665071819.0, + "step": 17431 + }, + { + "epoch": 2.217529576389772, + "ewc_loss": 0.008335655555129051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.335655729752034e-05, + "grad_norm": 4.080810546875, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8714087605476379, + "num_tokens": 665118883.0, + "step": 17432 + }, + { + "epoch": 2.2176567866683627, + "ewc_loss": 0.008291480131447315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.291480480693281e-05, + "grad_norm": 4.186980247497559, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8627155423164368, + "num_tokens": 665154926.0, + "step": 17433 + }, + { + "epoch": 2.2177839969469533, + "ewc_loss": 0.008390061557292938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39006170281209e-05, + "grad_norm": 4.113880157470703, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.889885663986206, + "num_tokens": 665189910.0, + "step": 17434 + }, + { + "epoch": 2.217911207225544, + "ewc_loss": 0.008307087235152721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.307087409775704e-05, + "grad_norm": 4.106015205383301, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8869534730911255, + "num_tokens": 665235849.0, + "step": 17435 + }, + { + "epoch": 2.2180384175041343, + "ewc_loss": 0.008336487226188183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.336487371707335e-05, + "grad_norm": 4.137825012207031, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8713839650154114, + "num_tokens": 665274231.0, + "step": 17436 + }, + { + "epoch": 2.218165627782725, + "ewc_loss": 0.008343086577951908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3430866652634e-05, + "grad_norm": 4.099335193634033, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8994266986846924, + "num_tokens": 665309057.0, + "step": 17437 + }, + { + "epoch": 
2.2182928380613154, + "ewc_loss": 0.008326124399900436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326124225277454e-05, + "grad_norm": 4.258337497711182, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8808740377426147, + "num_tokens": 665337031.0, + "step": 17438 + }, + { + "epoch": 2.218420048339906, + "ewc_loss": 0.008433199487626553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433199400315061e-05, + "grad_norm": 4.124816417694092, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8826368451118469, + "num_tokens": 665376557.0, + "step": 17439 + }, + { + "epoch": 2.2185472586184964, + "ewc_loss": 0.008292818441987038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.292818529298529e-05, + "grad_norm": 4.1135334968566895, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8786489963531494, + "num_tokens": 665419928.0, + "step": 17440 + }, + { + "epoch": 2.218674468897087, + "ewc_loss": 0.008373945951461792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373946184292436e-05, + "grad_norm": 4.102185249328613, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8872454166412354, + "num_tokens": 665458827.0, + "step": 17441 + }, + { + "epoch": 2.2188016791756775, + "ewc_loss": 0.008343420922756195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34342063171789e-05, + "grad_norm": 4.145569324493408, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8842709064483643, + "num_tokens": 665498623.0, + "step": 17442 + }, + { + "epoch": 2.218928889454268, + "ewc_loss": 0.008374068886041641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.374068420380354e-05, + "grad_norm": 4.131511211395264, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8776222467422485, + "num_tokens": 665536103.0, + "step": 17443 + }, + { + "epoch": 2.2190560997328586, + "ewc_loss": 0.008365957997739315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.365957910427824e-05, + "grad_norm": 4.171743392944336, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8797297477722168, + "num_tokens": 665572089.0, + "step": 17444 + }, + { + "epoch": 2.219183310011449, + "ewc_loss": 0.008378294296562672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378294296562672e-05, + "grad_norm": 4.1705522537231445, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8742854595184326, + "num_tokens": 665608088.0, + "step": 17445 + }, + { + "epoch": 2.2193105202900396, + "ewc_loss": 0.008392786607146263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.392786548938602e-05, + "grad_norm": 4.14459228515625, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8906803131103516, + "num_tokens": 665641883.0, + "step": 17446 + }, + { + "epoch": 2.21943773056863, + "ewc_loss": 0.008363456465303898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363456436200067e-05, + "grad_norm": 4.087759494781494, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8864461779594421, + "num_tokens": 665682211.0, + "step": 17447 + }, + { + "epoch": 2.2195649408472207, + "ewc_loss": 0.008352652192115784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352652366738766e-05, + "grad_norm": 4.117074012756348, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8865352869033813, + "num_tokens": 665726182.0, + "step": 17448 + }, + { + "epoch": 2.2196921511258108, + "ewc_loss": 0.00838480331003666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38480336824432e-05, + "grad_norm": 4.190371036529541, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8844537734985352, + "num_tokens": 665760404.0, + "step": 17449 + }, + { + "epoch": 2.2198193614044013, + "ewc_loss": 0.00838885735720396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388857531826943e-05, + "grad_norm": 4.1149115562438965, + "learning_rate": 1e-06, + "loss": 0.3412, + 
"mean_token_accuracy": 0.8815897703170776, + "num_tokens": 665798235.0, + "step": 17450 + }, + { + "epoch": 2.219946571682992, + "ewc_loss": 0.008332072757184505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.332072320627049e-05, + "grad_norm": 4.099762439727783, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8813842535018921, + "num_tokens": 665840118.0, + "step": 17451 + }, + { + "epoch": 2.2200737819615823, + "ewc_loss": 0.008353828452527523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.353828161489218e-05, + "grad_norm": 4.162463665008545, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8816118836402893, + "num_tokens": 665878498.0, + "step": 17452 + }, + { + "epoch": 2.220200992240173, + "ewc_loss": 0.008375526405870914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375526522286236e-05, + "grad_norm": 4.03834867477417, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8777374029159546, + "num_tokens": 665924179.0, + "step": 17453 + }, + { + "epoch": 2.2203282025187634, + "ewc_loss": 0.008312859572470188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312859426951036e-05, + "grad_norm": 4.178467273712158, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8947567343711853, + "num_tokens": 665963160.0, + "step": 17454 + }, + { + "epoch": 2.220455412797354, + "ewc_loss": 0.008436531759798527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.436531788902357e-05, + "grad_norm": 4.1310505867004395, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8907301425933838, + "num_tokens": 665998854.0, + "step": 17455 + }, + { + "epoch": 2.2205826230759445, + "ewc_loss": 0.008345882408320904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34588281577453e-05, + "grad_norm": 4.110799312591553, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8893859386444092, + "num_tokens": 666036420.0, + "step": 17456 + }, + { + "epoch": 
2.220709833354535, + "ewc_loss": 0.008345683105289936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.345683454535902e-05, + "grad_norm": 4.143555641174316, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.863609790802002, + "num_tokens": 666076692.0, + "step": 17457 + }, + { + "epoch": 2.2208370436331255, + "ewc_loss": 0.008373735472559929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373735909117386e-05, + "grad_norm": 4.1590070724487305, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8689367771148682, + "num_tokens": 666118009.0, + "step": 17458 + }, + { + "epoch": 2.220964253911716, + "ewc_loss": 0.008376809768378735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.376810001209378e-05, + "grad_norm": 4.135478973388672, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8716484308242798, + "num_tokens": 666158132.0, + "step": 17459 + }, + { + "epoch": 2.2210914641903066, + "ewc_loss": 0.008343853987753391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343854278791696e-05, + "grad_norm": 4.329530715942383, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8763777017593384, + "num_tokens": 666187195.0, + "step": 17460 + }, + { + "epoch": 2.221218674468897, + "ewc_loss": 0.008466837927699089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.466838335152715e-05, + "grad_norm": 4.078061580657959, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8864637017250061, + "num_tokens": 666227790.0, + "step": 17461 + }, + { + "epoch": 2.2213458847474876, + "ewc_loss": 0.00827769935131073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.277699089376256e-05, + "grad_norm": 4.098679542541504, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8742014169692993, + "num_tokens": 666268517.0, + "step": 17462 + }, + { + "epoch": 2.221473095026078, + "ewc_loss": 0.008379391394555569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.37939151097089e-05, + "grad_norm": 4.055412769317627, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8962874412536621, + "num_tokens": 666313727.0, + "step": 17463 + }, + { + "epoch": 2.2216003053046687, + "ewc_loss": 0.008365139365196228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.365139365196228e-05, + "grad_norm": 4.132364273071289, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8844757080078125, + "num_tokens": 666352713.0, + "step": 17464 + }, + { + "epoch": 2.221727515583259, + "ewc_loss": 0.008377652615308762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377652557101101e-05, + "grad_norm": 4.0806779861450195, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8886016607284546, + "num_tokens": 666395065.0, + "step": 17465 + }, + { + "epoch": 2.2218547258618497, + "ewc_loss": 0.008336578495800495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.336578321177512e-05, + "grad_norm": 4.162587642669678, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8818981051445007, + "num_tokens": 666428019.0, + "step": 17466 + }, + { + "epoch": 2.2219819361404403, + "ewc_loss": 0.008398735895752907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39873609947972e-05, + "grad_norm": 4.135195732116699, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8801705241203308, + "num_tokens": 666466830.0, + "step": 17467 + }, + { + "epoch": 2.222109146419031, + "ewc_loss": 0.008368446491658688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.368446287931874e-05, + "grad_norm": 4.124266147613525, + "learning_rate": 1e-06, + "loss": 0.2885, + "mean_token_accuracy": 0.8992696404457092, + "num_tokens": 666501028.0, + "step": 17468 + }, + { + "epoch": 2.2222363566976213, + "ewc_loss": 0.008379585109651089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.379585051443428e-05, + "grad_norm": 4.119939804077148, + "learning_rate": 1e-06, + "loss": 0.2999, + 
"mean_token_accuracy": 0.893552303314209, + "num_tokens": 666538554.0, + "step": 17469 + }, + { + "epoch": 2.222363566976212, + "ewc_loss": 0.00836347509175539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363475353689864e-05, + "grad_norm": 4.142815113067627, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8795256018638611, + "num_tokens": 666577019.0, + "step": 17470 + }, + { + "epoch": 2.2224907772548024, + "ewc_loss": 0.00838333647698164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.383336535189301e-05, + "grad_norm": 4.1028032302856445, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8959733247756958, + "num_tokens": 666616597.0, + "step": 17471 + }, + { + "epoch": 2.222617987533393, + "ewc_loss": 0.00834745541214943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.347455877810717e-05, + "grad_norm": 4.187448024749756, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8850098252296448, + "num_tokens": 666651523.0, + "step": 17472 + }, + { + "epoch": 2.2227451978119834, + "ewc_loss": 0.008413905277848244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41390501591377e-05, + "grad_norm": 4.167620658874512, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8737977147102356, + "num_tokens": 666689270.0, + "step": 17473 + }, + { + "epoch": 2.2228724080905735, + "ewc_loss": 0.008364555425941944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36455583339557e-05, + "grad_norm": 4.145902156829834, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8794587850570679, + "num_tokens": 666726836.0, + "step": 17474 + }, + { + "epoch": 2.222999618369164, + "ewc_loss": 0.008370788767933846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.370789146283641e-05, + "grad_norm": 4.152871608734131, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.880955696105957, + "num_tokens": 666761207.0, + "step": 17475 + }, + { + "epoch": 
2.2231268286477546, + "ewc_loss": 0.008366101421415806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.366101246792823e-05, + "grad_norm": 4.097174644470215, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.890575647354126, + "num_tokens": 666800293.0, + "step": 17476 + }, + { + "epoch": 2.223254038926345, + "ewc_loss": 0.008346359245479107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346359390998259e-05, + "grad_norm": 4.099603652954102, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8897731900215149, + "num_tokens": 666843046.0, + "step": 17477 + }, + { + "epoch": 2.2233812492049356, + "ewc_loss": 0.008369057439267635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369057468371466e-05, + "grad_norm": 4.105494499206543, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.89453125, + "num_tokens": 666885840.0, + "step": 17478 + }, + { + "epoch": 2.223508459483526, + "ewc_loss": 0.008346072398126125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34607271826826e-05, + "grad_norm": 4.122715473175049, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8827663660049438, + "num_tokens": 666925654.0, + "step": 17479 + }, + { + "epoch": 2.2236356697621167, + "ewc_loss": 0.008348800241947174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348800474777818e-05, + "grad_norm": 4.129818916320801, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8859078884124756, + "num_tokens": 666968098.0, + "step": 17480 + }, + { + "epoch": 2.223762880040707, + "ewc_loss": 0.00833117589354515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331175922648981e-05, + "grad_norm": 4.1017303466796875, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8910006284713745, + "num_tokens": 667010349.0, + "step": 17481 + }, + { + "epoch": 2.2238900903192977, + "ewc_loss": 0.008303046226501465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303046342916787e-05, + 
"grad_norm": 4.139308929443359, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8746618032455444, + "num_tokens": 667047086.0, + "step": 17482 + }, + { + "epoch": 2.2240173005978883, + "ewc_loss": 0.008331608027219772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331608114531264e-05, + "grad_norm": 4.188409328460693, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8661646842956543, + "num_tokens": 667083955.0, + "step": 17483 + }, + { + "epoch": 2.224144510876479, + "ewc_loss": 0.00835481658577919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354816964128986e-05, + "grad_norm": 4.1615519523620605, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8883488178253174, + "num_tokens": 667119751.0, + "step": 17484 + }, + { + "epoch": 2.2242717211550693, + "ewc_loss": 0.008328862488269806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328862895723432e-05, + "grad_norm": 4.203307628631592, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8767132759094238, + "num_tokens": 667150980.0, + "step": 17485 + }, + { + "epoch": 2.22439893143366, + "ewc_loss": 0.008366640657186508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.366641122847795e-05, + "grad_norm": 4.155686378479004, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.880401611328125, + "num_tokens": 667185198.0, + "step": 17486 + }, + { + "epoch": 2.2245261417122504, + "ewc_loss": 0.008332678116858006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.332677680300549e-05, + "grad_norm": 4.111242294311523, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8864161968231201, + "num_tokens": 667225286.0, + "step": 17487 + }, + { + "epoch": 2.224653351990841, + "ewc_loss": 0.008340784348547459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.340784552274272e-05, + "grad_norm": 4.139922618865967, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8765503168106079, + 
"num_tokens": 667266294.0, + "step": 17488 + }, + { + "epoch": 2.2247805622694314, + "ewc_loss": 0.008376308716833591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.376308687729761e-05, + "grad_norm": 4.122507095336914, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8797395825386047, + "num_tokens": 667302430.0, + "step": 17489 + }, + { + "epoch": 2.224907772548022, + "ewc_loss": 0.008343773894011974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34377424325794e-05, + "grad_norm": 4.117618560791016, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8766156435012817, + "num_tokens": 667342881.0, + "step": 17490 + }, + { + "epoch": 2.2250349828266125, + "ewc_loss": 0.008377197198569775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377197082154453e-05, + "grad_norm": 4.137645721435547, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8835376501083374, + "num_tokens": 667376122.0, + "step": 17491 + }, + { + "epoch": 2.225162193105203, + "ewc_loss": 0.008377077989280224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37707775644958e-05, + "grad_norm": 4.1706671714782715, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.874241054058075, + "num_tokens": 667415120.0, + "step": 17492 + }, + { + "epoch": 2.2252894033837936, + "ewc_loss": 0.008399229496717453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399229409405962e-05, + "grad_norm": 4.1290717124938965, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8822305202484131, + "num_tokens": 667451014.0, + "step": 17493 + }, + { + "epoch": 2.225416613662384, + "ewc_loss": 0.008365088142454624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.365088433492929e-05, + "grad_norm": 4.1865434646606445, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8721591234207153, + "num_tokens": 667485600.0, + "step": 17494 + }, + { + "epoch": 2.2255438239409746, + "ewc_loss": 0.00841314997524023, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413149771513417e-05, + "grad_norm": 4.080355167388916, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8780361413955688, + "num_tokens": 667527561.0, + "step": 17495 + }, + { + "epoch": 2.225671034219565, + "ewc_loss": 0.008347447030246258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34744714666158e-05, + "grad_norm": 4.104145050048828, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8889972567558289, + "num_tokens": 667569677.0, + "step": 17496 + }, + { + "epoch": 2.225798244498155, + "ewc_loss": 0.008380933664739132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.380933286389336e-05, + "grad_norm": 4.134519577026367, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8817752599716187, + "num_tokens": 667605954.0, + "step": 17497 + }, + { + "epoch": 2.225925454776746, + "ewc_loss": 0.00840492732822895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.404927211813629e-05, + "grad_norm": 4.116602420806885, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8862325549125671, + "num_tokens": 667645082.0, + "step": 17498 + }, + { + "epoch": 2.2260526650553363, + "ewc_loss": 0.008363707922399044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363708184333518e-05, + "grad_norm": 4.1427130699157715, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8785768151283264, + "num_tokens": 667681390.0, + "step": 17499 + }, + { + "epoch": 2.226179875333927, + "ewc_loss": 0.008393472991883755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39347267174162e-05, + "grad_norm": 4.117344856262207, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8718484044075012, + "num_tokens": 667722772.0, + "step": 17500 + }, + { + "epoch": 2.2263070856125173, + "ewc_loss": 0.008395521901547909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395522309001535e-05, + "grad_norm": 4.1744866371154785, + 
"learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8589193820953369, + "num_tokens": 667758647.0, + "step": 17501 + }, + { + "epoch": 2.226434295891108, + "ewc_loss": 0.0084131034091115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413103205384687e-05, + "grad_norm": 4.1934590339660645, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8829998970031738, + "num_tokens": 667790625.0, + "step": 17502 + }, + { + "epoch": 2.2265615061696984, + "ewc_loss": 0.008415953256189823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415953197982162e-05, + "grad_norm": 4.161487579345703, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.872198224067688, + "num_tokens": 667823724.0, + "step": 17503 + }, + { + "epoch": 2.226688716448289, + "ewc_loss": 0.008389336988329887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389337017433718e-05, + "grad_norm": 4.0702900886535645, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8953583240509033, + "num_tokens": 667862548.0, + "step": 17504 + }, + { + "epoch": 2.2268159267268794, + "ewc_loss": 0.008357963524758816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35796381579712e-05, + "grad_norm": 4.116535186767578, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8747892379760742, + "num_tokens": 667905994.0, + "step": 17505 + }, + { + "epoch": 2.22694313700547, + "ewc_loss": 0.00840942095965147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409420843236148e-05, + "grad_norm": 4.126116752624512, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8769791126251221, + "num_tokens": 667945087.0, + "step": 17506 + }, + { + "epoch": 2.2270703472840605, + "ewc_loss": 0.008416630327701569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.416630589636043e-05, + "grad_norm": 4.1046953201293945, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8903279304504395, + "num_tokens": 667984736.0, + 
"step": 17507 + }, + { + "epoch": 2.227197557562651, + "ewc_loss": 0.008379379287362099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.379379141842946e-05, + "grad_norm": 4.138026714324951, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8773940801620483, + "num_tokens": 668024020.0, + "step": 17508 + }, + { + "epoch": 2.2273247678412416, + "ewc_loss": 0.008428888395428658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428888395428658e-05, + "grad_norm": 4.181647300720215, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8773735165596008, + "num_tokens": 668059567.0, + "step": 17509 + }, + { + "epoch": 2.227451978119832, + "ewc_loss": 0.008422930724918842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.422930841334164e-05, + "grad_norm": 4.102543830871582, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8866629004478455, + "num_tokens": 668099901.0, + "step": 17510 + }, + { + "epoch": 2.2275791883984226, + "ewc_loss": 0.008366992697119713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36699255160056e-05, + "grad_norm": 4.116329193115234, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8946234583854675, + "num_tokens": 668135974.0, + "step": 17511 + }, + { + "epoch": 2.227706398677013, + "ewc_loss": 0.008388703688979149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388704009121284e-05, + "grad_norm": 4.101766109466553, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8838301301002502, + "num_tokens": 668179200.0, + "step": 17512 + }, + { + "epoch": 2.2278336089556037, + "ewc_loss": 0.008379642851650715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37964253150858e-05, + "grad_norm": 4.188387870788574, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8625466823577881, + "num_tokens": 668219683.0, + "step": 17513 + }, + { + "epoch": 2.227960819234194, + "ewc_loss": 0.008421206846833229, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.421206439379603e-05, + "grad_norm": 4.164843559265137, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8841664791107178, + "num_tokens": 668253899.0, + "step": 17514 + }, + { + "epoch": 2.2280880295127847, + "ewc_loss": 0.00835483893752098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354838792001829e-05, + "grad_norm": 4.1244587898254395, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8762251138687134, + "num_tokens": 668294140.0, + "step": 17515 + }, + { + "epoch": 2.2282152397913753, + "ewc_loss": 0.008375059813261032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375059405807406e-05, + "grad_norm": 4.190557479858398, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8883575201034546, + "num_tokens": 668328442.0, + "step": 17516 + }, + { + "epoch": 2.228342450069966, + "ewc_loss": 0.008415743708610535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415743650402874e-05, + "grad_norm": 4.085176467895508, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8855219483375549, + "num_tokens": 668367903.0, + "step": 17517 + }, + { + "epoch": 2.2284696603485563, + "ewc_loss": 0.008344443514943123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344443631358445e-05, + "grad_norm": 4.08319091796875, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8927807807922363, + "num_tokens": 668409889.0, + "step": 17518 + }, + { + "epoch": 2.228596870627147, + "ewc_loss": 0.00837233942002058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.372339652851224e-05, + "grad_norm": 4.113826274871826, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8852753639221191, + "num_tokens": 668451061.0, + "step": 17519 + }, + { + "epoch": 2.2287240809057374, + "ewc_loss": 0.008392161689698696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.392161544179544e-05, + "grad_norm": 4.1547393798828125, + "learning_rate": 1e-06, + "loss": 
0.3972, + "mean_token_accuracy": 0.8605554699897766, + "num_tokens": 668493038.0, + "step": 17520 + }, + { + "epoch": 2.228851291184328, + "ewc_loss": 0.00840798020362854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40798020362854e-05, + "grad_norm": 4.1306352615356445, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8889617919921875, + "num_tokens": 668528087.0, + "step": 17521 + }, + { + "epoch": 2.228978501462918, + "ewc_loss": 0.008375904522836208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375904144486412e-05, + "grad_norm": 4.113166809082031, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8846036195755005, + "num_tokens": 668562325.0, + "step": 17522 + }, + { + "epoch": 2.2291057117415085, + "ewc_loss": 0.008380850777029991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.380851068068296e-05, + "grad_norm": 4.095982551574707, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8856289386749268, + "num_tokens": 668601336.0, + "step": 17523 + }, + { + "epoch": 2.229232922020099, + "ewc_loss": 0.008373197168111801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373196760658175e-05, + "grad_norm": 4.083407402038574, + "learning_rate": 1e-06, + "loss": 0.2816, + "mean_token_accuracy": 0.9000107049942017, + "num_tokens": 668637113.0, + "step": 17524 + }, + { + "epoch": 2.2293601322986896, + "ewc_loss": 0.008380813524127007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.380813233088702e-05, + "grad_norm": 4.195807933807373, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8793627023696899, + "num_tokens": 668675645.0, + "step": 17525 + }, + { + "epoch": 2.22948734257728, + "ewc_loss": 0.008452137932181358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.452137990389019e-05, + "grad_norm": 4.188807964324951, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8828715682029724, + "num_tokens": 668706480.0, + "step": 17526 + }, + { + "epoch": 
2.2296145528558706, + "ewc_loss": 0.0084098931401968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409893052885309e-05, + "grad_norm": 4.168859481811523, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8756773471832275, + "num_tokens": 668743956.0, + "step": 17527 + }, + { + "epoch": 2.229741763134461, + "ewc_loss": 0.008411461487412453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.411461021751165e-05, + "grad_norm": 4.0919294357299805, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8917292356491089, + "num_tokens": 668779134.0, + "step": 17528 + }, + { + "epoch": 2.2298689734130517, + "ewc_loss": 0.008385922759771347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.385922410525382e-05, + "grad_norm": 4.168328762054443, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8836214542388916, + "num_tokens": 668814978.0, + "step": 17529 + }, + { + "epoch": 2.229996183691642, + "ewc_loss": 0.00843895971775055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43895977595821e-05, + "grad_norm": 4.131829261779785, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.868415355682373, + "num_tokens": 668854194.0, + "step": 17530 + }, + { + "epoch": 2.2301233939702327, + "ewc_loss": 0.008401603437960148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40160355437547e-05, + "grad_norm": 4.236790180206299, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.874957799911499, + "num_tokens": 668893095.0, + "step": 17531 + }, + { + "epoch": 2.2302506042488233, + "ewc_loss": 0.008485096506774426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485096623189747e-05, + "grad_norm": 4.1537885665893555, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8880853056907654, + "num_tokens": 668930626.0, + "step": 17532 + }, + { + "epoch": 2.230377814527414, + "ewc_loss": 0.008406138978898525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.406139386352152e-05, + "grad_norm": 4.199004173278809, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8884857892990112, + "num_tokens": 668960941.0, + "step": 17533 + }, + { + "epoch": 2.2305050248060043, + "ewc_loss": 0.008464979939162731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.464980055578053e-05, + "grad_norm": 4.148725509643555, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8859905004501343, + "num_tokens": 668997771.0, + "step": 17534 + }, + { + "epoch": 2.230632235084595, + "ewc_loss": 0.008420847356319427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.420847007073462e-05, + "grad_norm": 4.135941505432129, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8949037790298462, + "num_tokens": 669038714.0, + "step": 17535 + }, + { + "epoch": 2.2307594453631854, + "ewc_loss": 0.008425495587289333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.425495616393164e-05, + "grad_norm": 4.131326198577881, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8919026255607605, + "num_tokens": 669072336.0, + "step": 17536 + }, + { + "epoch": 2.230886655641776, + "ewc_loss": 0.008428514003753662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428514411207289e-05, + "grad_norm": 4.093618869781494, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.8915331363677979, + "num_tokens": 669110303.0, + "step": 17537 + }, + { + "epoch": 2.2310138659203664, + "ewc_loss": 0.00840634386986494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40634384076111e-05, + "grad_norm": 4.149284839630127, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8727560043334961, + "num_tokens": 669146874.0, + "step": 17538 + }, + { + "epoch": 2.231141076198957, + "ewc_loss": 0.008457109332084656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.457109652226791e-05, + "grad_norm": 4.144536972045898, + "learning_rate": 1e-06, + "loss": 0.3091, + 
"mean_token_accuracy": 0.8941486477851868, + "num_tokens": 669181682.0, + "step": 17539 + }, + { + "epoch": 2.2312682864775475, + "ewc_loss": 0.008443371392786503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44337118905969e-05, + "grad_norm": 4.084511756896973, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8814483880996704, + "num_tokens": 669227201.0, + "step": 17540 + }, + { + "epoch": 2.231395496756138, + "ewc_loss": 0.008387667126953602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387667185161263e-05, + "grad_norm": 4.158993244171143, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8829048871994019, + "num_tokens": 669265915.0, + "step": 17541 + }, + { + "epoch": 2.2315227070347285, + "ewc_loss": 0.008492590859532356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492590859532356e-05, + "grad_norm": 4.161048412322998, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8879503607749939, + "num_tokens": 669304130.0, + "step": 17542 + }, + { + "epoch": 2.231649917313319, + "ewc_loss": 0.008434425108134747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.434425399173051e-05, + "grad_norm": 4.142857074737549, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8804968595504761, + "num_tokens": 669344225.0, + "step": 17543 + }, + { + "epoch": 2.2317771275919096, + "ewc_loss": 0.00839825626462698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.398255886277184e-05, + "grad_norm": 4.129159927368164, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8649278283119202, + "num_tokens": 669385126.0, + "step": 17544 + }, + { + "epoch": 2.2319043378705, + "ewc_loss": 0.008389119058847427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389118738705292e-05, + "grad_norm": 4.097917079925537, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8714932203292847, + "num_tokens": 669425774.0, + "step": 17545 + }, + { + "epoch": 
2.2320315481490907, + "ewc_loss": 0.00837130006402731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371299918508157e-05, + "grad_norm": 4.207788944244385, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8733927011489868, + "num_tokens": 669459789.0, + "step": 17546 + }, + { + "epoch": 2.2321587584276807, + "ewc_loss": 0.008450829423964024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450829045614228e-05, + "grad_norm": 4.0839338302612305, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.895262598991394, + "num_tokens": 669498709.0, + "step": 17547 + }, + { + "epoch": 2.2322859687062713, + "ewc_loss": 0.008328502997756004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32850273582153e-05, + "grad_norm": 4.184502601623535, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8881869316101074, + "num_tokens": 669536021.0, + "step": 17548 + }, + { + "epoch": 2.232413178984862, + "ewc_loss": 0.008433717302978039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433717448497191e-05, + "grad_norm": 4.168614387512207, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8729156255722046, + "num_tokens": 669574316.0, + "step": 17549 + }, + { + "epoch": 2.2325403892634523, + "ewc_loss": 0.008388848043978214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388848073082045e-05, + "grad_norm": 4.1353960037231445, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.874190628528595, + "num_tokens": 669611826.0, + "step": 17550 + }, + { + "epoch": 2.232667599542043, + "ewc_loss": 0.008363384753465652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363384404219687e-05, + "grad_norm": 4.09799337387085, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8808596134185791, + "num_tokens": 669655277.0, + "step": 17551 + }, + { + "epoch": 2.2327948098206334, + "ewc_loss": 0.008358405902981758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.358405466424301e-05, + "grad_norm": 4.138655185699463, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8757920265197754, + "num_tokens": 669693481.0, + "step": 17552 + }, + { + "epoch": 2.232922020099224, + "ewc_loss": 0.00838426686823368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.384266402572393e-05, + "grad_norm": 4.182572841644287, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.880077064037323, + "num_tokens": 669724104.0, + "step": 17553 + }, + { + "epoch": 2.2330492303778144, + "ewc_loss": 0.00839712843298912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.397128112846985e-05, + "grad_norm": 4.110297679901123, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8653713464736938, + "num_tokens": 669768725.0, + "step": 17554 + }, + { + "epoch": 2.233176440656405, + "ewc_loss": 0.008347196504473686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34719612612389e-05, + "grad_norm": 4.09092378616333, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8896225094795227, + "num_tokens": 669809508.0, + "step": 17555 + }, + { + "epoch": 2.2333036509349955, + "ewc_loss": 0.008356229402124882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356229227501899e-05, + "grad_norm": 4.119270324707031, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8832268714904785, + "num_tokens": 669848047.0, + "step": 17556 + }, + { + "epoch": 2.233430861213586, + "ewc_loss": 0.00836622528731823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.366224938072264e-05, + "grad_norm": 4.115979194641113, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8964453935623169, + "num_tokens": 669881274.0, + "step": 17557 + }, + { + "epoch": 2.2335580714921766, + "ewc_loss": 0.00837266631424427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.372666343348101e-05, + "grad_norm": 4.1397881507873535, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 
0.8760920763015747, + "num_tokens": 669918517.0, + "step": 17558 + }, + { + "epoch": 2.233685281770767, + "ewc_loss": 0.008390414528548717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390414586756378e-05, + "grad_norm": 4.148275375366211, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8739117383956909, + "num_tokens": 669960990.0, + "step": 17559 + }, + { + "epoch": 2.2338124920493576, + "ewc_loss": 0.008404037915170193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.404038089793175e-05, + "grad_norm": 4.194013595581055, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.887165904045105, + "num_tokens": 669995542.0, + "step": 17560 + }, + { + "epoch": 2.233939702327948, + "ewc_loss": 0.008402845822274685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.402845560340211e-05, + "grad_norm": 4.057828903198242, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8826907873153687, + "num_tokens": 670036798.0, + "step": 17561 + }, + { + "epoch": 2.2340669126065387, + "ewc_loss": 0.008315136656165123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315136801684275e-05, + "grad_norm": 4.181621551513672, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8813946843147278, + "num_tokens": 670071763.0, + "step": 17562 + }, + { + "epoch": 2.234194122885129, + "ewc_loss": 0.008437956683337688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437956421403214e-05, + "grad_norm": 4.227008819580078, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8809882402420044, + "num_tokens": 670100864.0, + "step": 17563 + }, + { + "epoch": 2.2343213331637197, + "ewc_loss": 0.008428424596786499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428424916928634e-05, + "grad_norm": 4.108330249786377, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8710860013961792, + "num_tokens": 670144114.0, + "step": 17564 + }, + { + "epoch": 2.2344485434423103, + "ewc_loss": 
0.008354476653039455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354476449312642e-05, + "grad_norm": 4.12941837310791, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8861405849456787, + "num_tokens": 670182719.0, + "step": 17565 + }, + { + "epoch": 2.234575753720901, + "ewc_loss": 0.008412737399339676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.412737224716693e-05, + "grad_norm": 4.126556396484375, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8849391937255859, + "num_tokens": 670218059.0, + "step": 17566 + }, + { + "epoch": 2.2347029639994913, + "ewc_loss": 0.008403581567108631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.403581159655005e-05, + "grad_norm": 4.09372091293335, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8790897130966187, + "num_tokens": 670259589.0, + "step": 17567 + }, + { + "epoch": 2.234830174278082, + "ewc_loss": 0.008390192873775959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390192670049146e-05, + "grad_norm": 4.127921104431152, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.873461902141571, + "num_tokens": 670301755.0, + "step": 17568 + }, + { + "epoch": 2.2349573845566724, + "ewc_loss": 0.00841622706502676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.416226773988456e-05, + "grad_norm": 4.0997314453125, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8844731450080872, + "num_tokens": 670339205.0, + "step": 17569 + }, + { + "epoch": 2.235084594835263, + "ewc_loss": 0.008377145044505596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377144695259631e-05, + "grad_norm": 4.105108737945557, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8844550251960754, + "num_tokens": 670382961.0, + "step": 17570 + }, + { + "epoch": 2.2352118051138534, + "ewc_loss": 0.008407140150666237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407139830524102e-05, + "grad_norm": 4.187942028045654, 
+ "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8720728158950806, + "num_tokens": 670420747.0, + "step": 17571 + }, + { + "epoch": 2.2353390153924435, + "ewc_loss": 0.008424315601587296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.424315456068143e-05, + "grad_norm": 4.112314701080322, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.896501898765564, + "num_tokens": 670457906.0, + "step": 17572 + }, + { + "epoch": 2.235466225671034, + "ewc_loss": 0.00832793116569519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.327930845553055e-05, + "grad_norm": 4.180629730224609, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8716105222702026, + "num_tokens": 670493187.0, + "step": 17573 + }, + { + "epoch": 2.2355934359496246, + "ewc_loss": 0.008438972756266594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438972872681916e-05, + "grad_norm": 4.134397983551025, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8941060900688171, + "num_tokens": 670529750.0, + "step": 17574 + }, + { + "epoch": 2.235720646228215, + "ewc_loss": 0.008355218917131424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355218596989289e-05, + "grad_norm": 4.120427131652832, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8793935775756836, + "num_tokens": 670570147.0, + "step": 17575 + }, + { + "epoch": 2.2358478565068056, + "ewc_loss": 0.008368579670786858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.368579437956214e-05, + "grad_norm": 4.1444621086120605, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8826743364334106, + "num_tokens": 670604113.0, + "step": 17576 + }, + { + "epoch": 2.235975066785396, + "ewc_loss": 0.008403634652495384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40363500174135e-05, + "grad_norm": 4.144797325134277, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8817709684371948, + "num_tokens": 670645451.0, + 
"step": 17577 + }, + { + "epoch": 2.2361022770639867, + "ewc_loss": 0.008385024033486843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38502382976003e-05, + "grad_norm": 4.098071098327637, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8853026628494263, + "num_tokens": 670684055.0, + "step": 17578 + }, + { + "epoch": 2.236229487342577, + "ewc_loss": 0.008362773805856705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362773951375857e-05, + "grad_norm": 4.114872455596924, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8729236125946045, + "num_tokens": 670724397.0, + "step": 17579 + }, + { + "epoch": 2.2363566976211677, + "ewc_loss": 0.008388468995690346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388468995690346e-05, + "grad_norm": 4.145338535308838, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8908581733703613, + "num_tokens": 670761282.0, + "step": 17580 + }, + { + "epoch": 2.2364839078997583, + "ewc_loss": 0.008386761881411076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.386762056034058e-05, + "grad_norm": 4.098385334014893, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8866389989852905, + "num_tokens": 670800007.0, + "step": 17581 + }, + { + "epoch": 2.236611118178349, + "ewc_loss": 0.008344728499650955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344728848896921e-05, + "grad_norm": 4.136301040649414, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8784788250923157, + "num_tokens": 670840040.0, + "step": 17582 + }, + { + "epoch": 2.2367383284569393, + "ewc_loss": 0.008388210088014603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388209971599281e-05, + "grad_norm": 4.13071346282959, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8754210472106934, + "num_tokens": 670881193.0, + "step": 17583 + }, + { + "epoch": 2.23686553873553, + "ewc_loss": 0.00835589412599802, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.355893805855885e-05, + "grad_norm": 4.107530117034912, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8836250305175781, + "num_tokens": 670927275.0, + "step": 17584 + }, + { + "epoch": 2.2369927490141204, + "ewc_loss": 0.008328628726303577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328628609888256e-05, + "grad_norm": 4.094943046569824, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8843784332275391, + "num_tokens": 670967327.0, + "step": 17585 + }, + { + "epoch": 2.237119959292711, + "ewc_loss": 0.008331669494509697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331669232575223e-05, + "grad_norm": 4.116364479064941, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8846192359924316, + "num_tokens": 671009250.0, + "step": 17586 + }, + { + "epoch": 2.2372471695713014, + "ewc_loss": 0.008326279930770397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326279930770397e-05, + "grad_norm": 4.099625110626221, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8862892985343933, + "num_tokens": 671045068.0, + "step": 17587 + }, + { + "epoch": 2.237374379849892, + "ewc_loss": 0.008320233784615993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.320233609993011e-05, + "grad_norm": 4.138885021209717, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.86947101354599, + "num_tokens": 671084786.0, + "step": 17588 + }, + { + "epoch": 2.2375015901284825, + "ewc_loss": 0.008352069184184074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35206956253387e-05, + "grad_norm": 4.167795658111572, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8759206533432007, + "num_tokens": 671123235.0, + "step": 17589 + }, + { + "epoch": 2.237628800407073, + "ewc_loss": 0.008350275456905365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350275311386213e-05, + "grad_norm": 4.117715835571289, + "learning_rate": 1e-06, + "loss": 0.2975, 
+ "mean_token_accuracy": 0.8967567682266235, + "num_tokens": 671161702.0, + "step": 17590 + }, + { + "epoch": 2.2377560106856635, + "ewc_loss": 0.008314007893204689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314008300658315e-05, + "grad_norm": 4.198397636413574, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8917985558509827, + "num_tokens": 671195487.0, + "step": 17591 + }, + { + "epoch": 2.237883220964254, + "ewc_loss": 0.008372618816792965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.372619049623609e-05, + "grad_norm": 4.1628546714782715, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8787599802017212, + "num_tokens": 671232288.0, + "step": 17592 + }, + { + "epoch": 2.2380104312428446, + "ewc_loss": 0.008317889645695686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.317889296449721e-05, + "grad_norm": 4.0989155769348145, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8673835396766663, + "num_tokens": 671275154.0, + "step": 17593 + }, + { + "epoch": 2.238137641521435, + "ewc_loss": 0.008298734202980995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298734610434622e-05, + "grad_norm": 4.142452239990234, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8736974000930786, + "num_tokens": 671312266.0, + "step": 17594 + }, + { + "epoch": 2.238264851800025, + "ewc_loss": 0.008359762839972973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.359762432519346e-05, + "grad_norm": 4.142348289489746, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8824276328086853, + "num_tokens": 671349690.0, + "step": 17595 + }, + { + "epoch": 2.238392062078616, + "ewc_loss": 0.008339337073266506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33933663670905e-05, + "grad_norm": 4.095876693725586, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8866376876831055, + "num_tokens": 671387898.0, + "step": 17596 + }, + { + "epoch": 
2.2385192723572063, + "ewc_loss": 0.00832503754645586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.325037197209895e-05, + "grad_norm": 4.140198707580566, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8613228797912598, + "num_tokens": 671428054.0, + "step": 17597 + }, + { + "epoch": 2.238646482635797, + "ewc_loss": 0.00838510226458311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.385102410102263e-05, + "grad_norm": 4.141747951507568, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8890469074249268, + "num_tokens": 671467096.0, + "step": 17598 + }, + { + "epoch": 2.2387736929143873, + "ewc_loss": 0.008366992697119713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36699255160056e-05, + "grad_norm": 4.097105026245117, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8806571364402771, + "num_tokens": 671506752.0, + "step": 17599 + }, + { + "epoch": 2.238900903192978, + "ewc_loss": 0.008337671868503094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.337671897606924e-05, + "grad_norm": 4.172065258026123, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8776254653930664, + "num_tokens": 671542403.0, + "step": 17600 + }, + { + "epoch": 2.2390281134715684, + "ewc_loss": 0.00840326864272356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.403269021073356e-05, + "grad_norm": 4.121609210968018, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.893944263458252, + "num_tokens": 671578115.0, + "step": 17601 + }, + { + "epoch": 2.239155323750159, + "ewc_loss": 0.008367963135242462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.367963164346293e-05, + "grad_norm": 4.099042892456055, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8923302888870239, + "num_tokens": 671619854.0, + "step": 17602 + }, + { + "epoch": 2.2392825340287494, + "ewc_loss": 0.008371351286768913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.371351577807218e-05, + "grad_norm": 4.075721263885498, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8938170671463013, + "num_tokens": 671661617.0, + "step": 17603 + }, + { + "epoch": 2.23940974430734, + "ewc_loss": 0.008377917110919952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377917401958257e-05, + "grad_norm": 4.137143611907959, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8715018033981323, + "num_tokens": 671702231.0, + "step": 17604 + }, + { + "epoch": 2.2395369545859305, + "ewc_loss": 0.008406112901866436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40611319290474e-05, + "grad_norm": 4.114006519317627, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8810541033744812, + "num_tokens": 671741483.0, + "step": 17605 + }, + { + "epoch": 2.239664164864521, + "ewc_loss": 0.008368582464754581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36858234833926e-05, + "grad_norm": 4.147761344909668, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8788466453552246, + "num_tokens": 671780168.0, + "step": 17606 + }, + { + "epoch": 2.2397913751431116, + "ewc_loss": 0.00840100646018982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401006198255345e-05, + "grad_norm": 4.173909664154053, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8604531288146973, + "num_tokens": 671818980.0, + "step": 17607 + }, + { + "epoch": 2.239918585421702, + "ewc_loss": 0.008388345129787922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388345304410905e-05, + "grad_norm": 4.195724964141846, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8803741335868835, + "num_tokens": 671850010.0, + "step": 17608 + }, + { + "epoch": 2.2400457957002926, + "ewc_loss": 0.008406179025769234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.406178676523268e-05, + "grad_norm": 4.19391393661499, + "learning_rate": 1e-06, + "loss": 0.3588, + 
"mean_token_accuracy": 0.873860239982605, + "num_tokens": 671888506.0, + "step": 17609 + }, + { + "epoch": 2.240173005978883, + "ewc_loss": 0.00840059481561184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400594379054382e-05, + "grad_norm": 4.074519634246826, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8894058465957642, + "num_tokens": 671930965.0, + "step": 17610 + }, + { + "epoch": 2.2403002162574737, + "ewc_loss": 0.008343599736690521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3435996202752e-05, + "grad_norm": 4.1403608322143555, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8903283476829529, + "num_tokens": 671967472.0, + "step": 17611 + }, + { + "epoch": 2.240427426536064, + "ewc_loss": 0.008414779789745808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.414779586018994e-05, + "grad_norm": 4.06753396987915, + "learning_rate": 1e-06, + "loss": 0.274, + "mean_token_accuracy": 0.9026306867599487, + "num_tokens": 672005986.0, + "step": 17612 + }, + { + "epoch": 2.2405546368146547, + "ewc_loss": 0.008353661745786667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.353661542059854e-05, + "grad_norm": 4.110543251037598, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8834065794944763, + "num_tokens": 672044729.0, + "step": 17613 + }, + { + "epoch": 2.2406818470932452, + "ewc_loss": 0.008432636968791485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432636968791485e-05, + "grad_norm": 4.155102729797363, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8829042911529541, + "num_tokens": 672080141.0, + "step": 17614 + }, + { + "epoch": 2.2408090573718358, + "ewc_loss": 0.008425571024417877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.425571286352351e-05, + "grad_norm": 4.09000825881958, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8891645073890686, + "num_tokens": 672119277.0, + "step": 17615 + }, + { + "epoch": 
2.2409362676504263, + "ewc_loss": 0.008382396772503853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38239720906131e-05, + "grad_norm": 4.16056489944458, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8806980848312378, + "num_tokens": 672154205.0, + "step": 17616 + }, + { + "epoch": 2.241063477929017, + "ewc_loss": 0.008449442684650421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.449442975688726e-05, + "grad_norm": 4.168118953704834, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8817948698997498, + "num_tokens": 672193667.0, + "step": 17617 + }, + { + "epoch": 2.2411906882076074, + "ewc_loss": 0.008402816951274872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.402817184105515e-05, + "grad_norm": 4.190526008605957, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8542396426200867, + "num_tokens": 672232360.0, + "step": 17618 + }, + { + "epoch": 2.241317898486198, + "ewc_loss": 0.008419015444815159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.419015648541972e-05, + "grad_norm": 4.189260005950928, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8701224327087402, + "num_tokens": 672264127.0, + "step": 17619 + }, + { + "epoch": 2.241445108764788, + "ewc_loss": 0.008437519893050194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437519863946363e-05, + "grad_norm": 4.108351707458496, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8901579976081848, + "num_tokens": 672300719.0, + "step": 17620 + }, + { + "epoch": 2.2415723190433785, + "ewc_loss": 0.008404779247939587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.404779509874061e-05, + "grad_norm": 4.086268901824951, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8912557363510132, + "num_tokens": 672342796.0, + "step": 17621 + }, + { + "epoch": 2.241699529321969, + "ewc_loss": 0.008395419456064701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.395419717999175e-05, + "grad_norm": 4.120110988616943, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8840356469154358, + "num_tokens": 672382597.0, + "step": 17622 + }, + { + "epoch": 2.2418267396005596, + "ewc_loss": 0.00843018013983965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.430179877905175e-05, + "grad_norm": 4.082675933837891, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.87134850025177, + "num_tokens": 672427350.0, + "step": 17623 + }, + { + "epoch": 2.24195394987915, + "ewc_loss": 0.008405911736190319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.405911648878828e-05, + "grad_norm": 4.140629291534424, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8760697841644287, + "num_tokens": 672468997.0, + "step": 17624 + }, + { + "epoch": 2.2420811601577406, + "ewc_loss": 0.008440081030130386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.440081001026556e-05, + "grad_norm": 4.118508815765381, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8884806632995605, + "num_tokens": 672505061.0, + "step": 17625 + }, + { + "epoch": 2.242208370436331, + "ewc_loss": 0.008407515473663807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407515269936994e-05, + "grad_norm": 4.117575168609619, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8803730010986328, + "num_tokens": 672544241.0, + "step": 17626 + }, + { + "epoch": 2.2423355807149217, + "ewc_loss": 0.008404172956943512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.404173422604799e-05, + "grad_norm": 4.103468418121338, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8804618120193481, + "num_tokens": 672586334.0, + "step": 17627 + }, + { + "epoch": 2.242462790993512, + "ewc_loss": 0.008394057862460613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3940576587338e-05, + "grad_norm": 4.114530563354492, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 
0.8793637156486511, + "num_tokens": 672628627.0, + "step": 17628 + }, + { + "epoch": 2.2425900012721027, + "ewc_loss": 0.008400781080126762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400781371165067e-05, + "grad_norm": 4.1530866622924805, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8732621073722839, + "num_tokens": 672663046.0, + "step": 17629 + }, + { + "epoch": 2.2427172115506933, + "ewc_loss": 0.008406875655055046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.406875713262707e-05, + "grad_norm": 4.075110912322998, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8857101202011108, + "num_tokens": 672703394.0, + "step": 17630 + }, + { + "epoch": 2.242844421829284, + "ewc_loss": 0.008353753946721554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.353753946721554e-05, + "grad_norm": 4.120546817779541, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8736146092414856, + "num_tokens": 672743262.0, + "step": 17631 + }, + { + "epoch": 2.2429716321078743, + "ewc_loss": 0.00840205792337656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.402057574130595e-05, + "grad_norm": 4.179108142852783, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8523693680763245, + "num_tokens": 672782662.0, + "step": 17632 + }, + { + "epoch": 2.243098842386465, + "ewc_loss": 0.00840845238417387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4084524132777e-05, + "grad_norm": 4.22175931930542, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8789092302322388, + "num_tokens": 672818400.0, + "step": 17633 + }, + { + "epoch": 2.2432260526650554, + "ewc_loss": 0.008412222377955914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41222281451337e-05, + "grad_norm": 4.088374614715576, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8933818340301514, + "num_tokens": 672858843.0, + "step": 17634 + }, + { + "epoch": 2.243353262943646, + "ewc_loss": 
0.008315999060869217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315999002661556e-05, + "grad_norm": 4.111304759979248, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8828152418136597, + "num_tokens": 672900397.0, + "step": 17635 + }, + { + "epoch": 2.2434804732222364, + "ewc_loss": 0.008386253379285336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.386253466596827e-05, + "grad_norm": 4.11337423324585, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8799421191215515, + "num_tokens": 672936025.0, + "step": 17636 + }, + { + "epoch": 2.243607683500827, + "ewc_loss": 0.008354760706424713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354760939255357e-05, + "grad_norm": 4.120765209197998, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8636981248855591, + "num_tokens": 672979180.0, + "step": 17637 + }, + { + "epoch": 2.2437348937794175, + "ewc_loss": 0.0083653274923563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.365327812498435e-05, + "grad_norm": 4.118719577789307, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8897613286972046, + "num_tokens": 673018053.0, + "step": 17638 + }, + { + "epoch": 2.243862104058008, + "ewc_loss": 0.008355553261935711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355553291039541e-05, + "grad_norm": 4.152665615081787, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8642655611038208, + "num_tokens": 673057596.0, + "step": 17639 + }, + { + "epoch": 2.2439893143365985, + "ewc_loss": 0.008371355012059212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371355215786025e-05, + "grad_norm": 4.086465835571289, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8934326171875, + "num_tokens": 673097740.0, + "step": 17640 + }, + { + "epoch": 2.244116524615189, + "ewc_loss": 0.008323218673467636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323218207806349e-05, + "grad_norm": 4.083031177520752, 
+ "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8858635425567627, + "num_tokens": 673141821.0, + "step": 17641 + }, + { + "epoch": 2.2442437348937796, + "ewc_loss": 0.00834342185407877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343422086909413e-05, + "grad_norm": 4.079174518585205, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8903544545173645, + "num_tokens": 673184279.0, + "step": 17642 + }, + { + "epoch": 2.24437094517237, + "ewc_loss": 0.008326678536832333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.326678653247654e-05, + "grad_norm": 4.112821102142334, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8916817903518677, + "num_tokens": 673224328.0, + "step": 17643 + }, + { + "epoch": 2.2444981554509607, + "ewc_loss": 0.008353053592145443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.353053272003308e-05, + "grad_norm": 4.185686111450195, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8681528568267822, + "num_tokens": 673263328.0, + "step": 17644 + }, + { + "epoch": 2.2446253657295507, + "ewc_loss": 0.008351124823093414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35112514323555e-05, + "grad_norm": 4.151524066925049, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8842719793319702, + "num_tokens": 673301663.0, + "step": 17645 + }, + { + "epoch": 2.2447525760081413, + "ewc_loss": 0.008307206444442272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.307206735480577e-05, + "grad_norm": 4.141428470611572, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8791115283966064, + "num_tokens": 673340606.0, + "step": 17646 + }, + { + "epoch": 2.244879786286732, + "ewc_loss": 0.008299960754811764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.299960609292611e-05, + "grad_norm": 4.086400985717773, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8798372149467468, + "num_tokens": 673382201.0, + 
"step": 17647 + }, + { + "epoch": 2.2450069965653223, + "ewc_loss": 0.008275844156742096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.2758444477804e-05, + "grad_norm": 4.094912052154541, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8911588191986084, + "num_tokens": 673423330.0, + "step": 17648 + }, + { + "epoch": 2.245134206843913, + "ewc_loss": 0.008305015042424202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.305015217047185e-05, + "grad_norm": 4.102147579193115, + "learning_rate": 1e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.897028923034668, + "num_tokens": 673459690.0, + "step": 17649 + }, + { + "epoch": 2.2452614171225034, + "ewc_loss": 0.008305290713906288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.305290975840762e-05, + "grad_norm": 4.178664684295654, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8777409791946411, + "num_tokens": 673494827.0, + "step": 17650 + }, + { + "epoch": 2.245388627401094, + "ewc_loss": 0.008338947780430317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33894737297669e-05, + "grad_norm": 4.118077754974365, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8700217604637146, + "num_tokens": 673540568.0, + "step": 17651 + }, + { + "epoch": 2.2455158376796844, + "ewc_loss": 0.008266991935670376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.266991790151224e-05, + "grad_norm": 4.148924827575684, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8964189291000366, + "num_tokens": 673574022.0, + "step": 17652 + }, + { + "epoch": 2.245643047958275, + "ewc_loss": 0.008320382796227932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.320382767124102e-05, + "grad_norm": 4.159189701080322, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8808778524398804, + "num_tokens": 673612493.0, + "step": 17653 + }, + { + "epoch": 2.2457702582368655, + "ewc_loss": 0.008322765119373798, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.322764915646985e-05, + "grad_norm": 4.109882354736328, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8780608773231506, + "num_tokens": 673653729.0, + "step": 17654 + }, + { + "epoch": 2.245897468515456, + "ewc_loss": 0.008290575817227364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.290575351566076e-05, + "grad_norm": 4.074533939361572, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8934465646743774, + "num_tokens": 673695626.0, + "step": 17655 + }, + { + "epoch": 2.2460246787940465, + "ewc_loss": 0.008297096006572247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.297096064779907e-05, + "grad_norm": 4.1551618576049805, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8871234059333801, + "num_tokens": 673732188.0, + "step": 17656 + }, + { + "epoch": 2.246151889072637, + "ewc_loss": 0.008342377841472626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.342377986991778e-05, + "grad_norm": 4.149782657623291, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8730624914169312, + "num_tokens": 673770259.0, + "step": 17657 + }, + { + "epoch": 2.2462790993512276, + "ewc_loss": 0.008294878527522087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.294878352899104e-05, + "grad_norm": 4.15915584564209, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8780801892280579, + "num_tokens": 673805018.0, + "step": 17658 + }, + { + "epoch": 2.246406309629818, + "ewc_loss": 0.008335785940289497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.335785969393328e-05, + "grad_norm": 4.157403945922852, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8773029446601868, + "num_tokens": 673846058.0, + "step": 17659 + }, + { + "epoch": 2.2465335199084087, + "ewc_loss": 0.008303164504468441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303164941025898e-05, + "grad_norm": 4.149013519287109, + "learning_rate": 1e-06, + "loss": 
0.3761, + "mean_token_accuracy": 0.8684805631637573, + "num_tokens": 673886437.0, + "step": 17660 + }, + { + "epoch": 2.246660730186999, + "ewc_loss": 0.008325603790581226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32560399430804e-05, + "grad_norm": 4.123183250427246, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8855608105659485, + "num_tokens": 673925701.0, + "step": 17661 + }, + { + "epoch": 2.2467879404655897, + "ewc_loss": 0.008298048749566078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298048487631604e-05, + "grad_norm": 4.102391719818115, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8852230310440063, + "num_tokens": 673969854.0, + "step": 17662 + }, + { + "epoch": 2.2469151507441802, + "ewc_loss": 0.008300507441163063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.300507033709437e-05, + "grad_norm": 4.1527323722839355, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8836939930915833, + "num_tokens": 674005048.0, + "step": 17663 + }, + { + "epoch": 2.2470423610227708, + "ewc_loss": 0.008342776447534561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.342775981873274e-05, + "grad_norm": 4.110020160675049, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.876503586769104, + "num_tokens": 674046548.0, + "step": 17664 + }, + { + "epoch": 2.2471695713013613, + "ewc_loss": 0.008301359601318836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.301359775941819e-05, + "grad_norm": 4.161262512207031, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8776566386222839, + "num_tokens": 674086905.0, + "step": 17665 + }, + { + "epoch": 2.247296781579952, + "ewc_loss": 0.008339963853359222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.339963824255392e-05, + "grad_norm": 4.094607830047607, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.89034104347229, + "num_tokens": 674124589.0, + "step": 17666 + }, + { + "epoch": 
2.2474239918585424, + "ewc_loss": 0.008292261511087418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.292261190945283e-05, + "grad_norm": 4.120277404785156, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8931537866592407, + "num_tokens": 674165135.0, + "step": 17667 + }, + { + "epoch": 2.247551202137133, + "ewc_loss": 0.00831567496061325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315675222547725e-05, + "grad_norm": 4.13769006729126, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8760970830917358, + "num_tokens": 674207269.0, + "step": 17668 + }, + { + "epoch": 2.2476784124157234, + "ewc_loss": 0.008314748294651508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.314748265547678e-05, + "grad_norm": 4.121641159057617, + "learning_rate": 1e-06, + "loss": 0.2909, + "mean_token_accuracy": 0.8963073492050171, + "num_tokens": 674240806.0, + "step": 17669 + }, + { + "epoch": 2.2478056226943135, + "ewc_loss": 0.008311453275382519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.311452984344214e-05, + "grad_norm": 4.199184417724609, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8746770620346069, + "num_tokens": 674274697.0, + "step": 17670 + }, + { + "epoch": 2.247932832972904, + "ewc_loss": 0.008358527906239033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35852770251222e-05, + "grad_norm": 4.140446662902832, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8713967800140381, + "num_tokens": 674311362.0, + "step": 17671 + }, + { + "epoch": 2.2480600432514946, + "ewc_loss": 0.0082984184846282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.298418106278405e-05, + "grad_norm": 4.110880374908447, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8881379961967468, + "num_tokens": 674352505.0, + "step": 17672 + }, + { + "epoch": 2.248187253530085, + "ewc_loss": 0.008322752080857754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.32275181892328e-05, + "grad_norm": 4.15846586227417, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8873826265335083, + "num_tokens": 674383938.0, + "step": 17673 + }, + { + "epoch": 2.2483144638086756, + "ewc_loss": 0.008376223035156727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.376222831429914e-05, + "grad_norm": 4.106607913970947, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8780572414398193, + "num_tokens": 674421665.0, + "step": 17674 + }, + { + "epoch": 2.248441674087266, + "ewc_loss": 0.008330589160323143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330588752869517e-05, + "grad_norm": 4.07063627243042, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8829476833343506, + "num_tokens": 674466065.0, + "step": 17675 + }, + { + "epoch": 2.2485688843658567, + "ewc_loss": 0.008326138369739056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32613804959692e-05, + "grad_norm": 4.090514659881592, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8825235962867737, + "num_tokens": 674506868.0, + "step": 17676 + }, + { + "epoch": 2.248696094644447, + "ewc_loss": 0.008362218737602234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362218795809895e-05, + "grad_norm": 4.131614685058594, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.87798011302948, + "num_tokens": 674545756.0, + "step": 17677 + }, + { + "epoch": 2.2488233049230377, + "ewc_loss": 0.008359134197235107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.359134517377242e-05, + "grad_norm": 4.107355117797852, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8772167563438416, + "num_tokens": 674590680.0, + "step": 17678 + }, + { + "epoch": 2.2489505152016283, + "ewc_loss": 0.008333022706210613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333022560691461e-05, + "grad_norm": 4.108254909515381, + "learning_rate": 1e-06, + "loss": 0.3165, + 
"mean_token_accuracy": 0.8863285779953003, + "num_tokens": 674631624.0, + "step": 17679 + }, + { + "epoch": 2.249077725480219, + "ewc_loss": 0.008343704044818878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343703666469082e-05, + "grad_norm": 4.158843517303467, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8785657286643982, + "num_tokens": 674667365.0, + "step": 17680 + }, + { + "epoch": 2.2492049357588093, + "ewc_loss": 0.008360116742551327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360116771655157e-05, + "grad_norm": 4.111064434051514, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8797131776809692, + "num_tokens": 674706587.0, + "step": 17681 + }, + { + "epoch": 2.2493321460374, + "ewc_loss": 0.00832324754446745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323247311636806e-05, + "grad_norm": 4.140747547149658, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8809984922409058, + "num_tokens": 674744480.0, + "step": 17682 + }, + { + "epoch": 2.2494593563159904, + "ewc_loss": 0.00836972240358591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369722490897402e-05, + "grad_norm": 4.110805988311768, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8810466527938843, + "num_tokens": 674783104.0, + "step": 17683 + }, + { + "epoch": 2.249586566594581, + "ewc_loss": 0.008320392109453678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.320392225869e-05, + "grad_norm": 4.130990982055664, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8766034841537476, + "num_tokens": 674823488.0, + "step": 17684 + }, + { + "epoch": 2.2497137768731714, + "ewc_loss": 0.008352844044566154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352844452019781e-05, + "grad_norm": 4.13038969039917, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8876297473907471, + "num_tokens": 674857322.0, + "step": 17685 + }, + { + "epoch": 2.249840987151762, 
+ "ewc_loss": 0.008341487497091293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.341487409779802e-05, + "grad_norm": 4.10101318359375, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8730977773666382, + "num_tokens": 674901566.0, + "step": 17686 + }, + { + "epoch": 2.2499681974303525, + "ewc_loss": 0.008333123289048672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333123696502298e-05, + "grad_norm": 4.082369804382324, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8863568305969238, + "num_tokens": 674942803.0, + "step": 17687 + }, + { + "epoch": 2.250095407708943, + "ewc_loss": 0.008324755355715752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324755617650226e-05, + "grad_norm": 4.138768196105957, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8737889528274536, + "num_tokens": 674983160.0, + "step": 17688 + }, + { + "epoch": 2.2502226179875335, + "ewc_loss": 0.008374677039682865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.374677418032661e-05, + "grad_norm": 4.145337104797363, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8833831548690796, + "num_tokens": 675021243.0, + "step": 17689 + }, + { + "epoch": 2.250349828266124, + "ewc_loss": 0.008350933901965618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350933785550296e-05, + "grad_norm": 4.120800971984863, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8781355619430542, + "num_tokens": 675059435.0, + "step": 17690 + }, + { + "epoch": 2.2504770385447146, + "ewc_loss": 0.00833872053772211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.338720363099128e-05, + "grad_norm": 4.1905622482299805, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8841021656990051, + "num_tokens": 675091689.0, + "step": 17691 + }, + { + "epoch": 2.250604248823305, + "ewc_loss": 0.008397641591727734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.397641795454547e-05, + "grad_norm": 
4.140100002288818, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.877310037612915, + "num_tokens": 675130368.0, + "step": 17692 + }, + { + "epoch": 2.250731459101895, + "ewc_loss": 0.008322340436279774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.322339999722317e-05, + "grad_norm": 4.088164806365967, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8854754567146301, + "num_tokens": 675171745.0, + "step": 17693 + }, + { + "epoch": 2.250858669380486, + "ewc_loss": 0.008338193409144878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.338193583767861e-05, + "grad_norm": 4.170886516571045, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8802236914634705, + "num_tokens": 675206052.0, + "step": 17694 + }, + { + "epoch": 2.2509858796590763, + "ewc_loss": 0.008397960104048252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.397959754802287e-05, + "grad_norm": 4.137203216552734, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8783040046691895, + "num_tokens": 675243231.0, + "step": 17695 + }, + { + "epoch": 2.251113089937667, + "ewc_loss": 0.008356000296771526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356000034837052e-05, + "grad_norm": 4.252844333648682, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8740321397781372, + "num_tokens": 675273582.0, + "step": 17696 + }, + { + "epoch": 2.2512403002162573, + "ewc_loss": 0.008425462059676647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.425462146988139e-05, + "grad_norm": 4.099721908569336, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8787920475006104, + "num_tokens": 675314247.0, + "step": 17697 + }, + { + "epoch": 2.251367510494848, + "ewc_loss": 0.008333965204656124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333965524798259e-05, + "grad_norm": 4.1088690757751465, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8847953081130981, + 
"num_tokens": 675355675.0, + "step": 17698 + }, + { + "epoch": 2.2514947207734384, + "ewc_loss": 0.008381133899092674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.381134102819487e-05, + "grad_norm": 4.121639728546143, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8806722164154053, + "num_tokens": 675394290.0, + "step": 17699 + }, + { + "epoch": 2.251621931052029, + "ewc_loss": 0.00839480385184288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.394804171985015e-05, + "grad_norm": 4.1193461418151855, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.883950412273407, + "num_tokens": 675434251.0, + "step": 17700 + }, + { + "epoch": 2.2517491413306194, + "ewc_loss": 0.008386407047510147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.386406989302486e-05, + "grad_norm": 4.136072158813477, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8871316909790039, + "num_tokens": 675474731.0, + "step": 17701 + }, + { + "epoch": 2.25187635160921, + "ewc_loss": 0.008391953073441982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391952724196017e-05, + "grad_norm": 4.118426322937012, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8771402835845947, + "num_tokens": 675515180.0, + "step": 17702 + }, + { + "epoch": 2.2520035618878005, + "ewc_loss": 0.008376185782253742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.376185724046081e-05, + "grad_norm": 4.137423992156982, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8715108036994934, + "num_tokens": 675553429.0, + "step": 17703 + }, + { + "epoch": 2.252130772166391, + "ewc_loss": 0.008395415730774403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395415352424607e-05, + "grad_norm": 4.125487327575684, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8857393264770508, + "num_tokens": 675592919.0, + "step": 17704 + }, + { + "epoch": 2.2522579824449815, + "ewc_loss": 0.008372270502150059, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.372270531253889e-05, + "grad_norm": 4.180561065673828, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8888574242591858, + "num_tokens": 675627528.0, + "step": 17705 + }, + { + "epoch": 2.252385192723572, + "ewc_loss": 0.008399143815040588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399143553106114e-05, + "grad_norm": 4.172638416290283, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8879040479660034, + "num_tokens": 675663721.0, + "step": 17706 + }, + { + "epoch": 2.2525124030021626, + "ewc_loss": 0.008360679261386395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360679203178734e-05, + "grad_norm": 4.13029670715332, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8758211731910706, + "num_tokens": 675705187.0, + "step": 17707 + }, + { + "epoch": 2.252639613280753, + "ewc_loss": 0.008362224325537682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362223888980225e-05, + "grad_norm": 4.033631801605225, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8899846076965332, + "num_tokens": 675752713.0, + "step": 17708 + }, + { + "epoch": 2.2527668235593437, + "ewc_loss": 0.008318295702338219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318295294884592e-05, + "grad_norm": 4.172036170959473, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8808270692825317, + "num_tokens": 675791009.0, + "step": 17709 + }, + { + "epoch": 2.252894033837934, + "ewc_loss": 0.008439269848167896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439269731752574e-05, + "grad_norm": 4.131124973297119, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8857768774032593, + "num_tokens": 675828890.0, + "step": 17710 + }, + { + "epoch": 2.2530212441165247, + "ewc_loss": 0.008345749229192734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.345748938154429e-05, + "grad_norm": 4.184634685516357, + "learning_rate": 
1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8822950720787048, + "num_tokens": 675862855.0, + "step": 17711 + }, + { + "epoch": 2.2531484543951152, + "ewc_loss": 0.008387808687984943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38780906633474e-05, + "grad_norm": 4.118423938751221, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.878932774066925, + "num_tokens": 675903032.0, + "step": 17712 + }, + { + "epoch": 2.2532756646737058, + "ewc_loss": 0.008354238234460354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354237797902897e-05, + "grad_norm": 4.1172919273376465, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8784410357475281, + "num_tokens": 675943035.0, + "step": 17713 + }, + { + "epoch": 2.2534028749522963, + "ewc_loss": 0.008355571888387203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355571480933577e-05, + "grad_norm": 4.094249725341797, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8838164806365967, + "num_tokens": 675983073.0, + "step": 17714 + }, + { + "epoch": 2.253530085230887, + "ewc_loss": 0.008343597874045372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343597437487915e-05, + "grad_norm": 4.1125898361206055, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8860516548156738, + "num_tokens": 676022341.0, + "step": 17715 + }, + { + "epoch": 2.2536572955094774, + "ewc_loss": 0.008347280323505402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.347280527232215e-05, + "grad_norm": 4.134394645690918, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8791861534118652, + "num_tokens": 676060907.0, + "step": 17716 + }, + { + "epoch": 2.253784505788068, + "ewc_loss": 0.008371969684958458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371969306608662e-05, + "grad_norm": 4.138073444366455, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8806147575378418, + "num_tokens": 676100693.0, + "step": 17717 + 
}, + { + "epoch": 2.253911716066658, + "ewc_loss": 0.008350061252713203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350061398232356e-05, + "grad_norm": 4.098579406738281, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8948287963867188, + "num_tokens": 676137526.0, + "step": 17718 + }, + { + "epoch": 2.254038926345249, + "ewc_loss": 0.008317057974636555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.31705765449442e-05, + "grad_norm": 4.119329929351807, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.883986234664917, + "num_tokens": 676177245.0, + "step": 17719 + }, + { + "epoch": 2.254166136623839, + "ewc_loss": 0.008360980078577995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360980427823961e-05, + "grad_norm": 4.141715049743652, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8888821601867676, + "num_tokens": 676214568.0, + "step": 17720 + }, + { + "epoch": 2.2542933469024296, + "ewc_loss": 0.008362416177988052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362415974261239e-05, + "grad_norm": 4.184717178344727, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8827371597290039, + "num_tokens": 676252552.0, + "step": 17721 + }, + { + "epoch": 2.25442055718102, + "ewc_loss": 0.008346828632056713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346828690264374e-05, + "grad_norm": 4.163181781768799, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8654414415359497, + "num_tokens": 676290480.0, + "step": 17722 + }, + { + "epoch": 2.2545477674596106, + "ewc_loss": 0.008330870419740677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.330870332429186e-05, + "grad_norm": 4.158472061157227, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8929917812347412, + "num_tokens": 676324950.0, + "step": 17723 + }, + { + "epoch": 2.254674977738201, + "ewc_loss": 0.008360152132809162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.360152423847467e-05, + "grad_norm": 4.119050979614258, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8739592432975769, + "num_tokens": 676363851.0, + "step": 17724 + }, + { + "epoch": 2.2548021880167917, + "ewc_loss": 0.008324363268911839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324363443534821e-05, + "grad_norm": 4.121647357940674, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8909634947776794, + "num_tokens": 676399920.0, + "step": 17725 + }, + { + "epoch": 2.254929398295382, + "ewc_loss": 0.008352929726243019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352929580723867e-05, + "grad_norm": 4.090658664703369, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.879726767539978, + "num_tokens": 676443243.0, + "step": 17726 + }, + { + "epoch": 2.2550566085739727, + "ewc_loss": 0.008325435221195221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32543519209139e-05, + "grad_norm": 4.177873134613037, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8688304424285889, + "num_tokens": 676480272.0, + "step": 17727 + }, + { + "epoch": 2.2551838188525632, + "ewc_loss": 0.008394383825361729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.394383621634915e-05, + "grad_norm": 4.138888835906982, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8771375417709351, + "num_tokens": 676516630.0, + "step": 17728 + }, + { + "epoch": 2.2553110291311538, + "ewc_loss": 0.008354198187589645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354197780136019e-05, + "grad_norm": 4.1096978187561035, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8880906105041504, + "num_tokens": 676557842.0, + "step": 17729 + }, + { + "epoch": 2.2554382394097443, + "ewc_loss": 0.008357813581824303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357813203474507e-05, + "grad_norm": 4.092894554138184, + "learning_rate": 1e-06, + "loss": 0.3118, + 
"mean_token_accuracy": 0.8906046152114868, + "num_tokens": 676599413.0, + "step": 17730 + }, + { + "epoch": 2.255565449688335, + "ewc_loss": 0.00836064014583826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360639913007617e-05, + "grad_norm": 4.185708045959473, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8741801977157593, + "num_tokens": 676637791.0, + "step": 17731 + }, + { + "epoch": 2.2556926599669254, + "ewc_loss": 0.008424127474427223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.424127736361697e-05, + "grad_norm": 4.131316661834717, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8782126903533936, + "num_tokens": 676677108.0, + "step": 17732 + }, + { + "epoch": 2.255819870245516, + "ewc_loss": 0.008358935825526714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358935883734375e-05, + "grad_norm": 4.102898597717285, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8833117485046387, + "num_tokens": 676720698.0, + "step": 17733 + }, + { + "epoch": 2.2559470805241064, + "ewc_loss": 0.008378071710467339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378071652259678e-05, + "grad_norm": 4.1642889976501465, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.872437596321106, + "num_tokens": 676763719.0, + "step": 17734 + }, + { + "epoch": 2.256074290802697, + "ewc_loss": 0.00841047614812851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410475857090205e-05, + "grad_norm": 4.182153224945068, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.882094144821167, + "num_tokens": 676801083.0, + "step": 17735 + }, + { + "epoch": 2.2562015010812875, + "ewc_loss": 0.008391565643250942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391565643250942e-05, + "grad_norm": 4.222280979156494, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.874861478805542, + "num_tokens": 676840861.0, + "step": 17736 + }, + { + "epoch": 
2.256328711359878, + "ewc_loss": 0.008396245539188385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396245539188385e-05, + "grad_norm": 4.1588311195373535, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8925561904907227, + "num_tokens": 676876864.0, + "step": 17737 + }, + { + "epoch": 2.2564559216384685, + "ewc_loss": 0.008346064947545528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346064714714885e-05, + "grad_norm": 4.027824401855469, + "learning_rate": 1e-06, + "loss": 0.2823, + "mean_token_accuracy": 0.9023042917251587, + "num_tokens": 676920983.0, + "step": 17738 + }, + { + "epoch": 2.256583131917059, + "ewc_loss": 0.008321329019963741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321329369209707e-05, + "grad_norm": 4.109925270080566, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8909609317779541, + "num_tokens": 676961294.0, + "step": 17739 + }, + { + "epoch": 2.2567103421956496, + "ewc_loss": 0.0083873076364398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387307752855122e-05, + "grad_norm": 4.18380069732666, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8746403455734253, + "num_tokens": 676996318.0, + "step": 17740 + }, + { + "epoch": 2.2568375524742397, + "ewc_loss": 0.00836408231407404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.364082168554887e-05, + "grad_norm": 4.131288051605225, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8838474750518799, + "num_tokens": 677034083.0, + "step": 17741 + }, + { + "epoch": 2.2569647627528306, + "ewc_loss": 0.008316726423799992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316726598422974e-05, + "grad_norm": 4.126184940338135, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8844500780105591, + "num_tokens": 677071206.0, + "step": 17742 + }, + { + "epoch": 2.2570919730314207, + "ewc_loss": 0.00834400113672018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.344001253135502e-05, + "grad_norm": 4.1826558113098145, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8865534663200378, + "num_tokens": 677103573.0, + "step": 17743 + }, + { + "epoch": 2.2572191833100113, + "ewc_loss": 0.008378779515624046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378779602935538e-05, + "grad_norm": 4.122707366943359, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8918219804763794, + "num_tokens": 677141406.0, + "step": 17744 + }, + { + "epoch": 2.257346393588602, + "ewc_loss": 0.00831865705549717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318656909978017e-05, + "grad_norm": 4.0702972412109375, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8727025985717773, + "num_tokens": 677185968.0, + "step": 17745 + }, + { + "epoch": 2.2574736038671923, + "ewc_loss": 0.008308010175824165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.308010001201183e-05, + "grad_norm": 4.129518508911133, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.874472975730896, + "num_tokens": 677229004.0, + "step": 17746 + }, + { + "epoch": 2.257600814145783, + "ewc_loss": 0.008358570747077465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358570630662143e-05, + "grad_norm": 4.093440532684326, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8900278806686401, + "num_tokens": 677266317.0, + "step": 17747 + }, + { + "epoch": 2.2577280244243734, + "ewc_loss": 0.008299467153847218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29946729936637e-05, + "grad_norm": 4.136022090911865, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8923290967941284, + "num_tokens": 677302344.0, + "step": 17748 + }, + { + "epoch": 2.257855234702964, + "ewc_loss": 0.008352403528988361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352403528988361e-05, + "grad_norm": 4.164193630218506, + "learning_rate": 1e-06, + "loss": 0.3536, + 
"mean_token_accuracy": 0.8724462985992432, + "num_tokens": 677342103.0, + "step": 17749 + }, + { + "epoch": 2.2579824449815544, + "ewc_loss": 0.008343778550624847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343778608832508e-05, + "grad_norm": 4.199911594390869, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8708194494247437, + "num_tokens": 677378602.0, + "step": 17750 + }, + { + "epoch": 2.258109655260145, + "ewc_loss": 0.008344452828168869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344453090103343e-05, + "grad_norm": 4.16420841217041, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8818755745887756, + "num_tokens": 677415005.0, + "step": 17751 + }, + { + "epoch": 2.2582368655387355, + "ewc_loss": 0.008320173248648643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.320173219544813e-05, + "grad_norm": 4.129676342010498, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8632652163505554, + "num_tokens": 677457908.0, + "step": 17752 + }, + { + "epoch": 2.258364075817326, + "ewc_loss": 0.00834081880748272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.340818749275059e-05, + "grad_norm": 4.186960220336914, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8711156845092773, + "num_tokens": 677492759.0, + "step": 17753 + }, + { + "epoch": 2.2584912860959165, + "ewc_loss": 0.008358736522495747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358736522495747e-05, + "grad_norm": 4.103736400604248, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8802188634872437, + "num_tokens": 677535771.0, + "step": 17754 + }, + { + "epoch": 2.258618496374507, + "ewc_loss": 0.008321438916027546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321439236169681e-05, + "grad_norm": 4.116196632385254, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8942946195602417, + "num_tokens": 677575325.0, + "step": 17755 + }, + { + "epoch": 
2.2587457066530976, + "ewc_loss": 0.00835157185792923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35157188703306e-05, + "grad_norm": 4.11188268661499, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8816946148872375, + "num_tokens": 677614132.0, + "step": 17756 + }, + { + "epoch": 2.258872916931688, + "ewc_loss": 0.008362104184925556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362103835679591e-05, + "grad_norm": 4.2198805809021, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8768515586853027, + "num_tokens": 677645657.0, + "step": 17757 + }, + { + "epoch": 2.2590001272102787, + "ewc_loss": 0.008417979814112186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.417979552177712e-05, + "grad_norm": 4.104143142700195, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8895580172538757, + "num_tokens": 677686238.0, + "step": 17758 + }, + { + "epoch": 2.259127337488869, + "ewc_loss": 0.008338448591530323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.338448969880119e-05, + "grad_norm": 4.132397651672363, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8863810896873474, + "num_tokens": 677724080.0, + "step": 17759 + }, + { + "epoch": 2.2592545477674597, + "ewc_loss": 0.008401107974350452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401108061661944e-05, + "grad_norm": 4.140307426452637, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8814259767532349, + "num_tokens": 677765768.0, + "step": 17760 + }, + { + "epoch": 2.2593817580460502, + "ewc_loss": 0.008396720513701439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396720659220591e-05, + "grad_norm": 4.153834342956543, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8833898901939392, + "num_tokens": 677803937.0, + "step": 17761 + }, + { + "epoch": 2.2595089683246408, + "ewc_loss": 0.008402631618082523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.402631647186354e-05, + "grad_norm": 4.111305236816406, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8909045457839966, + "num_tokens": 677839685.0, + "step": 17762 + }, + { + "epoch": 2.2596361786032313, + "ewc_loss": 0.008373712189495564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37371262605302e-05, + "grad_norm": 4.163741588592529, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8682423233985901, + "num_tokens": 677880411.0, + "step": 17763 + }, + { + "epoch": 2.259763388881822, + "ewc_loss": 0.008410487323999405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410487498622388e-05, + "grad_norm": 4.106504917144775, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8887494802474976, + "num_tokens": 677921177.0, + "step": 17764 + }, + { + "epoch": 2.2598905991604123, + "ewc_loss": 0.008375369012355804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375368634006009e-05, + "grad_norm": 4.133979797363281, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8911075592041016, + "num_tokens": 677956393.0, + "step": 17765 + }, + { + "epoch": 2.2600178094390024, + "ewc_loss": 0.008409768342971802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409768634010106e-05, + "grad_norm": 4.207252502441406, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8792723417282104, + "num_tokens": 677988364.0, + "step": 17766 + }, + { + "epoch": 2.2601450197175934, + "ewc_loss": 0.008443345315754414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44334572320804e-05, + "grad_norm": 4.1445631980896, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8894151449203491, + "num_tokens": 678028093.0, + "step": 17767 + }, + { + "epoch": 2.2602722299961835, + "ewc_loss": 0.008381213992834091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.381214138353243e-05, + "grad_norm": 4.149181365966797, + "learning_rate": 1e-06, + "loss": 0.3345, + 
"mean_token_accuracy": 0.8826969861984253, + "num_tokens": 678068174.0, + "step": 17768 + }, + { + "epoch": 2.260399440274774, + "ewc_loss": 0.008408824913203716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.408824942307547e-05, + "grad_norm": 4.104644775390625, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8825638294219971, + "num_tokens": 678105784.0, + "step": 17769 + }, + { + "epoch": 2.2605266505533645, + "ewc_loss": 0.008400441147387028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400441583944485e-05, + "grad_norm": 4.118938446044922, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8939236402511597, + "num_tokens": 678142873.0, + "step": 17770 + }, + { + "epoch": 2.260653860831955, + "ewc_loss": 0.008411623537540436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.411624003201723e-05, + "grad_norm": 4.166998386383057, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.879725992679596, + "num_tokens": 678181527.0, + "step": 17771 + }, + { + "epoch": 2.2607810711105456, + "ewc_loss": 0.008444544859230518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.444544801022857e-05, + "grad_norm": 4.077272415161133, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8861213326454163, + "num_tokens": 678221225.0, + "step": 17772 + }, + { + "epoch": 2.260908281389136, + "ewc_loss": 0.008362827822566032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362827793462202e-05, + "grad_norm": 4.155191421508789, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8897167444229126, + "num_tokens": 678254767.0, + "step": 17773 + }, + { + "epoch": 2.2610354916677267, + "ewc_loss": 0.008437124080955982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437124051852152e-05, + "grad_norm": 4.178834438323975, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8850038051605225, + "num_tokens": 678290830.0, + "step": 17774 + }, + { + "epoch": 
2.261162701946317, + "ewc_loss": 0.008417049422860146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.417048957198858e-05, + "grad_norm": 4.087789058685303, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8925189971923828, + "num_tokens": 678327102.0, + "step": 17775 + }, + { + "epoch": 2.2612899122249077, + "ewc_loss": 0.008343519642949104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343519584741443e-05, + "grad_norm": 4.121666431427002, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8941558599472046, + "num_tokens": 678364872.0, + "step": 17776 + }, + { + "epoch": 2.2614171225034982, + "ewc_loss": 0.008410109207034111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41010914882645e-05, + "grad_norm": 4.186870098114014, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8756424784660339, + "num_tokens": 678399233.0, + "step": 17777 + }, + { + "epoch": 2.2615443327820888, + "ewc_loss": 0.008430580608546734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.430580783169717e-05, + "grad_norm": 4.157925128936768, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8898512125015259, + "num_tokens": 678434577.0, + "step": 17778 + }, + { + "epoch": 2.2616715430606793, + "ewc_loss": 0.008363308385014534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3633087342605e-05, + "grad_norm": 4.163780689239502, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8887420892715454, + "num_tokens": 678466835.0, + "step": 17779 + }, + { + "epoch": 2.26179875333927, + "ewc_loss": 0.008409455418586731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409455040236935e-05, + "grad_norm": 4.109569549560547, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8969587087631226, + "num_tokens": 678506817.0, + "step": 17780 + }, + { + "epoch": 2.2619259636178604, + "ewc_loss": 0.008374033495783806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.374033495783806e-05, + "grad_norm": 4.146012783050537, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.883873701095581, + "num_tokens": 678543734.0, + "step": 17781 + }, + { + "epoch": 2.262053173896451, + "ewc_loss": 0.008406256325542927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40625652926974e-05, + "grad_norm": 4.122625827789307, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8791593313217163, + "num_tokens": 678585066.0, + "step": 17782 + }, + { + "epoch": 2.2621803841750414, + "ewc_loss": 0.008360943756997585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360943320440128e-05, + "grad_norm": 4.152226448059082, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8952236175537109, + "num_tokens": 678614951.0, + "step": 17783 + }, + { + "epoch": 2.262307594453632, + "ewc_loss": 0.008413463830947876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41346409288235e-05, + "grad_norm": 4.169373035430908, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8836774230003357, + "num_tokens": 678649671.0, + "step": 17784 + }, + { + "epoch": 2.2624348047322225, + "ewc_loss": 0.008376472629606724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37647239677608e-05, + "grad_norm": 4.023554801940918, + "learning_rate": 1e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.8989748954772949, + "num_tokens": 678693418.0, + "step": 17785 + }, + { + "epoch": 2.262562015010813, + "ewc_loss": 0.008315135724842548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315135346492752e-05, + "grad_norm": 4.132506847381592, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8924486637115479, + "num_tokens": 678730891.0, + "step": 17786 + }, + { + "epoch": 2.2626892252894035, + "ewc_loss": 0.008430736139416695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43073648866266e-05, + "grad_norm": 4.144132614135742, + "learning_rate": 1e-06, + "loss": 0.309, + 
"mean_token_accuracy": 0.8921377658843994, + "num_tokens": 678766369.0, + "step": 17787 + }, + { + "epoch": 2.262816435567994, + "ewc_loss": 0.008369811810553074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369811985176057e-05, + "grad_norm": 4.101863861083984, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8725054264068604, + "num_tokens": 678809118.0, + "step": 17788 + }, + { + "epoch": 2.2629436458465846, + "ewc_loss": 0.008353102020919323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.353102020919323e-05, + "grad_norm": 4.130919456481934, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8812692165374756, + "num_tokens": 678850057.0, + "step": 17789 + }, + { + "epoch": 2.263070856125175, + "ewc_loss": 0.008401153609156609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401153172599152e-05, + "grad_norm": 4.129290580749512, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8814301490783691, + "num_tokens": 678890253.0, + "step": 17790 + }, + { + "epoch": 2.263198066403765, + "ewc_loss": 0.008364995941519737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.364996028831229e-05, + "grad_norm": 4.156495571136475, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8968151807785034, + "num_tokens": 678922535.0, + "step": 17791 + }, + { + "epoch": 2.263325276682356, + "ewc_loss": 0.008383290842175484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.383290696656331e-05, + "grad_norm": 4.161891937255859, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8813854455947876, + "num_tokens": 678956136.0, + "step": 17792 + }, + { + "epoch": 2.2634524869609463, + "ewc_loss": 0.008376053534448147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.376053301617503e-05, + "grad_norm": 4.155737400054932, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8865473866462708, + "num_tokens": 678990864.0, + "step": 17793 + }, + { + "epoch": 
2.263579697239537, + "ewc_loss": 0.008369593881070614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369593706447631e-05, + "grad_norm": 4.125924110412598, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8928108215332031, + "num_tokens": 679028345.0, + "step": 17794 + }, + { + "epoch": 2.2637069075181273, + "ewc_loss": 0.008355454541742802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355454338015988e-05, + "grad_norm": 4.104455947875977, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8703473210334778, + "num_tokens": 679073679.0, + "step": 17795 + }, + { + "epoch": 2.263834117796718, + "ewc_loss": 0.008348957635462284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348957635462284e-05, + "grad_norm": 4.154149532318115, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8834618330001831, + "num_tokens": 679109274.0, + "step": 17796 + }, + { + "epoch": 2.2639613280753084, + "ewc_loss": 0.008372901938855648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3729020843748e-05, + "grad_norm": 4.181241989135742, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8833260536193848, + "num_tokens": 679141644.0, + "step": 17797 + }, + { + "epoch": 2.264088538353899, + "ewc_loss": 0.008371170610189438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371170406462625e-05, + "grad_norm": 4.104532718658447, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8677716255187988, + "num_tokens": 679184351.0, + "step": 17798 + }, + { + "epoch": 2.2642157486324894, + "ewc_loss": 0.00832349807024002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.323498332174495e-05, + "grad_norm": 4.116910934448242, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.877912700176239, + "num_tokens": 679222509.0, + "step": 17799 + }, + { + "epoch": 2.26434295891108, + "ewc_loss": 0.008373184129595757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.373183663934469e-05, + "grad_norm": 4.172360897064209, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8674334287643433, + "num_tokens": 679263362.0, + "step": 17800 + }, + { + "epoch": 2.2644701691896705, + "ewc_loss": 0.00838444847613573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.384448301512748e-05, + "grad_norm": 4.100151538848877, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8696788549423218, + "num_tokens": 679307859.0, + "step": 17801 + }, + { + "epoch": 2.264597379468261, + "ewc_loss": 0.00833809468895197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.338094630744308e-05, + "grad_norm": 4.1471662521362305, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8928531408309937, + "num_tokens": 679344801.0, + "step": 17802 + }, + { + "epoch": 2.2647245897468515, + "ewc_loss": 0.008384651504456997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.384651300730184e-05, + "grad_norm": 4.139644622802734, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8763564229011536, + "num_tokens": 679382795.0, + "step": 17803 + }, + { + "epoch": 2.264851800025442, + "ewc_loss": 0.008361531421542168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.361531217815354e-05, + "grad_norm": 4.132326126098633, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.88047856092453, + "num_tokens": 679425282.0, + "step": 17804 + }, + { + "epoch": 2.2649790103040326, + "ewc_loss": 0.008350311778485775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350311691174284e-05, + "grad_norm": 4.147488117218018, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8694397807121277, + "num_tokens": 679466881.0, + "step": 17805 + }, + { + "epoch": 2.265106220582623, + "ewc_loss": 0.008387135341763496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387135312659666e-05, + "grad_norm": 4.085902214050293, + "learning_rate": 1e-06, + "loss": 0.2998, + 
"mean_token_accuracy": 0.8941395282745361, + "num_tokens": 679509268.0, + "step": 17806 + }, + { + "epoch": 2.2652334308612136, + "ewc_loss": 0.008338231593370438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.338231418747455e-05, + "grad_norm": 4.13991117477417, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8810853958129883, + "num_tokens": 679545082.0, + "step": 17807 + }, + { + "epoch": 2.265360641139804, + "ewc_loss": 0.00837655644863844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.376556797884405e-05, + "grad_norm": 4.175928115844727, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8846137523651123, + "num_tokens": 679581795.0, + "step": 17808 + }, + { + "epoch": 2.2654878514183947, + "ewc_loss": 0.00838545337319374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.385453111259267e-05, + "grad_norm": 4.1254658699035645, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8809611797332764, + "num_tokens": 679618978.0, + "step": 17809 + }, + { + "epoch": 2.2656150616969852, + "ewc_loss": 0.008343679830431938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343679655808955e-05, + "grad_norm": 4.140506744384766, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8845100998878479, + "num_tokens": 679655002.0, + "step": 17810 + }, + { + "epoch": 2.2657422719755758, + "ewc_loss": 0.00835773441940546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357734623132274e-05, + "grad_norm": 4.149630069732666, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.8954908847808838, + "num_tokens": 679691143.0, + "step": 17811 + }, + { + "epoch": 2.2658694822541663, + "ewc_loss": 0.008360397070646286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360396896023303e-05, + "grad_norm": 4.1169257164001465, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8838543891906738, + "num_tokens": 679729168.0, + "step": 17812 + }, + { + "epoch": 
2.265996692532757, + "ewc_loss": 0.008331510238349438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.331510616699234e-05, + "grad_norm": 4.104857921600342, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.881202220916748, + "num_tokens": 679773385.0, + "step": 17813 + }, + { + "epoch": 2.2661239028113473, + "ewc_loss": 0.008350050076842308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350049756700173e-05, + "grad_norm": 4.101624488830566, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8943526744842529, + "num_tokens": 679813884.0, + "step": 17814 + }, + { + "epoch": 2.266251113089938, + "ewc_loss": 0.008338716812431812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.338716725120321e-05, + "grad_norm": 4.116761684417725, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8836749792098999, + "num_tokens": 679850834.0, + "step": 17815 + }, + { + "epoch": 2.266378323368528, + "ewc_loss": 0.00835542194545269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355422323802486e-05, + "grad_norm": 4.1636223793029785, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.888260543346405, + "num_tokens": 679890210.0, + "step": 17816 + }, + { + "epoch": 2.266505533647119, + "ewc_loss": 0.008364558219909668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.364558016182855e-05, + "grad_norm": 4.169814586639404, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8911013603210449, + "num_tokens": 679926252.0, + "step": 17817 + }, + { + "epoch": 2.266632743925709, + "ewc_loss": 0.008341101929545403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34110178402625e-05, + "grad_norm": 4.140253067016602, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8808290362358093, + "num_tokens": 679959626.0, + "step": 17818 + }, + { + "epoch": 2.2667599542042995, + "ewc_loss": 0.008316229097545147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.316228922922164e-05, + "grad_norm": 4.085458755493164, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.877869188785553, + "num_tokens": 679999973.0, + "step": 17819 + }, + { + "epoch": 2.26688716448289, + "ewc_loss": 0.008321581408381462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.321581117343158e-05, + "grad_norm": 4.136316299438477, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8776234984397888, + "num_tokens": 680041081.0, + "step": 17820 + }, + { + "epoch": 2.2670143747614806, + "ewc_loss": 0.008365177549421787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.365177200175822e-05, + "grad_norm": 4.139493465423584, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8880279064178467, + "num_tokens": 680078953.0, + "step": 17821 + }, + { + "epoch": 2.267141585040071, + "ewc_loss": 0.008345863781869411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.345863898284733e-05, + "grad_norm": 4.143962860107422, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8917205333709717, + "num_tokens": 680112309.0, + "step": 17822 + }, + { + "epoch": 2.2672687953186617, + "ewc_loss": 0.008369462564587593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369462739210576e-05, + "grad_norm": 4.170060634613037, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8788356781005859, + "num_tokens": 680148607.0, + "step": 17823 + }, + { + "epoch": 2.267396005597252, + "ewc_loss": 0.00837419182062149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.374191384064034e-05, + "grad_norm": 4.066888809204102, + "learning_rate": 1e-06, + "loss": 0.2887, + "mean_token_accuracy": 0.8990533947944641, + "num_tokens": 680189617.0, + "step": 17824 + }, + { + "epoch": 2.2675232158758427, + "ewc_loss": 0.008302655071020126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.302654896397144e-05, + "grad_norm": 4.07976770401001, + "learning_rate": 1e-06, + "loss": 0.2924, + 
"mean_token_accuracy": 0.8964664936065674, + "num_tokens": 680227940.0, + "step": 17825 + }, + { + "epoch": 2.2676504261544332, + "ewc_loss": 0.008360432460904121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360432548215613e-05, + "grad_norm": 4.159551620483398, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8932842016220093, + "num_tokens": 680263473.0, + "step": 17826 + }, + { + "epoch": 2.2677776364330238, + "ewc_loss": 0.008375531993806362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375531615456566e-05, + "grad_norm": 4.199544906616211, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8764022588729858, + "num_tokens": 680297895.0, + "step": 17827 + }, + { + "epoch": 2.2679048467116143, + "ewc_loss": 0.008358013816177845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358014019904658e-05, + "grad_norm": 4.1255974769592285, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.886211633682251, + "num_tokens": 680337658.0, + "step": 17828 + }, + { + "epoch": 2.268032056990205, + "ewc_loss": 0.008329839445650578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.329839329235256e-05, + "grad_norm": 4.167942047119141, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8788511753082275, + "num_tokens": 680375912.0, + "step": 17829 + }, + { + "epoch": 2.2681592672687954, + "ewc_loss": 0.00837785191833973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377851918339729e-05, + "grad_norm": 4.148612976074219, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8959133625030518, + "num_tokens": 680415231.0, + "step": 17830 + }, + { + "epoch": 2.268286477547386, + "ewc_loss": 0.008350996300578117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350996358785778e-05, + "grad_norm": 4.102096080780029, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8785474300384521, + "num_tokens": 680460037.0, + "step": 17831 + }, + { + "epoch": 
2.2684136878259764, + "ewc_loss": 0.008308915421366692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.308915130328387e-05, + "grad_norm": 4.135664939880371, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8628291487693787, + "num_tokens": 680505521.0, + "step": 17832 + }, + { + "epoch": 2.268540898104567, + "ewc_loss": 0.008344590663909912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344590605702251e-05, + "grad_norm": 4.129570484161377, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8971589803695679, + "num_tokens": 680547682.0, + "step": 17833 + }, + { + "epoch": 2.2686681083831575, + "ewc_loss": 0.008313787169754505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.313787111546844e-05, + "grad_norm": 4.09943962097168, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8891224265098572, + "num_tokens": 680587510.0, + "step": 17834 + }, + { + "epoch": 2.268795318661748, + "ewc_loss": 0.008287993259727955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.287993114208803e-05, + "grad_norm": 4.108890533447266, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8862491846084595, + "num_tokens": 680624233.0, + "step": 17835 + }, + { + "epoch": 2.2689225289403385, + "ewc_loss": 0.008318544365465641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318544132634997e-05, + "grad_norm": 4.132970333099365, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8786792159080505, + "num_tokens": 680666091.0, + "step": 17836 + }, + { + "epoch": 2.269049739218929, + "ewc_loss": 0.00831241998821497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312419959111139e-05, + "grad_norm": 4.147943019866943, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8889574408531189, + "num_tokens": 680703062.0, + "step": 17837 + }, + { + "epoch": 2.2691769494975196, + "ewc_loss": 0.008298722095787525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.298722241306677e-05, + "grad_norm": 4.1610260009765625, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8956918716430664, + "num_tokens": 680735966.0, + "step": 17838 + }, + { + "epoch": 2.2693041597761097, + "ewc_loss": 0.008315867744386196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.315867307828739e-05, + "grad_norm": 4.078434944152832, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8844090700149536, + "num_tokens": 680780010.0, + "step": 17839 + }, + { + "epoch": 2.2694313700547006, + "ewc_loss": 0.008246462792158127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.246462675742805e-05, + "grad_norm": 4.125995635986328, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8814083337783813, + "num_tokens": 680819516.0, + "step": 17840 + }, + { + "epoch": 2.2695585803332907, + "ewc_loss": 0.008337341248989105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33734156913124e-05, + "grad_norm": 4.105759620666504, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8825006484985352, + "num_tokens": 680861374.0, + "step": 17841 + }, + { + "epoch": 2.2696857906118812, + "ewc_loss": 0.008268923498690128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.268923556897789e-05, + "grad_norm": 4.148561000823975, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8750039339065552, + "num_tokens": 680902447.0, + "step": 17842 + }, + { + "epoch": 2.2698130008904718, + "ewc_loss": 0.008322373032569885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.322373469127342e-05, + "grad_norm": 4.083949565887451, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8982780575752258, + "num_tokens": 680942376.0, + "step": 17843 + }, + { + "epoch": 2.2699402111690623, + "ewc_loss": 0.00827084481716156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.270844409707934e-05, + "grad_norm": 4.188182830810547, + "learning_rate": 1e-06, + "loss": 0.3283, + 
"mean_token_accuracy": 0.8835515975952148, + "num_tokens": 680977060.0, + "step": 17844 + }, + { + "epoch": 2.270067421447653, + "ewc_loss": 0.008348935283720493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34893507999368e-05, + "grad_norm": 4.155571460723877, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8694865107536316, + "num_tokens": 681014504.0, + "step": 17845 + }, + { + "epoch": 2.2701946317262434, + "ewc_loss": 0.008296184241771698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.29618438729085e-05, + "grad_norm": 4.132112979888916, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8681880235671997, + "num_tokens": 681053919.0, + "step": 17846 + }, + { + "epoch": 2.270321842004834, + "ewc_loss": 0.008311578072607517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.311578130815178e-05, + "grad_norm": 4.181447982788086, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8837692737579346, + "num_tokens": 681090883.0, + "step": 17847 + }, + { + "epoch": 2.2704490522834244, + "ewc_loss": 0.008365658111870289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.365658140974119e-05, + "grad_norm": 4.1111226081848145, + "learning_rate": 1e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.895165205001831, + "num_tokens": 681128819.0, + "step": 17848 + }, + { + "epoch": 2.270576262562015, + "ewc_loss": 0.008288822136819363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28882257337682e-05, + "grad_norm": 4.085921764373779, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8742947578430176, + "num_tokens": 681171434.0, + "step": 17849 + }, + { + "epoch": 2.2707034728406055, + "ewc_loss": 0.008322766050696373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.322766370838508e-05, + "grad_norm": 4.169208526611328, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.893399715423584, + "num_tokens": 681206297.0, + "step": 17850 + }, + { + "epoch": 
2.270830683119196, + "ewc_loss": 0.00836831796914339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.368318231077865e-05, + "grad_norm": 4.142542839050293, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.874984622001648, + "num_tokens": 681242521.0, + "step": 17851 + }, + { + "epoch": 2.2709578933977865, + "ewc_loss": 0.008327854797244072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.327854447998106e-05, + "grad_norm": 4.10082483291626, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8931910991668701, + "num_tokens": 681282474.0, + "step": 17852 + }, + { + "epoch": 2.271085103676377, + "ewc_loss": 0.008321890607476234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.32189034554176e-05, + "grad_norm": 4.171910285949707, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8814398646354675, + "num_tokens": 681317280.0, + "step": 17853 + }, + { + "epoch": 2.2712123139549676, + "ewc_loss": 0.00839259009808302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.392590098083019e-05, + "grad_norm": 4.195348262786865, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8815425634384155, + "num_tokens": 681349155.0, + "step": 17854 + }, + { + "epoch": 2.271339524233558, + "ewc_loss": 0.008349088951945305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34908860269934e-05, + "grad_norm": 4.193497657775879, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8817018866539001, + "num_tokens": 681380863.0, + "step": 17855 + }, + { + "epoch": 2.2714667345121486, + "ewc_loss": 0.008381670340895653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.381670340895653e-05, + "grad_norm": 4.091341972351074, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8855677843093872, + "num_tokens": 681421970.0, + "step": 17856 + }, + { + "epoch": 2.271593944790739, + "ewc_loss": 0.00830902997404337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.309030090458691e-05, + 
"grad_norm": 4.149708271026611, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8976396918296814, + "num_tokens": 681453511.0, + "step": 17857 + }, + { + "epoch": 2.2717211550693297, + "ewc_loss": 0.00840329471975565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.403294486925006e-05, + "grad_norm": 4.113633155822754, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8701963424682617, + "num_tokens": 681496293.0, + "step": 17858 + }, + { + "epoch": 2.2718483653479202, + "ewc_loss": 0.008353219367563725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.353219163836911e-05, + "grad_norm": 4.149299144744873, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8778045177459717, + "num_tokens": 681537074.0, + "step": 17859 + }, + { + "epoch": 2.2719755756265108, + "ewc_loss": 0.008390559814870358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3905593783129e-05, + "grad_norm": 4.129052639007568, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8769648671150208, + "num_tokens": 681578911.0, + "step": 17860 + }, + { + "epoch": 2.2721027859051013, + "ewc_loss": 0.008374943397939205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37494371808134e-05, + "grad_norm": 4.085633754730225, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8790175914764404, + "num_tokens": 681622134.0, + "step": 17861 + }, + { + "epoch": 2.272229996183692, + "ewc_loss": 0.008365096524357796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.365096437046304e-05, + "grad_norm": 4.20272970199585, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8851232528686523, + "num_tokens": 681655919.0, + "step": 17862 + }, + { + "epoch": 2.2723572064622823, + "ewc_loss": 0.008434941992163658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.434941992163658e-05, + "grad_norm": 4.141003608703613, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8760578632354736, 
+ "num_tokens": 681696539.0, + "step": 17863 + }, + { + "epoch": 2.2724844167408724, + "ewc_loss": 0.00835572648793459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355726458830759e-05, + "grad_norm": 4.172595024108887, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8829861879348755, + "num_tokens": 681734630.0, + "step": 17864 + }, + { + "epoch": 2.2726116270194634, + "ewc_loss": 0.008392279967665672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.392280142288655e-05, + "grad_norm": 4.097079277038574, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8701391816139221, + "num_tokens": 681779669.0, + "step": 17865 + }, + { + "epoch": 2.2727388372980535, + "ewc_loss": 0.00836078729480505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360786887351424e-05, + "grad_norm": 4.144696235656738, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8801048994064331, + "num_tokens": 681821920.0, + "step": 17866 + }, + { + "epoch": 2.272866047576644, + "ewc_loss": 0.008387980982661247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387980778934434e-05, + "grad_norm": 4.153545379638672, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8813248872756958, + "num_tokens": 681859239.0, + "step": 17867 + }, + { + "epoch": 2.2729932578552345, + "ewc_loss": 0.008379860781133175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.379860810237005e-05, + "grad_norm": 4.183574199676514, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8712460994720459, + "num_tokens": 681896569.0, + "step": 17868 + }, + { + "epoch": 2.273120468133825, + "ewc_loss": 0.008397693745791912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.397693454753608e-05, + "grad_norm": 4.143716812133789, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8970757126808167, + "num_tokens": 681932549.0, + "step": 17869 + }, + { + "epoch": 2.2732476784124156, + "ewc_loss": 0.00837311614304781, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373115997528657e-05, + "grad_norm": 4.140481472015381, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8794547319412231, + "num_tokens": 681973981.0, + "step": 17870 + }, + { + "epoch": 2.273374888691006, + "ewc_loss": 0.008361668325960636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.361668733414263e-05, + "grad_norm": 4.239108562469482, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8621509075164795, + "num_tokens": 682009512.0, + "step": 17871 + }, + { + "epoch": 2.2735020989695967, + "ewc_loss": 0.008442683145403862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442683611065149e-05, + "grad_norm": 4.166271686553955, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8815779685974121, + "num_tokens": 682048058.0, + "step": 17872 + }, + { + "epoch": 2.273629309248187, + "ewc_loss": 0.008348902687430382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348903065780178e-05, + "grad_norm": 4.185698509216309, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8761021494865417, + "num_tokens": 682084181.0, + "step": 17873 + }, + { + "epoch": 2.2737565195267777, + "ewc_loss": 0.00840078853070736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400788647122681e-05, + "grad_norm": 4.123371601104736, + "learning_rate": 1e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.9013615846633911, + "num_tokens": 682119613.0, + "step": 17874 + }, + { + "epoch": 2.2738837298053682, + "ewc_loss": 0.008356953039765358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35695318528451e-05, + "grad_norm": 4.077609539031982, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8838857412338257, + "num_tokens": 682159501.0, + "step": 17875 + }, + { + "epoch": 2.2740109400839588, + "ewc_loss": 0.00836595892906189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.365958638023585e-05, + "grad_norm": 4.122640132904053, + 
"learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8905530571937561, + "num_tokens": 682197323.0, + "step": 17876 + }, + { + "epoch": 2.2741381503625493, + "ewc_loss": 0.00839906558394432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399065700359643e-05, + "grad_norm": 4.188151836395264, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8846924304962158, + "num_tokens": 682231150.0, + "step": 17877 + }, + { + "epoch": 2.27426536064114, + "ewc_loss": 0.0084187351167202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418735524173826e-05, + "grad_norm": 4.126991271972656, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8844884037971497, + "num_tokens": 682269351.0, + "step": 17878 + }, + { + "epoch": 2.2743925709197303, + "ewc_loss": 0.008364852517843246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.364852692466229e-05, + "grad_norm": 4.1562933921813965, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.893046498298645, + "num_tokens": 682303633.0, + "step": 17879 + }, + { + "epoch": 2.274519781198321, + "ewc_loss": 0.00840433593839407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.404335676459596e-05, + "grad_norm": 4.137334823608398, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8816585540771484, + "num_tokens": 682340890.0, + "step": 17880 + }, + { + "epoch": 2.2746469914769114, + "ewc_loss": 0.00839206576347351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.392065501539037e-05, + "grad_norm": 4.130717754364014, + "learning_rate": 1e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8959497213363647, + "num_tokens": 682374253.0, + "step": 17881 + }, + { + "epoch": 2.274774201755502, + "ewc_loss": 0.008388741873204708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388741844100878e-05, + "grad_norm": 4.133228778839111, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8824687004089355, + "num_tokens": 682412675.0, + "step": 
17882 + }, + { + "epoch": 2.2749014120340925, + "ewc_loss": 0.008411352522671223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.411352609982714e-05, + "grad_norm": 4.198258876800537, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8943914175033569, + "num_tokens": 682447864.0, + "step": 17883 + }, + { + "epoch": 2.275028622312683, + "ewc_loss": 0.008443864993751049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443865226581693e-05, + "grad_norm": 4.130629539489746, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8828024864196777, + "num_tokens": 682486445.0, + "step": 17884 + }, + { + "epoch": 2.2751558325912735, + "ewc_loss": 0.00837867520749569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378675556741655e-05, + "grad_norm": 4.161814212799072, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8828584551811218, + "num_tokens": 682518173.0, + "step": 17885 + }, + { + "epoch": 2.275283042869864, + "ewc_loss": 0.008421968668699265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421968232141808e-05, + "grad_norm": 4.136475563049316, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8829016089439392, + "num_tokens": 682558627.0, + "step": 17886 + }, + { + "epoch": 2.2754102531484546, + "ewc_loss": 0.008396400138735771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396400517085567e-05, + "grad_norm": 4.095139026641846, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.887076199054718, + "num_tokens": 682600821.0, + "step": 17887 + }, + { + "epoch": 2.275537463427045, + "ewc_loss": 0.008396079763770103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396079647354782e-05, + "grad_norm": 4.154669284820557, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8702463507652283, + "num_tokens": 682643947.0, + "step": 17888 + }, + { + "epoch": 2.275664673705635, + "ewc_loss": 0.008419998921453953, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.419998630415648e-05, + "grad_norm": 4.1612467765808105, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8770098686218262, + "num_tokens": 682683045.0, + "step": 17889 + }, + { + "epoch": 2.275791883984226, + "ewc_loss": 0.008406538516283035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.406538108829409e-05, + "grad_norm": 4.165159225463867, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8836893439292908, + "num_tokens": 682721617.0, + "step": 17890 + }, + { + "epoch": 2.2759190942628162, + "ewc_loss": 0.008385634049773216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38563428260386e-05, + "grad_norm": 4.149320602416992, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8774334192276001, + "num_tokens": 682758339.0, + "step": 17891 + }, + { + "epoch": 2.2760463045414068, + "ewc_loss": 0.008405600674450397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40560023789294e-05, + "grad_norm": 4.163234233856201, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8892184495925903, + "num_tokens": 682800501.0, + "step": 17892 + }, + { + "epoch": 2.2761735148199973, + "ewc_loss": 0.008375830017030239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375829929718748e-05, + "grad_norm": 4.15433406829834, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8858078122138977, + "num_tokens": 682833231.0, + "step": 17893 + }, + { + "epoch": 2.276300725098588, + "ewc_loss": 0.008382068015635014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382068335777149e-05, + "grad_norm": 4.133082866668701, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8882043361663818, + "num_tokens": 682873945.0, + "step": 17894 + }, + { + "epoch": 2.2764279353771784, + "ewc_loss": 0.008367634378373623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.367634291062132e-05, + "grad_norm": 4.061179161071777, + "learning_rate": 1e-06, + "loss": 
0.3466, + "mean_token_accuracy": 0.8800570964813232, + "num_tokens": 682918329.0, + "step": 17895 + }, + { + "epoch": 2.276555145655769, + "ewc_loss": 0.008344405330717564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34440506878309e-05, + "grad_norm": 4.1361236572265625, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8883718252182007, + "num_tokens": 682956277.0, + "step": 17896 + }, + { + "epoch": 2.2766823559343594, + "ewc_loss": 0.008409421890974045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409422298427671e-05, + "grad_norm": 4.1715545654296875, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8861877918243408, + "num_tokens": 682990648.0, + "step": 17897 + }, + { + "epoch": 2.27680956621295, + "ewc_loss": 0.008396899327635765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3968996477779e-05, + "grad_norm": 4.113858222961426, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8969155550003052, + "num_tokens": 683029046.0, + "step": 17898 + }, + { + "epoch": 2.2769367764915405, + "ewc_loss": 0.008350903168320656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350903226528317e-05, + "grad_norm": 4.230419158935547, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8839235901832581, + "num_tokens": 683064541.0, + "step": 17899 + }, + { + "epoch": 2.277063986770131, + "ewc_loss": 0.008427009917795658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.427009743172675e-05, + "grad_norm": 4.146265983581543, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8825837969779968, + "num_tokens": 683105015.0, + "step": 17900 + }, + { + "epoch": 2.2771911970487215, + "ewc_loss": 0.00834550242871046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.345502283191308e-05, + "grad_norm": 4.163287162780762, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8768570423126221, + "num_tokens": 683142938.0, + "step": 17901 + }, + { + "epoch": 
2.277318407327312, + "ewc_loss": 0.008399391546845436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399391663260758e-05, + "grad_norm": 4.133911609649658, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8946415185928345, + "num_tokens": 683181593.0, + "step": 17902 + }, + { + "epoch": 2.2774456176059026, + "ewc_loss": 0.008358990773558617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358991181012243e-05, + "grad_norm": 4.1587395668029785, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8730272054672241, + "num_tokens": 683218641.0, + "step": 17903 + }, + { + "epoch": 2.277572827884493, + "ewc_loss": 0.008389003574848175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389003778574988e-05, + "grad_norm": 4.138429641723633, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8678921461105347, + "num_tokens": 683261479.0, + "step": 17904 + }, + { + "epoch": 2.2777000381630836, + "ewc_loss": 0.008364526554942131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.364526729565114e-05, + "grad_norm": 4.215054035186768, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8599134087562561, + "num_tokens": 683300602.0, + "step": 17905 + }, + { + "epoch": 2.277827248441674, + "ewc_loss": 0.008412573486566544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.412573515670374e-05, + "grad_norm": 4.174321174621582, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8831801414489746, + "num_tokens": 683337012.0, + "step": 17906 + }, + { + "epoch": 2.2779544587202647, + "ewc_loss": 0.008377774618566036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377774793189019e-05, + "grad_norm": 4.2178778648376465, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.890534520149231, + "num_tokens": 683368949.0, + "step": 17907 + }, + { + "epoch": 2.2780816689988552, + "ewc_loss": 0.00842350721359253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.423507097177207e-05, + "grad_norm": 4.106589317321777, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8816611170768738, + "num_tokens": 683408741.0, + "step": 17908 + }, + { + "epoch": 2.2782088792774458, + "ewc_loss": 0.008356090635061264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356090256711468e-05, + "grad_norm": 4.148946762084961, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8736969232559204, + "num_tokens": 683447843.0, + "step": 17909 + }, + { + "epoch": 2.2783360895560363, + "ewc_loss": 0.008422006852924824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.422006794717163e-05, + "grad_norm": 4.102565765380859, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8797064423561096, + "num_tokens": 683489695.0, + "step": 17910 + }, + { + "epoch": 2.278463299834627, + "ewc_loss": 0.008378942497074604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378942584386095e-05, + "grad_norm": 4.162276744842529, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8831852674484253, + "num_tokens": 683528381.0, + "step": 17911 + }, + { + "epoch": 2.2785905101132173, + "ewc_loss": 0.0084364153444767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43641537358053e-05, + "grad_norm": 4.117043972015381, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8845155835151672, + "num_tokens": 683567261.0, + "step": 17912 + }, + { + "epoch": 2.278717720391808, + "ewc_loss": 0.00838728342205286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387283742194995e-05, + "grad_norm": 4.280827522277832, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8727015852928162, + "num_tokens": 683601083.0, + "step": 17913 + }, + { + "epoch": 2.278844930670398, + "ewc_loss": 0.008501126430928707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501126285409555e-05, + "grad_norm": 4.16954231262207, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 
0.8759778738021851, + "num_tokens": 683643494.0, + "step": 17914 + }, + { + "epoch": 2.278972140948989, + "ewc_loss": 0.008372003212571144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.372003503609449e-05, + "grad_norm": 4.137954235076904, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8784002661705017, + "num_tokens": 683684267.0, + "step": 17915 + }, + { + "epoch": 2.279099351227579, + "ewc_loss": 0.008405175060033798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.405174594372511e-05, + "grad_norm": 4.1577043533325195, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8735896348953247, + "num_tokens": 683723292.0, + "step": 17916 + }, + { + "epoch": 2.2792265615061695, + "ewc_loss": 0.008407785557210445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40778520796448e-05, + "grad_norm": 4.076290130615234, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8935996294021606, + "num_tokens": 683761378.0, + "step": 17917 + }, + { + "epoch": 2.27935377178476, + "ewc_loss": 0.008370013907551765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.370014256797731e-05, + "grad_norm": 4.214087009429932, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8767398595809937, + "num_tokens": 683794838.0, + "step": 17918 + }, + { + "epoch": 2.2794809820633506, + "ewc_loss": 0.008461694233119488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.461694233119488e-05, + "grad_norm": 4.181807041168213, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.865954577922821, + "num_tokens": 683832307.0, + "step": 17919 + }, + { + "epoch": 2.279608192341941, + "ewc_loss": 0.008403493091464043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.403493120567873e-05, + "grad_norm": 4.12266731262207, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8760302662849426, + "num_tokens": 683871848.0, + "step": 17920 + }, + { + "epoch": 2.2797354026205316, + "ewc_loss": 
0.00840369239449501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.403692481806502e-05, + "grad_norm": 4.101881980895996, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8853184580802917, + "num_tokens": 683915086.0, + "step": 17921 + }, + { + "epoch": 2.279862612899122, + "ewc_loss": 0.008399773389101028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399773651035503e-05, + "grad_norm": 4.193073749542236, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8782958984375, + "num_tokens": 683945411.0, + "step": 17922 + }, + { + "epoch": 2.2799898231777127, + "ewc_loss": 0.008456330746412277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.456330397166312e-05, + "grad_norm": 4.121025085449219, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8826496601104736, + "num_tokens": 683979664.0, + "step": 17923 + }, + { + "epoch": 2.2801170334563032, + "ewc_loss": 0.008398817852139473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.398817590204999e-05, + "grad_norm": 4.102560043334961, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8714907169342041, + "num_tokens": 684025526.0, + "step": 17924 + }, + { + "epoch": 2.2802442437348938, + "ewc_loss": 0.008410698734223843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410698501393199e-05, + "grad_norm": 4.244210720062256, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8763704299926758, + "num_tokens": 684056992.0, + "step": 17925 + }, + { + "epoch": 2.2803714540134843, + "ewc_loss": 0.008517359383404255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51735967444256e-05, + "grad_norm": 4.13808536529541, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8865278363227844, + "num_tokens": 684094515.0, + "step": 17926 + }, + { + "epoch": 2.280498664292075, + "ewc_loss": 0.008401088416576385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401088416576385e-05, + "grad_norm": 
4.132816791534424, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8881616592407227, + "num_tokens": 684132294.0, + "step": 17927 + }, + { + "epoch": 2.2806258745706653, + "ewc_loss": 0.008439937606453896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439937664661556e-05, + "grad_norm": 4.172169208526611, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8854565620422363, + "num_tokens": 684164010.0, + "step": 17928 + }, + { + "epoch": 2.280753084849256, + "ewc_loss": 0.008473426103591919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473425987176597e-05, + "grad_norm": 4.152233123779297, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8901631832122803, + "num_tokens": 684202077.0, + "step": 17929 + }, + { + "epoch": 2.2808802951278464, + "ewc_loss": 0.008438682183623314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43868256197311e-05, + "grad_norm": 4.1626057624816895, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8582124710083008, + "num_tokens": 684240799.0, + "step": 17930 + }, + { + "epoch": 2.281007505406437, + "ewc_loss": 0.008491835556924343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491835615132004e-05, + "grad_norm": 4.135362148284912, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8797194957733154, + "num_tokens": 684284080.0, + "step": 17931 + }, + { + "epoch": 2.2811347156850275, + "ewc_loss": 0.008464951999485493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.464952406939119e-05, + "grad_norm": 4.21744441986084, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8695342540740967, + "num_tokens": 684317915.0, + "step": 17932 + }, + { + "epoch": 2.281261925963618, + "ewc_loss": 0.008519459515810013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519459515810013e-05, + "grad_norm": 4.0986127853393555, + "learning_rate": 1e-06, + "loss": 0.2335, + "mean_token_accuracy": 0.9179010391235352, + 
"num_tokens": 684354665.0, + "step": 17933 + }, + { + "epoch": 2.2813891362422085, + "ewc_loss": 0.008421272039413452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.42127192299813e-05, + "grad_norm": 4.150046348571777, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8799716234207153, + "num_tokens": 684392866.0, + "step": 17934 + }, + { + "epoch": 2.281516346520799, + "ewc_loss": 0.008467573672533035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46757393446751e-05, + "grad_norm": 4.152751445770264, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8856038451194763, + "num_tokens": 684428422.0, + "step": 17935 + }, + { + "epoch": 2.2816435567993896, + "ewc_loss": 0.008475649170577526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47564879222773e-05, + "grad_norm": 4.166574478149414, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8704537153244019, + "num_tokens": 684467129.0, + "step": 17936 + }, + { + "epoch": 2.2817707670779797, + "ewc_loss": 0.008460162207484245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460162644041702e-05, + "grad_norm": 4.238640785217285, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8766880035400391, + "num_tokens": 684500093.0, + "step": 17937 + }, + { + "epoch": 2.2818979773565706, + "ewc_loss": 0.008508660830557346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.508660539519042e-05, + "grad_norm": 4.147818565368652, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8734043836593628, + "num_tokens": 684541422.0, + "step": 17938 + }, + { + "epoch": 2.2820251876351607, + "ewc_loss": 0.008427603170275688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.427602733718231e-05, + "grad_norm": 4.1573486328125, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.881192147731781, + "num_tokens": 684579164.0, + "step": 17939 + }, + { + "epoch": 2.2821523979137512, + "ewc_loss": 0.00845482386648655, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454824273940176e-05, + "grad_norm": 4.136841297149658, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.867508053779602, + "num_tokens": 684618617.0, + "step": 17940 + }, + { + "epoch": 2.2822796081923418, + "ewc_loss": 0.008444197475910187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.444197737844661e-05, + "grad_norm": 4.191488742828369, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.86773681640625, + "num_tokens": 684655525.0, + "step": 17941 + }, + { + "epoch": 2.2824068184709323, + "ewc_loss": 0.008461689576506615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.461689139949158e-05, + "grad_norm": 4.166668891906738, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8805679678916931, + "num_tokens": 684694525.0, + "step": 17942 + }, + { + "epoch": 2.282534028749523, + "ewc_loss": 0.0084349000826478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.434899791609496e-05, + "grad_norm": 4.117584705352783, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8821748495101929, + "num_tokens": 684733247.0, + "step": 17943 + }, + { + "epoch": 2.2826612390281134, + "ewc_loss": 0.00839600246399641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396002522204071e-05, + "grad_norm": 4.201828956604004, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8935495018959045, + "num_tokens": 684763642.0, + "step": 17944 + }, + { + "epoch": 2.282788449306704, + "ewc_loss": 0.008453852497041225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.453852933598682e-05, + "grad_norm": 4.082796573638916, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8888790607452393, + "num_tokens": 684806164.0, + "step": 17945 + }, + { + "epoch": 2.2829156595852944, + "ewc_loss": 0.008372134529054165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.372134470846504e-05, + "grad_norm": 4.178031921386719, + "learning_rate": 
1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8808460235595703, + "num_tokens": 684844193.0, + "step": 17946 + }, + { + "epoch": 2.283042869863885, + "ewc_loss": 0.008468780666589737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.468781015835702e-05, + "grad_norm": 4.182566165924072, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8885957598686218, + "num_tokens": 684882103.0, + "step": 17947 + }, + { + "epoch": 2.2831700801424755, + "ewc_loss": 0.008415655232965946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415654883719981e-05, + "grad_norm": 4.123764514923096, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8965190649032593, + "num_tokens": 684917667.0, + "step": 17948 + }, + { + "epoch": 2.283297290421066, + "ewc_loss": 0.008361934684216976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.361935033462942e-05, + "grad_norm": 4.165655136108398, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8851351737976074, + "num_tokens": 684954007.0, + "step": 17949 + }, + { + "epoch": 2.2834245006996565, + "ewc_loss": 0.00842572096735239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.425721171079203e-05, + "grad_norm": 4.127691268920898, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8790010213851929, + "num_tokens": 684994693.0, + "step": 17950 + }, + { + "epoch": 2.283551710978247, + "ewc_loss": 0.008371840231120586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371840522158891e-05, + "grad_norm": 4.143901348114014, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8711561560630798, + "num_tokens": 685032713.0, + "step": 17951 + }, + { + "epoch": 2.2836789212568376, + "ewc_loss": 0.008390611037611961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390611037611961e-05, + "grad_norm": 4.202968120574951, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.876262903213501, + "num_tokens": 685066597.0, + "step": 17952 + }, + { 
+ "epoch": 2.283806131535428, + "ewc_loss": 0.00843396782875061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433967741439119e-05, + "grad_norm": 4.108316421508789, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8771533966064453, + "num_tokens": 685106243.0, + "step": 17953 + }, + { + "epoch": 2.2839333418140186, + "ewc_loss": 0.00835508294403553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355083264177665e-05, + "grad_norm": 4.148035049438477, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8739458322525024, + "num_tokens": 685145177.0, + "step": 17954 + }, + { + "epoch": 2.284060552092609, + "ewc_loss": 0.008427944965660572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.427944703726098e-05, + "grad_norm": 4.098170757293701, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8882293701171875, + "num_tokens": 685189130.0, + "step": 17955 + }, + { + "epoch": 2.2841877623711997, + "ewc_loss": 0.00838878657668829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388786227442324e-05, + "grad_norm": 4.1217546463012695, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8868628740310669, + "num_tokens": 685227536.0, + "step": 17956 + }, + { + "epoch": 2.28431497264979, + "ewc_loss": 0.008415643125772476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415643242187798e-05, + "grad_norm": 4.1356892585754395, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8869155645370483, + "num_tokens": 685267213.0, + "step": 17957 + }, + { + "epoch": 2.2844421829283807, + "ewc_loss": 0.008418973535299301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41897344798781e-05, + "grad_norm": 4.1898322105407715, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8791941404342651, + "num_tokens": 685301115.0, + "step": 17958 + }, + { + "epoch": 2.2845693932069713, + "ewc_loss": 0.008450283668935299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.450284076388925e-05, + "grad_norm": 4.17459774017334, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8865471482276917, + "num_tokens": 685338497.0, + "step": 17959 + }, + { + "epoch": 2.284696603485562, + "ewc_loss": 0.00843705702573061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437057113042101e-05, + "grad_norm": 4.183794975280762, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8727489709854126, + "num_tokens": 685374983.0, + "step": 17960 + }, + { + "epoch": 2.2848238137641523, + "ewc_loss": 0.008433527313172817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433527546003461e-05, + "grad_norm": 4.1950554847717285, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8873305320739746, + "num_tokens": 685408859.0, + "step": 17961 + }, + { + "epoch": 2.2849510240427424, + "ewc_loss": 0.008427353575825691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.427353895967826e-05, + "grad_norm": 4.158075332641602, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8779752254486084, + "num_tokens": 685445103.0, + "step": 17962 + }, + { + "epoch": 2.2850782343213334, + "ewc_loss": 0.008411649614572525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.411649469053373e-05, + "grad_norm": 4.106297969818115, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.876697301864624, + "num_tokens": 685484657.0, + "step": 17963 + }, + { + "epoch": 2.2852054445999235, + "ewc_loss": 0.008395574986934662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395574695896357e-05, + "grad_norm": 4.158193111419678, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8908764719963074, + "num_tokens": 685524034.0, + "step": 17964 + }, + { + "epoch": 2.285332654878514, + "ewc_loss": 0.008438077755272388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438077929895371e-05, + "grad_norm": 4.167967319488525, + "learning_rate": 1e-06, + "loss": 0.3259, + 
"mean_token_accuracy": 0.8864181041717529, + "num_tokens": 685560368.0, + "step": 17965 + }, + { + "epoch": 2.2854598651571045, + "ewc_loss": 0.008398030884563923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.398031059186906e-05, + "grad_norm": 4.153496265411377, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8793818950653076, + "num_tokens": 685598192.0, + "step": 17966 + }, + { + "epoch": 2.285587075435695, + "ewc_loss": 0.00840499997138977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40499997138977e-05, + "grad_norm": 4.1911797523498535, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8843427896499634, + "num_tokens": 685636300.0, + "step": 17967 + }, + { + "epoch": 2.2857142857142856, + "ewc_loss": 0.008419766090810299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.419765799771994e-05, + "grad_norm": 4.150193214416504, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8807015419006348, + "num_tokens": 685671457.0, + "step": 17968 + }, + { + "epoch": 2.285841495992876, + "ewc_loss": 0.008391957730054855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391957817366347e-05, + "grad_norm": 4.175821781158447, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8729193806648254, + "num_tokens": 685709687.0, + "step": 17969 + }, + { + "epoch": 2.2859687062714666, + "ewc_loss": 0.00844001118093729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44001115183346e-05, + "grad_norm": 4.203433036804199, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8603041768074036, + "num_tokens": 685747854.0, + "step": 17970 + }, + { + "epoch": 2.286095916550057, + "ewc_loss": 0.008413520641624928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413520845351741e-05, + "grad_norm": 4.100963115692139, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.9000915288925171, + "num_tokens": 685785215.0, + "step": 17971 + }, + { + "epoch": 
2.2862231268286477, + "ewc_loss": 0.00836104154586792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36104154586792e-05, + "grad_norm": 4.217442512512207, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8873604536056519, + "num_tokens": 685818871.0, + "step": 17972 + }, + { + "epoch": 2.2863503371072382, + "ewc_loss": 0.008445908315479755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.445908315479755e-05, + "grad_norm": 4.154721736907959, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8770961761474609, + "num_tokens": 685859089.0, + "step": 17973 + }, + { + "epoch": 2.2864775473858288, + "ewc_loss": 0.008366961032152176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36696126498282e-05, + "grad_norm": 4.284841060638428, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8733115792274475, + "num_tokens": 685900346.0, + "step": 17974 + }, + { + "epoch": 2.2866047576644193, + "ewc_loss": 0.00845676101744175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.456761133857071e-05, + "grad_norm": 4.135332107543945, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.882116973400116, + "num_tokens": 685938186.0, + "step": 17975 + }, + { + "epoch": 2.28673196794301, + "ewc_loss": 0.008351145312190056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.351145515916869e-05, + "grad_norm": 4.180264949798584, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.889075756072998, + "num_tokens": 685974911.0, + "step": 17976 + }, + { + "epoch": 2.2868591782216003, + "ewc_loss": 0.008439181372523308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439180965069681e-05, + "grad_norm": 4.153570652008057, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8834236860275269, + "num_tokens": 686010430.0, + "step": 17977 + }, + { + "epoch": 2.286986388500191, + "ewc_loss": 0.008380810730159283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.380810322705656e-05, 
+ "grad_norm": 4.160093784332275, + "learning_rate": 1e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.8948442935943604, + "num_tokens": 686042727.0, + "step": 17978 + }, + { + "epoch": 2.2871135987787814, + "ewc_loss": 0.00841832347214222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418323704972863e-05, + "grad_norm": 4.154548168182373, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8850634694099426, + "num_tokens": 686078819.0, + "step": 17979 + }, + { + "epoch": 2.287240809057372, + "ewc_loss": 0.008424746803939342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.424746920354664e-05, + "grad_norm": 4.124252796173096, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8848035335540771, + "num_tokens": 686117760.0, + "step": 17980 + }, + { + "epoch": 2.2873680193359625, + "ewc_loss": 0.008397867903113365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.397868077736348e-05, + "grad_norm": 4.178539752960205, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8730103969573975, + "num_tokens": 686152992.0, + "step": 17981 + }, + { + "epoch": 2.287495229614553, + "ewc_loss": 0.008473964408040047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473964408040047e-05, + "grad_norm": 4.1226043701171875, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8776620030403137, + "num_tokens": 686195353.0, + "step": 17982 + }, + { + "epoch": 2.2876224398931435, + "ewc_loss": 0.008413687348365784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413687464781106e-05, + "grad_norm": 4.122004508972168, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8789828419685364, + "num_tokens": 686232990.0, + "step": 17983 + }, + { + "epoch": 2.287749650171734, + "ewc_loss": 0.008446448482573032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446448919130489e-05, + "grad_norm": 4.170423984527588, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 
0.8848162293434143, + "num_tokens": 686265933.0, + "step": 17984 + }, + { + "epoch": 2.2878768604503246, + "ewc_loss": 0.008458149619400501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.458149386569858e-05, + "grad_norm": 4.114366054534912, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8887667655944824, + "num_tokens": 686307111.0, + "step": 17985 + }, + { + "epoch": 2.288004070728915, + "ewc_loss": 0.008426911197602749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.426911517744884e-05, + "grad_norm": 4.174413681030273, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8827202320098877, + "num_tokens": 686343363.0, + "step": 17986 + }, + { + "epoch": 2.288131281007505, + "ewc_loss": 0.008489486761391163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489486936014146e-05, + "grad_norm": 4.1275715827941895, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8805367946624756, + "num_tokens": 686380888.0, + "step": 17987 + }, + { + "epoch": 2.288258491286096, + "ewc_loss": 0.008417821489274502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.417821663897485e-05, + "grad_norm": 4.150210380554199, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8744534850120544, + "num_tokens": 686417350.0, + "step": 17988 + }, + { + "epoch": 2.2883857015646862, + "ewc_loss": 0.008462787605822086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4627878095489e-05, + "grad_norm": 4.105931282043457, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8894274830818176, + "num_tokens": 686458658.0, + "step": 17989 + }, + { + "epoch": 2.2885129118432768, + "ewc_loss": 0.008422233164310455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.422233076998964e-05, + "grad_norm": 4.135403633117676, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8832398056983948, + "num_tokens": 686493945.0, + "step": 17990 + }, + { + "epoch": 2.2886401221218673, + "ewc_loss": 
0.00843839906156063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438399527221918e-05, + "grad_norm": 4.151108264923096, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8838330507278442, + "num_tokens": 686527608.0, + "step": 17991 + }, + { + "epoch": 2.288767332400458, + "ewc_loss": 0.008443736471235752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443736442131922e-05, + "grad_norm": 4.178735733032227, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8915857672691345, + "num_tokens": 686562272.0, + "step": 17992 + }, + { + "epoch": 2.2888945426790483, + "ewc_loss": 0.008437464945018291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437465294264257e-05, + "grad_norm": 4.128634929656982, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8919078707695007, + "num_tokens": 686599937.0, + "step": 17993 + }, + { + "epoch": 2.289021752957639, + "ewc_loss": 0.008417472243309021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.417472417932004e-05, + "grad_norm": 4.189533233642578, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8827029466629028, + "num_tokens": 686632103.0, + "step": 17994 + }, + { + "epoch": 2.2891489632362294, + "ewc_loss": 0.008470178581774235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470178727293387e-05, + "grad_norm": 4.159964084625244, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8869900703430176, + "num_tokens": 686662802.0, + "step": 17995 + }, + { + "epoch": 2.28927617351482, + "ewc_loss": 0.008446400053799152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446400170214474e-05, + "grad_norm": 4.099163055419922, + "learning_rate": 1e-06, + "loss": 0.282, + "mean_token_accuracy": 0.9019126296043396, + "num_tokens": 686699386.0, + "step": 17996 + }, + { + "epoch": 2.2894033837934105, + "ewc_loss": 0.008395334705710411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395334589295089e-05, + "grad_norm": 
4.134224891662598, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8852475881576538, + "num_tokens": 686738369.0, + "step": 17997 + }, + { + "epoch": 2.289530594072001, + "ewc_loss": 0.008461590856313705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.461590914521366e-05, + "grad_norm": 4.177311420440674, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8771146535873413, + "num_tokens": 686774543.0, + "step": 17998 + }, + { + "epoch": 2.2896578043505915, + "ewc_loss": 0.008456747978925705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.456748037133366e-05, + "grad_norm": 4.168992042541504, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8771167993545532, + "num_tokens": 686807048.0, + "step": 17999 + }, + { + "epoch": 2.289785014629182, + "ewc_loss": 0.008450913242995739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450913446722552e-05, + "grad_norm": 4.1335768699646, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8829038739204407, + "num_tokens": 686844678.0, + "step": 18000 + }, + { + "epoch": 2.2899122249077726, + "ewc_loss": 0.008454293943941593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454293856630102e-05, + "grad_norm": 4.262998580932617, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8814390897750854, + "num_tokens": 686877721.0, + "step": 18001 + }, + { + "epoch": 2.290039435186363, + "ewc_loss": 0.008526756428182125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.526756573701277e-05, + "grad_norm": 4.121710777282715, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8769218921661377, + "num_tokens": 686918329.0, + "step": 18002 + }, + { + "epoch": 2.2901666454649536, + "ewc_loss": 0.008413768373429775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413768227910623e-05, + "grad_norm": 4.15139627456665, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8772743940353394, + 
"num_tokens": 686960050.0, + "step": 18003 + }, + { + "epoch": 2.290293855743544, + "ewc_loss": 0.00847067590802908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470675675198436e-05, + "grad_norm": 4.098043918609619, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8915714025497437, + "num_tokens": 686997849.0, + "step": 18004 + }, + { + "epoch": 2.2904210660221347, + "ewc_loss": 0.008439076133072376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439076191280037e-05, + "grad_norm": 4.190327167510986, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8843023777008057, + "num_tokens": 687032363.0, + "step": 18005 + }, + { + "epoch": 2.290548276300725, + "ewc_loss": 0.008509606122970581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509605686413124e-05, + "grad_norm": 4.183898448944092, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8685774803161621, + "num_tokens": 687069606.0, + "step": 18006 + }, + { + "epoch": 2.2906754865793157, + "ewc_loss": 0.008452702313661575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.452701877104118e-05, + "grad_norm": 4.132371425628662, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8726851940155029, + "num_tokens": 687112857.0, + "step": 18007 + }, + { + "epoch": 2.2908026968579063, + "ewc_loss": 0.008440292440354824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.440292731393129e-05, + "grad_norm": 4.139632225036621, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8781630992889404, + "num_tokens": 687152245.0, + "step": 18008 + }, + { + "epoch": 2.290929907136497, + "ewc_loss": 0.008460928685963154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460928802378476e-05, + "grad_norm": 4.124840259552002, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8811872005462646, + "num_tokens": 687193988.0, + "step": 18009 + }, + { + "epoch": 2.2910571174150873, + "ewc_loss": 0.008430557325482368, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.430557500105351e-05, + "grad_norm": 4.185027122497559, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8779326677322388, + "num_tokens": 687232375.0, + "step": 18010 + }, + { + "epoch": 2.291184327693678, + "ewc_loss": 0.008479680866003036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47968112793751e-05, + "grad_norm": 4.116087913513184, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8853653073310852, + "num_tokens": 687274566.0, + "step": 18011 + }, + { + "epoch": 2.291311537972268, + "ewc_loss": 0.008399611338973045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399611397180706e-05, + "grad_norm": 4.147383689880371, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8812698721885681, + "num_tokens": 687313231.0, + "step": 18012 + }, + { + "epoch": 2.291438748250859, + "ewc_loss": 0.008436786942183971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.436787175014615e-05, + "grad_norm": 4.11274528503418, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.886847198009491, + "num_tokens": 687354502.0, + "step": 18013 + }, + { + "epoch": 2.291565958529449, + "ewc_loss": 0.00841492973268032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.414930198341608e-05, + "grad_norm": 4.1525068283081055, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8789111375808716, + "num_tokens": 687399160.0, + "step": 18014 + }, + { + "epoch": 2.2916931688080395, + "ewc_loss": 0.008418153040111065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41815271996893e-05, + "grad_norm": 4.111273765563965, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8727437257766724, + "num_tokens": 687438775.0, + "step": 18015 + }, + { + "epoch": 2.29182037908663, + "ewc_loss": 0.008374052122235298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.374051685677841e-05, + "grad_norm": 4.225513458251953, + "learning_rate": 
1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8853391408920288, + "num_tokens": 687471435.0, + "step": 18016 + }, + { + "epoch": 2.2919475893652206, + "ewc_loss": 0.008448850363492966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448850712738931e-05, + "grad_norm": 4.116294860839844, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.885666012763977, + "num_tokens": 687508657.0, + "step": 18017 + }, + { + "epoch": 2.292074799643811, + "ewc_loss": 0.008342849090695381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.342849469045177e-05, + "grad_norm": 4.157341480255127, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.892031192779541, + "num_tokens": 687545024.0, + "step": 18018 + }, + { + "epoch": 2.2922020099224016, + "ewc_loss": 0.008417844772338867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41784494696185e-05, + "grad_norm": 4.1832709312438965, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8863093852996826, + "num_tokens": 687579432.0, + "step": 18019 + }, + { + "epoch": 2.292329220200992, + "ewc_loss": 0.008400996215641499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400996011914685e-05, + "grad_norm": 4.193959712982178, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8743569254875183, + "num_tokens": 687618378.0, + "step": 18020 + }, + { + "epoch": 2.2924564304795827, + "ewc_loss": 0.008395475335419178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395475742872804e-05, + "grad_norm": 4.1282148361206055, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8818129897117615, + "num_tokens": 687657119.0, + "step": 18021 + }, + { + "epoch": 2.2925836407581732, + "ewc_loss": 0.008363990113139153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363989763893187e-05, + "grad_norm": 4.1923418045043945, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8721655607223511, + "num_tokens": 687691924.0, + "step": 18022 + 
}, + { + "epoch": 2.2927108510367638, + "ewc_loss": 0.008410798385739326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410798182012513e-05, + "grad_norm": 4.269139289855957, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8600740432739258, + "num_tokens": 687724221.0, + "step": 18023 + }, + { + "epoch": 2.2928380613153543, + "ewc_loss": 0.00843617133796215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.436170901404694e-05, + "grad_norm": 4.108133792877197, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.883283257484436, + "num_tokens": 687765049.0, + "step": 18024 + }, + { + "epoch": 2.292965271593945, + "ewc_loss": 0.008312311954796314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.312312274938449e-05, + "grad_norm": 4.1190667152404785, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8744913339614868, + "num_tokens": 687810222.0, + "step": 18025 + }, + { + "epoch": 2.2930924818725353, + "ewc_loss": 0.008417777717113495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.417777280556038e-05, + "grad_norm": 4.173360824584961, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8779029846191406, + "num_tokens": 687851961.0, + "step": 18026 + }, + { + "epoch": 2.293219692151126, + "ewc_loss": 0.008423023857176304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.423023973591626e-05, + "grad_norm": 4.154329299926758, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8712767362594604, + "num_tokens": 687890448.0, + "step": 18027 + }, + { + "epoch": 2.2933469024297164, + "ewc_loss": 0.00839513260871172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395133045269176e-05, + "grad_norm": 4.139448165893555, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8919671177864075, + "num_tokens": 687929509.0, + "step": 18028 + }, + { + "epoch": 2.293474112708307, + "ewc_loss": 0.008391938172280788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.391938172280788e-05, + "grad_norm": 4.16587495803833, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8729469180107117, + "num_tokens": 687966814.0, + "step": 18029 + }, + { + "epoch": 2.2936013229868975, + "ewc_loss": 0.008423243649303913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.423243707511574e-05, + "grad_norm": 4.129543304443359, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8889613747596741, + "num_tokens": 688006398.0, + "step": 18030 + }, + { + "epoch": 2.293728533265488, + "ewc_loss": 0.008373270742595196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37327097542584e-05, + "grad_norm": 4.149067401885986, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8806111216545105, + "num_tokens": 688044604.0, + "step": 18031 + }, + { + "epoch": 2.2938557435440785, + "ewc_loss": 0.008409924805164337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409925067098811e-05, + "grad_norm": 4.157164096832275, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8775793313980103, + "num_tokens": 688082969.0, + "step": 18032 + }, + { + "epoch": 2.293982953822669, + "ewc_loss": 0.00839370395988226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.393704047193751e-05, + "grad_norm": 4.15176248550415, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8884477615356445, + "num_tokens": 688121442.0, + "step": 18033 + }, + { + "epoch": 2.2941101641012596, + "ewc_loss": 0.008370942436158657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.370942668989301e-05, + "grad_norm": 4.132124900817871, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8829979300498962, + "num_tokens": 688161096.0, + "step": 18034 + }, + { + "epoch": 2.2942373743798496, + "ewc_loss": 0.008383175358176231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.383175736526027e-05, + "grad_norm": 4.1589789390563965, + "learning_rate": 1e-06, + "loss": 0.3463, + 
"mean_token_accuracy": 0.8813568353652954, + "num_tokens": 688200782.0, + "step": 18035 + }, + { + "epoch": 2.2943645846584406, + "ewc_loss": 0.008379984647035599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.379984501516446e-05, + "grad_norm": 4.099522590637207, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8870953917503357, + "num_tokens": 688241333.0, + "step": 18036 + }, + { + "epoch": 2.2944917949370307, + "ewc_loss": 0.008353735320270061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.353735756827518e-05, + "grad_norm": 4.208035469055176, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8802716135978699, + "num_tokens": 688278297.0, + "step": 18037 + }, + { + "epoch": 2.2946190052156212, + "ewc_loss": 0.00842704065144062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.427040302194655e-05, + "grad_norm": 4.158653259277344, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8583613634109497, + "num_tokens": 688318835.0, + "step": 18038 + }, + { + "epoch": 2.2947462154942118, + "ewc_loss": 0.00836027693003416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360276842722669e-05, + "grad_norm": 4.187314033508301, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8840137124061584, + "num_tokens": 688354814.0, + "step": 18039 + }, + { + "epoch": 2.2948734257728023, + "ewc_loss": 0.008394502103328705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.394502219744027e-05, + "grad_norm": 4.134387016296387, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.884631872177124, + "num_tokens": 688390008.0, + "step": 18040 + }, + { + "epoch": 2.295000636051393, + "ewc_loss": 0.008369524031877518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369523857254535e-05, + "grad_norm": 4.124257564544678, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8850035071372986, + "num_tokens": 688430938.0, + "step": 18041 + }, + { + "epoch": 
2.2951278463299833, + "ewc_loss": 0.008380708284676075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.380708459299058e-05, + "grad_norm": 4.280062198638916, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8523199558258057, + "num_tokens": 688466730.0, + "step": 18042 + }, + { + "epoch": 2.295255056608574, + "ewc_loss": 0.008462820202112198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462819823762402e-05, + "grad_norm": 4.150890350341797, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8801519274711609, + "num_tokens": 688506666.0, + "step": 18043 + }, + { + "epoch": 2.2953822668871644, + "ewc_loss": 0.008349346928298473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.349346899194643e-05, + "grad_norm": 4.143578052520752, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8680427074432373, + "num_tokens": 688547726.0, + "step": 18044 + }, + { + "epoch": 2.295509477165755, + "ewc_loss": 0.008397966623306274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39796630316414e-05, + "grad_norm": 4.148331165313721, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8773676156997681, + "num_tokens": 688587487.0, + "step": 18045 + }, + { + "epoch": 2.2956366874443455, + "ewc_loss": 0.008387635461986065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38763517094776e-05, + "grad_norm": 4.080263137817383, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8752520084381104, + "num_tokens": 688632454.0, + "step": 18046 + }, + { + "epoch": 2.295763897722936, + "ewc_loss": 0.00836588628590107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.365885878447443e-05, + "grad_norm": 4.142611503601074, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8889989256858826, + "num_tokens": 688668489.0, + "step": 18047 + }, + { + "epoch": 2.2958911080015265, + "ewc_loss": 0.008417797274887562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.417796925641596e-05, + "grad_norm": 4.101139068603516, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8844128847122192, + "num_tokens": 688707899.0, + "step": 18048 + }, + { + "epoch": 2.296018318280117, + "ewc_loss": 0.008376811631023884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.376811456400901e-05, + "grad_norm": 4.379350662231445, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.887151300907135, + "num_tokens": 688747171.0, + "step": 18049 + }, + { + "epoch": 2.2961455285587076, + "ewc_loss": 0.008539212867617607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53921301313676e-05, + "grad_norm": 4.107034206390381, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8792675137519836, + "num_tokens": 688789553.0, + "step": 18050 + }, + { + "epoch": 2.296272738837298, + "ewc_loss": 0.008296951651573181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.296951273223385e-05, + "grad_norm": 4.137153148651123, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8865512609481812, + "num_tokens": 688827632.0, + "step": 18051 + }, + { + "epoch": 2.2963999491158886, + "ewc_loss": 0.008413651026785374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413651084993035e-05, + "grad_norm": 4.204016208648682, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8882715702056885, + "num_tokens": 688861505.0, + "step": 18052 + }, + { + "epoch": 2.296527159394479, + "ewc_loss": 0.008433074690401554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433074981439859e-05, + "grad_norm": 4.1704840660095215, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8753534555435181, + "num_tokens": 688898270.0, + "step": 18053 + }, + { + "epoch": 2.2966543696730697, + "ewc_loss": 0.008361684158444405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.361684012925252e-05, + "grad_norm": 4.142828941345215, + "learning_rate": 1e-06, + "loss": 0.3418, + 
"mean_token_accuracy": 0.8825694918632507, + "num_tokens": 688937420.0, + "step": 18054 + }, + { + "epoch": 2.29678157995166, + "ewc_loss": 0.008388740010559559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388739661313593e-05, + "grad_norm": 4.131381034851074, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8871378898620605, + "num_tokens": 688972426.0, + "step": 18055 + }, + { + "epoch": 2.2969087902302507, + "ewc_loss": 0.008377795107662678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377795165870339e-05, + "grad_norm": 4.187654972076416, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8816110491752625, + "num_tokens": 689005801.0, + "step": 18056 + }, + { + "epoch": 2.2970360005088413, + "ewc_loss": 0.008427999913692474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428000001003966e-05, + "grad_norm": 4.1752471923828125, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.877234160900116, + "num_tokens": 689039999.0, + "step": 18057 + }, + { + "epoch": 2.297163210787432, + "ewc_loss": 0.008416232652962208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.416232594754547e-05, + "grad_norm": 4.143233299255371, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8670597672462463, + "num_tokens": 689079404.0, + "step": 18058 + }, + { + "epoch": 2.2972904210660223, + "ewc_loss": 0.008406039327383041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.406038978137076e-05, + "grad_norm": 4.1945905685424805, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.861015796661377, + "num_tokens": 689116197.0, + "step": 18059 + }, + { + "epoch": 2.2974176313446124, + "ewc_loss": 0.008468752726912498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.468752639601007e-05, + "grad_norm": 4.144845008850098, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8830589652061462, + "num_tokens": 689153446.0, + "step": 18060 + }, + { + "epoch": 
2.2975448416232034, + "ewc_loss": 0.008413919247686863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413919567828998e-05, + "grad_norm": 4.297328948974609, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8640384674072266, + "num_tokens": 689185310.0, + "step": 18061 + }, + { + "epoch": 2.2976720519017935, + "ewc_loss": 0.008558112196624279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558112313039601e-05, + "grad_norm": 4.1581878662109375, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8923306465148926, + "num_tokens": 689224150.0, + "step": 18062 + }, + { + "epoch": 2.297799262180384, + "ewc_loss": 0.008406841196119785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.406840788666159e-05, + "grad_norm": 4.180199146270752, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8730581998825073, + "num_tokens": 689261206.0, + "step": 18063 + }, + { + "epoch": 2.2979264724589745, + "ewc_loss": 0.008492841385304928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492841152474284e-05, + "grad_norm": 4.1915812492370605, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8858131170272827, + "num_tokens": 689299129.0, + "step": 18064 + }, + { + "epoch": 2.298053682737565, + "ewc_loss": 0.00849073939025402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490739128319547e-05, + "grad_norm": 4.1674089431762695, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8969982862472534, + "num_tokens": 689330554.0, + "step": 18065 + }, + { + "epoch": 2.2981808930161556, + "ewc_loss": 0.008473547175526619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473547495668754e-05, + "grad_norm": 4.184312343597412, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8879331350326538, + "num_tokens": 689364084.0, + "step": 18066 + }, + { + "epoch": 2.298308103294746, + "ewc_loss": 0.008496221154928207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.496221562381834e-05, + "grad_norm": 4.149536609649658, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.871397852897644, + "num_tokens": 689405254.0, + "step": 18067 + }, + { + "epoch": 2.2984353135733366, + "ewc_loss": 0.00846671499311924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.466715371469036e-05, + "grad_norm": 4.217231273651123, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.872011661529541, + "num_tokens": 689441109.0, + "step": 18068 + }, + { + "epoch": 2.298562523851927, + "ewc_loss": 0.008524194359779358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524194709025323e-05, + "grad_norm": 4.1693949699401855, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8814830780029297, + "num_tokens": 689474937.0, + "step": 18069 + }, + { + "epoch": 2.2986897341305177, + "ewc_loss": 0.008483685553073883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483685815008357e-05, + "grad_norm": 4.160933494567871, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8755133152008057, + "num_tokens": 689515971.0, + "step": 18070 + }, + { + "epoch": 2.298816944409108, + "ewc_loss": 0.008508607745170593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.508607425028458e-05, + "grad_norm": 4.171811103820801, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8604390621185303, + "num_tokens": 689553620.0, + "step": 18071 + }, + { + "epoch": 2.2989441546876987, + "ewc_loss": 0.008518199436366558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518199319951236e-05, + "grad_norm": 4.1297287940979, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8868099451065063, + "num_tokens": 689592157.0, + "step": 18072 + }, + { + "epoch": 2.2990713649662893, + "ewc_loss": 0.00847148522734642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471484761685133e-05, + "grad_norm": 4.132752895355225, + "learning_rate": 1e-06, + "loss": 0.2957, + 
"mean_token_accuracy": 0.8943496942520142, + "num_tokens": 689626889.0, + "step": 18073 + }, + { + "epoch": 2.29919857524488, + "ewc_loss": 0.008487221784889698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48722193040885e-05, + "grad_norm": 4.105697154998779, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8922325372695923, + "num_tokens": 689666161.0, + "step": 18074 + }, + { + "epoch": 2.2993257855234703, + "ewc_loss": 0.008460103534162045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460103708785027e-05, + "grad_norm": 4.154632091522217, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8739209175109863, + "num_tokens": 689713713.0, + "step": 18075 + }, + { + "epoch": 2.299452995802061, + "ewc_loss": 0.008499636314809322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49963616929017e-05, + "grad_norm": 4.145843982696533, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8749246597290039, + "num_tokens": 689755146.0, + "step": 18076 + }, + { + "epoch": 2.2995802060806514, + "ewc_loss": 0.008476612158119678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476612129015848e-05, + "grad_norm": 4.140752792358398, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8887935876846313, + "num_tokens": 689795925.0, + "step": 18077 + }, + { + "epoch": 2.299707416359242, + "ewc_loss": 0.008470749482512474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4707498899661e-05, + "grad_norm": 4.160580158233643, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8934941291809082, + "num_tokens": 689830673.0, + "step": 18078 + }, + { + "epoch": 2.2998346266378324, + "ewc_loss": 0.00846984051167965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469840395264328e-05, + "grad_norm": 4.237237930297852, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.882472574710846, + "num_tokens": 689863649.0, + "step": 18079 + }, + { + "epoch": 2.299961836916423, 
+ "ewc_loss": 0.008513886481523514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513886132277548e-05, + "grad_norm": 4.150288105010986, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8812081813812256, + "num_tokens": 689901946.0, + "step": 18080 + }, + { + "epoch": 2.3000890471950135, + "ewc_loss": 0.008441228419542313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.441228419542313e-05, + "grad_norm": 4.158388614654541, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8773999214172363, + "num_tokens": 689939968.0, + "step": 18081 + }, + { + "epoch": 2.300216257473604, + "ewc_loss": 0.008470377884805202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470378088532016e-05, + "grad_norm": 4.089478492736816, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8889034390449524, + "num_tokens": 689979792.0, + "step": 18082 + }, + { + "epoch": 2.3003434677521946, + "ewc_loss": 0.008436369709670544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.436369535047561e-05, + "grad_norm": 4.134739398956299, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8929558992385864, + "num_tokens": 690013974.0, + "step": 18083 + }, + { + "epoch": 2.300470678030785, + "ewc_loss": 0.008485717698931694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485717262374237e-05, + "grad_norm": 4.104553699493408, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8878788948059082, + "num_tokens": 690056540.0, + "step": 18084 + }, + { + "epoch": 2.300597888309375, + "ewc_loss": 0.008438642136752605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43864181661047e-05, + "grad_norm": 4.12801456451416, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8770701885223389, + "num_tokens": 690101032.0, + "step": 18085 + }, + { + "epoch": 2.300725098587966, + "ewc_loss": 0.008470302447676659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470302418572828e-05, + "grad_norm": 
4.172637462615967, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8924235701560974, + "num_tokens": 690135417.0, + "step": 18086 + }, + { + "epoch": 2.3008523088665562, + "ewc_loss": 0.008449449203908443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.449449524050578e-05, + "grad_norm": 4.128812789916992, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8829560875892639, + "num_tokens": 690178552.0, + "step": 18087 + }, + { + "epoch": 2.3009795191451468, + "ewc_loss": 0.008426014333963394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.426014392171055e-05, + "grad_norm": 4.161203384399414, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8778170943260193, + "num_tokens": 690221928.0, + "step": 18088 + }, + { + "epoch": 2.3011067294237373, + "ewc_loss": 0.008429688401520252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.429688750766218e-05, + "grad_norm": 4.1726789474487305, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8874552845954895, + "num_tokens": 690257040.0, + "step": 18089 + }, + { + "epoch": 2.301233939702328, + "ewc_loss": 0.008403927087783813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40392749523744e-05, + "grad_norm": 4.118771553039551, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8826920986175537, + "num_tokens": 690302720.0, + "step": 18090 + }, + { + "epoch": 2.3013611499809183, + "ewc_loss": 0.008360099047422409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360098581761122e-05, + "grad_norm": 4.208489894866943, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8787001371383667, + "num_tokens": 690337042.0, + "step": 18091 + }, + { + "epoch": 2.301488360259509, + "ewc_loss": 0.00841432809829712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.414327749051154e-05, + "grad_norm": 4.081060409545898, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8872420191764832, + 
"num_tokens": 690380247.0, + "step": 18092 + }, + { + "epoch": 2.3016155705380994, + "ewc_loss": 0.008318501524627209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.318501932080835e-05, + "grad_norm": 4.1214375495910645, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8861093521118164, + "num_tokens": 690425361.0, + "step": 18093 + }, + { + "epoch": 2.30174278081669, + "ewc_loss": 0.0083822226151824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382222586078569e-05, + "grad_norm": 4.1916608810424805, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8769311904907227, + "num_tokens": 690465137.0, + "step": 18094 + }, + { + "epoch": 2.3018699910952805, + "ewc_loss": 0.008391189388930798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391189476242289e-05, + "grad_norm": 4.188723087310791, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8794020414352417, + "num_tokens": 690501437.0, + "step": 18095 + }, + { + "epoch": 2.301997201373871, + "ewc_loss": 0.008346165530383587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346165850525722e-05, + "grad_norm": 4.10873556137085, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8775011897087097, + "num_tokens": 690544134.0, + "step": 18096 + }, + { + "epoch": 2.3021244116524615, + "ewc_loss": 0.00832846388220787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328464173246175e-05, + "grad_norm": 4.1480183601379395, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8902227878570557, + "num_tokens": 690581062.0, + "step": 18097 + }, + { + "epoch": 2.302251621931052, + "ewc_loss": 0.008376333862543106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37633415358141e-05, + "grad_norm": 4.17970609664917, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8854986429214478, + "num_tokens": 690615145.0, + "step": 18098 + }, + { + "epoch": 2.3023788322096426, + "ewc_loss": 0.008370250463485718, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37024999782443e-05, + "grad_norm": 4.109308242797852, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8865624666213989, + "num_tokens": 690655483.0, + "step": 18099 + }, + { + "epoch": 2.302506042488233, + "ewc_loss": 0.008335740305483341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.335740130860358e-05, + "grad_norm": 4.122543811798096, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8799545764923096, + "num_tokens": 690694133.0, + "step": 18100 + }, + { + "epoch": 2.3026332527668236, + "ewc_loss": 0.008368601091206074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.368601265829057e-05, + "grad_norm": 4.237467288970947, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8611250519752502, + "num_tokens": 690729939.0, + "step": 18101 + }, + { + "epoch": 2.302760463045414, + "ewc_loss": 0.008442078717052937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44207897898741e-05, + "grad_norm": 4.098316192626953, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8877245187759399, + "num_tokens": 690770139.0, + "step": 18102 + }, + { + "epoch": 2.3028876733240047, + "ewc_loss": 0.008313694968819618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.313695434480906e-05, + "grad_norm": 4.118509292602539, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8832287788391113, + "num_tokens": 690811379.0, + "step": 18103 + }, + { + "epoch": 2.303014883602595, + "ewc_loss": 0.008396908640861511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396909106522799e-05, + "grad_norm": 4.228490829467773, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8783156871795654, + "num_tokens": 690844194.0, + "step": 18104 + }, + { + "epoch": 2.3031420938811857, + "ewc_loss": 0.00843260157853365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432601316599175e-05, + "grad_norm": 4.158803462982178, + "learning_rate": 
1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8747119903564453, + "num_tokens": 690882129.0, + "step": 18105 + }, + { + "epoch": 2.3032693041597763, + "ewc_loss": 0.008336750790476799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.336750761372969e-05, + "grad_norm": 4.1161675453186035, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8934827446937561, + "num_tokens": 690915568.0, + "step": 18106 + }, + { + "epoch": 2.303396514438367, + "ewc_loss": 0.008364538662135601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.364539098693058e-05, + "grad_norm": 4.134472846984863, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8792718648910522, + "num_tokens": 690953412.0, + "step": 18107 + }, + { + "epoch": 2.3035237247169573, + "ewc_loss": 0.008375332690775394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375332254217938e-05, + "grad_norm": 4.166862964630127, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8796940445899963, + "num_tokens": 690990047.0, + "step": 18108 + }, + { + "epoch": 2.303650934995548, + "ewc_loss": 0.008381081745028496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.381081715924665e-05, + "grad_norm": 4.155308723449707, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8876333236694336, + "num_tokens": 691031010.0, + "step": 18109 + }, + { + "epoch": 2.303778145274138, + "ewc_loss": 0.00837133452296257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371334115508944e-05, + "grad_norm": 4.174740791320801, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8836562633514404, + "num_tokens": 691065830.0, + "step": 18110 + }, + { + "epoch": 2.303905355552729, + "ewc_loss": 0.008390848524868488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390848961425945e-05, + "grad_norm": 4.170803546905518, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.891901969909668, + "num_tokens": 691101418.0, + "step": 18111 + }, 
+ { + "epoch": 2.304032565831319, + "ewc_loss": 0.008383821696043015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.383821841562167e-05, + "grad_norm": 4.197635650634766, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8676897287368774, + "num_tokens": 691139447.0, + "step": 18112 + }, + { + "epoch": 2.3041597761099095, + "ewc_loss": 0.00839182734489441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391827577725053e-05, + "grad_norm": 4.261251449584961, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8632850646972656, + "num_tokens": 691168809.0, + "step": 18113 + }, + { + "epoch": 2.3042869863885, + "ewc_loss": 0.008433944545686245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433944458374754e-05, + "grad_norm": 4.187032222747803, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8868486285209656, + "num_tokens": 691200420.0, + "step": 18114 + }, + { + "epoch": 2.3044141966670906, + "ewc_loss": 0.008401372469961643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401372178923339e-05, + "grad_norm": 4.152347564697266, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8771528005599976, + "num_tokens": 691238587.0, + "step": 18115 + }, + { + "epoch": 2.304541406945681, + "ewc_loss": 0.008411949500441551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.411949238507077e-05, + "grad_norm": 4.172269821166992, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8818363547325134, + "num_tokens": 691277137.0, + "step": 18116 + }, + { + "epoch": 2.3046686172242716, + "ewc_loss": 0.00844162330031395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.441623504040763e-05, + "grad_norm": 4.086623191833496, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8805562853813171, + "num_tokens": 691319564.0, + "step": 18117 + }, + { + "epoch": 2.304795827502862, + "ewc_loss": 0.008398863486945629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.398863428737968e-05, + "grad_norm": 4.138979911804199, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8810327649116516, + "num_tokens": 691359652.0, + "step": 18118 + }, + { + "epoch": 2.3049230377814527, + "ewc_loss": 0.008450283668935299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450284076388925e-05, + "grad_norm": 4.133155345916748, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8811099529266357, + "num_tokens": 691399705.0, + "step": 18119 + }, + { + "epoch": 2.305050248060043, + "ewc_loss": 0.008420677855610847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.420678204856813e-05, + "grad_norm": 4.1697282791137695, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8766646385192871, + "num_tokens": 691439447.0, + "step": 18120 + }, + { + "epoch": 2.3051774583386337, + "ewc_loss": 0.008455216884613037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455217175651342e-05, + "grad_norm": 4.166989803314209, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.869694709777832, + "num_tokens": 691477453.0, + "step": 18121 + }, + { + "epoch": 2.3053046686172243, + "ewc_loss": 0.008440975099802017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.440975216217339e-05, + "grad_norm": 4.107900142669678, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8793079853057861, + "num_tokens": 691517766.0, + "step": 18122 + }, + { + "epoch": 2.305431878895815, + "ewc_loss": 0.00840085931122303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400859223911539e-05, + "grad_norm": 4.167895317077637, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8810838460922241, + "num_tokens": 691553956.0, + "step": 18123 + }, + { + "epoch": 2.3055590891744053, + "ewc_loss": 0.008448640815913677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448641165159643e-05, + "grad_norm": 4.1396074295043945, + "learning_rate": 1e-06, + "loss": 0.3496, + 
"mean_token_accuracy": 0.8791436553001404, + "num_tokens": 691592522.0, + "step": 18124 + }, + { + "epoch": 2.305686299452996, + "ewc_loss": 0.008419470861554146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.419470395892859e-05, + "grad_norm": 4.155946254730225, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8810502290725708, + "num_tokens": 691629202.0, + "step": 18125 + }, + { + "epoch": 2.3058135097315864, + "ewc_loss": 0.008436106145381927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.436106145381927e-05, + "grad_norm": 4.13314962387085, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8845933675765991, + "num_tokens": 691665324.0, + "step": 18126 + }, + { + "epoch": 2.305940720010177, + "ewc_loss": 0.008403250947594643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.403250831179321e-05, + "grad_norm": 4.101167678833008, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8904670476913452, + "num_tokens": 691704223.0, + "step": 18127 + }, + { + "epoch": 2.3060679302887674, + "ewc_loss": 0.008404061198234558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.404061372857541e-05, + "grad_norm": 4.144800662994385, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8789807558059692, + "num_tokens": 691745061.0, + "step": 18128 + }, + { + "epoch": 2.306195140567358, + "ewc_loss": 0.008427383378148079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.427382999798283e-05, + "grad_norm": 4.125140190124512, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8925902247428894, + "num_tokens": 691783630.0, + "step": 18129 + }, + { + "epoch": 2.3063223508459485, + "ewc_loss": 0.00838909950107336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389099093619734e-05, + "grad_norm": 4.203714370727539, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8807926177978516, + "num_tokens": 691818283.0, + "step": 18130 + }, + { + "epoch": 
2.306449561124539, + "ewc_loss": 0.008431753143668175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.431752939941362e-05, + "grad_norm": 4.158100605010986, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8745086193084717, + "num_tokens": 691854585.0, + "step": 18131 + }, + { + "epoch": 2.3065767714031296, + "ewc_loss": 0.0083920918405056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.392091694986448e-05, + "grad_norm": 4.18423318862915, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8646749258041382, + "num_tokens": 691891784.0, + "step": 18132 + }, + { + "epoch": 2.3067039816817196, + "ewc_loss": 0.008426660671830177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.426660497207195e-05, + "grad_norm": 4.162642955780029, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.882944643497467, + "num_tokens": 691927389.0, + "step": 18133 + }, + { + "epoch": 2.3068311919603106, + "ewc_loss": 0.008388628251850605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388628339162096e-05, + "grad_norm": 4.149207592010498, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8812002539634705, + "num_tokens": 691962928.0, + "step": 18134 + }, + { + "epoch": 2.3069584022389007, + "ewc_loss": 0.008395560085773468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395560143981129e-05, + "grad_norm": 4.102522850036621, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8830350637435913, + "num_tokens": 692007064.0, + "step": 18135 + }, + { + "epoch": 2.3070856125174912, + "ewc_loss": 0.008375568315386772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375568722840399e-05, + "grad_norm": 4.180194854736328, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8862287998199463, + "num_tokens": 692040668.0, + "step": 18136 + }, + { + "epoch": 2.3072128227960818, + "ewc_loss": 0.008439639583230019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.439639350399375e-05, + "grad_norm": 4.147982120513916, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8806666135787964, + "num_tokens": 692077710.0, + "step": 18137 + }, + { + "epoch": 2.3073400330746723, + "ewc_loss": 0.008400946855545044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400946535402909e-05, + "grad_norm": 4.1744160652160645, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.887881875038147, + "num_tokens": 692112316.0, + "step": 18138 + }, + { + "epoch": 2.307467243353263, + "ewc_loss": 0.008431983180344105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43198286020197e-05, + "grad_norm": 4.17743444442749, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8855909705162048, + "num_tokens": 692153201.0, + "step": 18139 + }, + { + "epoch": 2.3075944536318533, + "ewc_loss": 0.00842251069843769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.422510290984064e-05, + "grad_norm": 4.1414618492126465, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8750085830688477, + "num_tokens": 692192942.0, + "step": 18140 + }, + { + "epoch": 2.307721663910444, + "ewc_loss": 0.008407223038375378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407222776440904e-05, + "grad_norm": 4.101325035095215, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8954653739929199, + "num_tokens": 692235515.0, + "step": 18141 + }, + { + "epoch": 2.3078488741890344, + "ewc_loss": 0.008387366309762001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387365960516036e-05, + "grad_norm": 4.212987422943115, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8724794983863831, + "num_tokens": 692271338.0, + "step": 18142 + }, + { + "epoch": 2.307976084467625, + "ewc_loss": 0.00846024602651596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460246317554265e-05, + "grad_norm": 4.179156303405762, + "learning_rate": 1e-06, + "loss": 0.397, + 
"mean_token_accuracy": 0.8636325597763062, + "num_tokens": 692311469.0, + "step": 18143 + }, + { + "epoch": 2.3081032947462155, + "ewc_loss": 0.00839726161211729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.397261990467086e-05, + "grad_norm": 4.185278415679932, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8651746511459351, + "num_tokens": 692352052.0, + "step": 18144 + }, + { + "epoch": 2.308230505024806, + "ewc_loss": 0.008431138470768929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.431138121522963e-05, + "grad_norm": 4.263826370239258, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8669381737709045, + "num_tokens": 692382978.0, + "step": 18145 + }, + { + "epoch": 2.3083577153033965, + "ewc_loss": 0.008472489193081856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47248884383589e-05, + "grad_norm": 4.181813716888428, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8840478658676147, + "num_tokens": 692415651.0, + "step": 18146 + }, + { + "epoch": 2.308484925581987, + "ewc_loss": 0.008396001532673836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396001067012548e-05, + "grad_norm": 4.144720554351807, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8648351430892944, + "num_tokens": 692455937.0, + "step": 18147 + }, + { + "epoch": 2.3086121358605776, + "ewc_loss": 0.008447748608887196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447748405160382e-05, + "grad_norm": 4.1435065269470215, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8762617707252502, + "num_tokens": 692496277.0, + "step": 18148 + }, + { + "epoch": 2.308739346139168, + "ewc_loss": 0.008443585596978664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443585829809308e-05, + "grad_norm": 4.179755687713623, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8828645944595337, + "num_tokens": 692530485.0, + "step": 18149 + }, + { + "epoch": 
2.3088665564177586, + "ewc_loss": 0.008483463898301125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483463898301125e-05, + "grad_norm": 4.162874221801758, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8780298233032227, + "num_tokens": 692564794.0, + "step": 18150 + }, + { + "epoch": 2.308993766696349, + "ewc_loss": 0.008471337147057056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471337059745565e-05, + "grad_norm": 4.1478657722473145, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8713957667350769, + "num_tokens": 692610359.0, + "step": 18151 + }, + { + "epoch": 2.3091209769749397, + "ewc_loss": 0.008479277603328228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479278039885685e-05, + "grad_norm": 4.18310546875, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8876022100448608, + "num_tokens": 692641560.0, + "step": 18152 + }, + { + "epoch": 2.30924818725353, + "ewc_loss": 0.00851126667112112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511266787536442e-05, + "grad_norm": 4.19070291519165, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8774786591529846, + "num_tokens": 692680114.0, + "step": 18153 + }, + { + "epoch": 2.3093753975321207, + "ewc_loss": 0.008509193547070026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5091931396164e-05, + "grad_norm": 4.194622039794922, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8699444532394409, + "num_tokens": 692714199.0, + "step": 18154 + }, + { + "epoch": 2.3095026078107113, + "ewc_loss": 0.008502403274178505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502403215970844e-05, + "grad_norm": 4.112781524658203, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.882548451423645, + "num_tokens": 692752331.0, + "step": 18155 + }, + { + "epoch": 2.309629818089302, + "ewc_loss": 0.008470741100609303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470741158816963e-05, + 
"grad_norm": 4.144254207611084, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8870970010757446, + "num_tokens": 692789167.0, + "step": 18156 + }, + { + "epoch": 2.3097570283678923, + "ewc_loss": 0.008526872843503952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.526872989023104e-05, + "grad_norm": 4.105928897857666, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8811245560646057, + "num_tokens": 692829234.0, + "step": 18157 + }, + { + "epoch": 2.3098842386464824, + "ewc_loss": 0.00849476084113121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.494760550092906e-05, + "grad_norm": 4.2598185539245605, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8627290725708008, + "num_tokens": 692863749.0, + "step": 18158 + }, + { + "epoch": 2.3100114489250734, + "ewc_loss": 0.00858934037387371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589340723119676e-05, + "grad_norm": 4.179786682128906, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8772597312927246, + "num_tokens": 692900972.0, + "step": 18159 + }, + { + "epoch": 2.3101386592036635, + "ewc_loss": 0.008479876443743706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479876123601571e-05, + "grad_norm": 4.1372528076171875, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8757959604263306, + "num_tokens": 692938522.0, + "step": 18160 + }, + { + "epoch": 2.310265869482254, + "ewc_loss": 0.008483810350298882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48381023388356e-05, + "grad_norm": 4.1761908531188965, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8900138139724731, + "num_tokens": 692973653.0, + "step": 18161 + }, + { + "epoch": 2.3103930797608445, + "ewc_loss": 0.008534129709005356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.534130029147491e-05, + "grad_norm": 4.143221378326416, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 
0.8754820227622986, + "num_tokens": 693016325.0, + "step": 18162 + }, + { + "epoch": 2.310520290039435, + "ewc_loss": 0.008497306145727634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.497306407662109e-05, + "grad_norm": 4.135001182556152, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8897558450698853, + "num_tokens": 693052022.0, + "step": 18163 + }, + { + "epoch": 2.3106475003180256, + "ewc_loss": 0.00851054023951292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510540646966547e-05, + "grad_norm": 4.139949798583984, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8846881985664368, + "num_tokens": 693093936.0, + "step": 18164 + }, + { + "epoch": 2.310774710596616, + "ewc_loss": 0.00849184114485979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491841435898095e-05, + "grad_norm": 4.135419845581055, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8944350481033325, + "num_tokens": 693130160.0, + "step": 18165 + }, + { + "epoch": 2.3109019208752066, + "ewc_loss": 0.008508525788784027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50852593430318e-05, + "grad_norm": 4.142216205596924, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8834258317947388, + "num_tokens": 693172405.0, + "step": 18166 + }, + { + "epoch": 2.311029131153797, + "ewc_loss": 0.008477694354951382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477694791508839e-05, + "grad_norm": 4.151863098144531, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8839681148529053, + "num_tokens": 693206499.0, + "step": 18167 + }, + { + "epoch": 2.3111563414323877, + "ewc_loss": 0.008501633070409298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501632692059502e-05, + "grad_norm": 4.192002296447754, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8800204992294312, + "num_tokens": 693244287.0, + "step": 18168 + }, + { + "epoch": 2.311283551710978, + "ewc_loss": 
0.008510525338351727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510525367455557e-05, + "grad_norm": 4.136162757873535, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8726793527603149, + "num_tokens": 693286164.0, + "step": 18169 + }, + { + "epoch": 2.3114107619895687, + "ewc_loss": 0.008455978706479073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455978968413547e-05, + "grad_norm": 4.121800899505615, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8886345028877258, + "num_tokens": 693330961.0, + "step": 18170 + }, + { + "epoch": 2.3115379722681593, + "ewc_loss": 0.00846127700060606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.461276593152434e-05, + "grad_norm": 4.1081976890563965, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8864699602127075, + "num_tokens": 693375169.0, + "step": 18171 + }, + { + "epoch": 2.31166518254675, + "ewc_loss": 0.008451336063444614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451336179859936e-05, + "grad_norm": 4.200497150421143, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.875147819519043, + "num_tokens": 693414127.0, + "step": 18172 + }, + { + "epoch": 2.3117923928253403, + "ewc_loss": 0.008481349796056747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481349505018443e-05, + "grad_norm": 4.20700740814209, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8764888048171997, + "num_tokens": 693450446.0, + "step": 18173 + }, + { + "epoch": 2.311919603103931, + "ewc_loss": 0.008456659503281116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.456659270450473e-05, + "grad_norm": 4.163608074188232, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8732311725616455, + "num_tokens": 693490973.0, + "step": 18174 + }, + { + "epoch": 2.3120468133825214, + "ewc_loss": 0.008414170704782009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.414170588366687e-05, + "grad_norm": 
4.173643589019775, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8830361366271973, + "num_tokens": 693526815.0, + "step": 18175 + }, + { + "epoch": 2.312174023661112, + "ewc_loss": 0.008435753174126148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43575326143764e-05, + "grad_norm": 4.169715404510498, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8750627040863037, + "num_tokens": 693566949.0, + "step": 18176 + }, + { + "epoch": 2.3123012339397024, + "ewc_loss": 0.008407087996602058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407088171225041e-05, + "grad_norm": 4.173332691192627, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8750938177108765, + "num_tokens": 693601116.0, + "step": 18177 + }, + { + "epoch": 2.312428444218293, + "ewc_loss": 0.008439469151198864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439469092991203e-05, + "grad_norm": 4.129973888397217, + "learning_rate": 1e-06, + "loss": 0.2719, + "mean_token_accuracy": 0.9080941081047058, + "num_tokens": 693640588.0, + "step": 18178 + }, + { + "epoch": 2.3125556544968835, + "ewc_loss": 0.008400024846196175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400024671573192e-05, + "grad_norm": 4.107512474060059, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8821264505386353, + "num_tokens": 693683352.0, + "step": 18179 + }, + { + "epoch": 2.312682864775474, + "ewc_loss": 0.008408429101109505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.408429130213335e-05, + "grad_norm": 4.1994099617004395, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8856125473976135, + "num_tokens": 693722579.0, + "step": 18180 + }, + { + "epoch": 2.3128100750540646, + "ewc_loss": 0.008444326929748058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.444326522294432e-05, + "grad_norm": 4.1132378578186035, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8837572336196899, + 
"num_tokens": 693766743.0, + "step": 18181 + }, + { + "epoch": 2.312937285332655, + "ewc_loss": 0.00836153794080019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.361537766177207e-05, + "grad_norm": 4.171962261199951, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8822364807128906, + "num_tokens": 693809699.0, + "step": 18182 + }, + { + "epoch": 2.313064495611245, + "ewc_loss": 0.00841640867292881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41640867292881e-05, + "grad_norm": 4.174094200134277, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.886674702167511, + "num_tokens": 693845508.0, + "step": 18183 + }, + { + "epoch": 2.313191705889836, + "ewc_loss": 0.008378521539270878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378521306440234e-05, + "grad_norm": 4.1205596923828125, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.881956934928894, + "num_tokens": 693883763.0, + "step": 18184 + }, + { + "epoch": 2.313318916168426, + "ewc_loss": 0.008360007777810097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360007632290944e-05, + "grad_norm": 4.173096656799316, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8863024711608887, + "num_tokens": 693917093.0, + "step": 18185 + }, + { + "epoch": 2.3134461264470167, + "ewc_loss": 0.008390925824642181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390925358980894e-05, + "grad_norm": 4.221446990966797, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8815338015556335, + "num_tokens": 693949219.0, + "step": 18186 + }, + { + "epoch": 2.3135733367256073, + "ewc_loss": 0.008407599292695522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407599671045318e-05, + "grad_norm": 4.152019500732422, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8744887113571167, + "num_tokens": 693992755.0, + "step": 18187 + }, + { + "epoch": 2.313700547004198, + "ewc_loss": 0.008363164961338043, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3631653978955e-05, + "grad_norm": 4.1883721351623535, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8670920133590698, + "num_tokens": 694030386.0, + "step": 18188 + }, + { + "epoch": 2.3138277572827883, + "ewc_loss": 0.008395555429160595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39555577840656e-05, + "grad_norm": 4.249576091766357, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8881900906562805, + "num_tokens": 694065019.0, + "step": 18189 + }, + { + "epoch": 2.313954967561379, + "ewc_loss": 0.008432422764599323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432423055637628e-05, + "grad_norm": 4.098021984100342, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8803026676177979, + "num_tokens": 694106645.0, + "step": 18190 + }, + { + "epoch": 2.3140821778399694, + "ewc_loss": 0.008346281945705414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346281538251787e-05, + "grad_norm": 4.147161483764648, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8852915167808533, + "num_tokens": 694149146.0, + "step": 18191 + }, + { + "epoch": 2.31420938811856, + "ewc_loss": 0.008409282192587852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409282600041479e-05, + "grad_norm": 4.141833782196045, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8651070594787598, + "num_tokens": 694193567.0, + "step": 18192 + }, + { + "epoch": 2.3143365983971504, + "ewc_loss": 0.008402409963309765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.402409730479121e-05, + "grad_norm": 4.122401237487793, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8891973495483398, + "num_tokens": 694232160.0, + "step": 18193 + }, + { + "epoch": 2.314463808675741, + "ewc_loss": 0.00838453508913517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.384534885408357e-05, + "grad_norm": 4.148303031921387, + "learning_rate": 
1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8821889758110046, + "num_tokens": 694275153.0, + "step": 18194 + }, + { + "epoch": 2.3145910189543315, + "ewc_loss": 0.008410044014453888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410043665207922e-05, + "grad_norm": 4.1603169441223145, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8869460821151733, + "num_tokens": 694313722.0, + "step": 18195 + }, + { + "epoch": 2.314718229232922, + "ewc_loss": 0.008397970348596573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.397969941142946e-05, + "grad_norm": 4.197229862213135, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8852775692939758, + "num_tokens": 694350544.0, + "step": 18196 + }, + { + "epoch": 2.3148454395115126, + "ewc_loss": 0.00839829072356224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.398290810873732e-05, + "grad_norm": 4.150119781494141, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8823591470718384, + "num_tokens": 694386552.0, + "step": 18197 + }, + { + "epoch": 2.314972649790103, + "ewc_loss": 0.008363447152078152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363446977455169e-05, + "grad_norm": 4.086795330047607, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8781788349151611, + "num_tokens": 694434368.0, + "step": 18198 + }, + { + "epoch": 2.3150998600686936, + "ewc_loss": 0.008343108929693699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343109220732003e-05, + "grad_norm": 4.127298831939697, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8810175657272339, + "num_tokens": 694479856.0, + "step": 18199 + }, + { + "epoch": 2.315227070347284, + "ewc_loss": 0.008393335156142712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.393335156142712e-05, + "grad_norm": 4.158263683319092, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8818991780281067, + "num_tokens": 694521107.0, + "step": 18200 + 
}, + { + "epoch": 2.3153542806258747, + "ewc_loss": 0.008364162407815456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.364162204088643e-05, + "grad_norm": 4.117656707763672, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8825790882110596, + "num_tokens": 694560886.0, + "step": 18201 + }, + { + "epoch": 2.315481490904465, + "ewc_loss": 0.008341520093381405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.341520151589066e-05, + "grad_norm": 4.149653434753418, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8840795755386353, + "num_tokens": 694598404.0, + "step": 18202 + }, + { + "epoch": 2.3156087011830557, + "ewc_loss": 0.008382722735404968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382723171962425e-05, + "grad_norm": 4.190851211547852, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8776199817657471, + "num_tokens": 694637162.0, + "step": 18203 + }, + { + "epoch": 2.3157359114616463, + "ewc_loss": 0.008368180133402348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.368179987883195e-05, + "grad_norm": 4.16109037399292, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8851505517959595, + "num_tokens": 694677435.0, + "step": 18204 + }, + { + "epoch": 2.315863121740237, + "ewc_loss": 0.008345844224095345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.345844253199175e-05, + "grad_norm": 4.124194622039795, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8894199132919312, + "num_tokens": 694713980.0, + "step": 18205 + }, + { + "epoch": 2.3159903320188273, + "ewc_loss": 0.008349464274942875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.349464042112231e-05, + "grad_norm": 4.158666610717773, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8783316612243652, + "num_tokens": 694753884.0, + "step": 18206 + }, + { + "epoch": 2.316117542297418, + "ewc_loss": 0.008370840921998024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.370840805582702e-05, + "grad_norm": 4.165465831756592, + "learning_rate": 1e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.892884373664856, + "num_tokens": 694785739.0, + "step": 18207 + }, + { + "epoch": 2.316244752576008, + "ewc_loss": 0.008368422277271748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.368422277271748e-05, + "grad_norm": 4.157131195068359, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8796117305755615, + "num_tokens": 694827629.0, + "step": 18208 + }, + { + "epoch": 2.316371962854599, + "ewc_loss": 0.008348067291080952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348067058250308e-05, + "grad_norm": 4.151638507843018, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8780913352966309, + "num_tokens": 694868241.0, + "step": 18209 + }, + { + "epoch": 2.316499173133189, + "ewc_loss": 0.00834972970187664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.349729614565149e-05, + "grad_norm": 4.20375919342041, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.872884213924408, + "num_tokens": 694905298.0, + "step": 18210 + }, + { + "epoch": 2.3166263834117795, + "ewc_loss": 0.008379990234971046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.379990322282538e-05, + "grad_norm": 4.185863494873047, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8735066652297974, + "num_tokens": 694938054.0, + "step": 18211 + }, + { + "epoch": 2.31675359369037, + "ewc_loss": 0.008367924019694328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.367923874175176e-05, + "grad_norm": 4.150272369384766, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8789701461791992, + "num_tokens": 694978761.0, + "step": 18212 + }, + { + "epoch": 2.3168808039689606, + "ewc_loss": 0.008360827341675758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360827632714063e-05, + "grad_norm": 4.164926528930664, + "learning_rate": 1e-06, + "loss": 0.3503, + 
"mean_token_accuracy": 0.8794165849685669, + "num_tokens": 695015073.0, + "step": 18213 + }, + { + "epoch": 2.317008014247551, + "ewc_loss": 0.008401978760957718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401978993788362e-05, + "grad_norm": 4.170021057128906, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8749446868896484, + "num_tokens": 695052288.0, + "step": 18214 + }, + { + "epoch": 2.3171352245261416, + "ewc_loss": 0.008406493812799454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.406493725487962e-05, + "grad_norm": 4.189882278442383, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8730934262275696, + "num_tokens": 695088976.0, + "step": 18215 + }, + { + "epoch": 2.317262434804732, + "ewc_loss": 0.00840806495398283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.408065332332626e-05, + "grad_norm": 4.147279262542725, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8839763402938843, + "num_tokens": 695128482.0, + "step": 18216 + }, + { + "epoch": 2.3173896450833227, + "ewc_loss": 0.008390105329453945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390105358557776e-05, + "grad_norm": 4.191515922546387, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8772531747817993, + "num_tokens": 695164963.0, + "step": 18217 + }, + { + "epoch": 2.317516855361913, + "ewc_loss": 0.00842897966504097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428980072494596e-05, + "grad_norm": 4.114255428314209, + "learning_rate": 1e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.9011454582214355, + "num_tokens": 695201515.0, + "step": 18218 + }, + { + "epoch": 2.3176440656405037, + "ewc_loss": 0.008386528119444847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.386528497794643e-05, + "grad_norm": 4.165594577789307, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8878791332244873, + "num_tokens": 695242057.0, + "step": 18219 + }, + { + "epoch": 
2.3177712759190943, + "ewc_loss": 0.0084401685744524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.440168312517926e-05, + "grad_norm": 4.230959415435791, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8822544813156128, + "num_tokens": 695277128.0, + "step": 18220 + }, + { + "epoch": 2.317898486197685, + "ewc_loss": 0.008446544408798218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446544234175235e-05, + "grad_norm": 4.254319190979004, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.895003616809845, + "num_tokens": 695311984.0, + "step": 18221 + }, + { + "epoch": 2.3180256964762753, + "ewc_loss": 0.008433488197624683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433488255832344e-05, + "grad_norm": 4.157839298248291, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8726242780685425, + "num_tokens": 695348457.0, + "step": 18222 + }, + { + "epoch": 2.318152906754866, + "ewc_loss": 0.008354858495295048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354858437087387e-05, + "grad_norm": 4.161102771759033, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8834495544433594, + "num_tokens": 695386518.0, + "step": 18223 + }, + { + "epoch": 2.3182801170334564, + "ewc_loss": 0.008391746319830418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391746086999774e-05, + "grad_norm": 4.1718854904174805, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8788713812828064, + "num_tokens": 695425365.0, + "step": 18224 + }, + { + "epoch": 2.318407327312047, + "ewc_loss": 0.008430219255387783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.430219168076292e-05, + "grad_norm": 4.189321994781494, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8832240104675293, + "num_tokens": 695459559.0, + "step": 18225 + }, + { + "epoch": 2.3185345375906374, + "ewc_loss": 0.008399137295782566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.399137004744262e-05, + "grad_norm": 4.239080429077148, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8789181709289551, + "num_tokens": 695491386.0, + "step": 18226 + }, + { + "epoch": 2.318661747869228, + "ewc_loss": 0.008440500125288963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.440500096185133e-05, + "grad_norm": 4.151571750640869, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8655223846435547, + "num_tokens": 695534660.0, + "step": 18227 + }, + { + "epoch": 2.3187889581478185, + "ewc_loss": 0.008396300487220287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396300836466253e-05, + "grad_norm": 4.15717077255249, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8730593323707581, + "num_tokens": 695578127.0, + "step": 18228 + }, + { + "epoch": 2.318916168426409, + "ewc_loss": 0.008421485312283039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421485108556226e-05, + "grad_norm": 4.207714557647705, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8708990216255188, + "num_tokens": 695614181.0, + "step": 18229 + }, + { + "epoch": 2.3190433787049995, + "ewc_loss": 0.008458035998046398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.458035881631076e-05, + "grad_norm": 4.155137062072754, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8799802660942078, + "num_tokens": 695652753.0, + "step": 18230 + }, + { + "epoch": 2.3191705889835896, + "ewc_loss": 0.00841617863625288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.416178752668202e-05, + "grad_norm": 4.162171363830566, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8884875774383545, + "num_tokens": 695692449.0, + "step": 18231 + }, + { + "epoch": 2.3192977992621806, + "ewc_loss": 0.008449269458651543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.449269080301747e-05, + "grad_norm": 4.170475006103516, + "learning_rate": 1e-06, + "loss": 0.3464, + 
"mean_token_accuracy": 0.8812788724899292, + "num_tokens": 695730812.0, + "step": 18232 + }, + { + "epoch": 2.3194250095407707, + "ewc_loss": 0.008462408557534218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4624087321572e-05, + "grad_norm": 4.166329860687256, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8955373764038086, + "num_tokens": 695767577.0, + "step": 18233 + }, + { + "epoch": 2.319552219819361, + "ewc_loss": 0.00846205372363329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462053665425628e-05, + "grad_norm": 4.101489067077637, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8806300759315491, + "num_tokens": 695812372.0, + "step": 18234 + }, + { + "epoch": 2.3196794300979517, + "ewc_loss": 0.008402806706726551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.402806997764856e-05, + "grad_norm": 4.144372463226318, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8954148292541504, + "num_tokens": 695847470.0, + "step": 18235 + }, + { + "epoch": 2.3198066403765423, + "ewc_loss": 0.008455517701804638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455517672700807e-05, + "grad_norm": 4.120116710662842, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8948348164558411, + "num_tokens": 695888082.0, + "step": 18236 + }, + { + "epoch": 2.319933850655133, + "ewc_loss": 0.008410140872001648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410140435444191e-05, + "grad_norm": 4.14539098739624, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8881655931472778, + "num_tokens": 695928904.0, + "step": 18237 + }, + { + "epoch": 2.3200610609337233, + "ewc_loss": 0.008450090885162354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45009126351215e-05, + "grad_norm": 4.1928534507751465, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8707451820373535, + "num_tokens": 695967549.0, + "step": 18238 + }, + { + "epoch": 
2.320188271212314, + "ewc_loss": 0.008448388427495956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44838868943043e-05, + "grad_norm": 4.172092437744141, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8752121329307556, + "num_tokens": 696005611.0, + "step": 18239 + }, + { + "epoch": 2.3203154814909044, + "ewc_loss": 0.008403551764786243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.403551328228787e-05, + "grad_norm": 4.227575302124023, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8830405473709106, + "num_tokens": 696034864.0, + "step": 18240 + }, + { + "epoch": 2.320442691769495, + "ewc_loss": 0.00845932774245739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.459328091703355e-05, + "grad_norm": 4.214103698730469, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8787031173706055, + "num_tokens": 696072137.0, + "step": 18241 + }, + { + "epoch": 2.3205699020480854, + "ewc_loss": 0.008401024155318737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401024388149381e-05, + "grad_norm": 4.126524448394775, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8843176364898682, + "num_tokens": 696115744.0, + "step": 18242 + }, + { + "epoch": 2.320697112326676, + "ewc_loss": 0.008378259837627411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378260099561885e-05, + "grad_norm": 4.205211639404297, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8769955635070801, + "num_tokens": 696151293.0, + "step": 18243 + }, + { + "epoch": 2.3208243226052665, + "ewc_loss": 0.00844452902674675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.444528793916106e-05, + "grad_norm": 4.098248481750488, + "learning_rate": 1e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.8966542482376099, + "num_tokens": 696190742.0, + "step": 18244 + }, + { + "epoch": 2.320951532883857, + "ewc_loss": 0.008347335271537304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.347335096914321e-05, + "grad_norm": 4.167048931121826, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8699981570243835, + "num_tokens": 696232807.0, + "step": 18245 + }, + { + "epoch": 2.3210787431624476, + "ewc_loss": 0.008428362198174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428362343693152e-05, + "grad_norm": 4.167099475860596, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8782322406768799, + "num_tokens": 696269416.0, + "step": 18246 + }, + { + "epoch": 2.321205953441038, + "ewc_loss": 0.008401818573474884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40181892272085e-05, + "grad_norm": 4.181178092956543, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8839666843414307, + "num_tokens": 696305144.0, + "step": 18247 + }, + { + "epoch": 2.3213331637196286, + "ewc_loss": 0.008386019617319107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38601918076165e-05, + "grad_norm": 4.15244197845459, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.872749924659729, + "num_tokens": 696344854.0, + "step": 18248 + }, + { + "epoch": 2.321460373998219, + "ewc_loss": 0.008370096795260906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.370097202714533e-05, + "grad_norm": 4.170245170593262, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.899154007434845, + "num_tokens": 696380154.0, + "step": 18249 + }, + { + "epoch": 2.3215875842768097, + "ewc_loss": 0.008388604037463665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388603600906208e-05, + "grad_norm": 4.198902606964111, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8701192140579224, + "num_tokens": 696416458.0, + "step": 18250 + }, + { + "epoch": 2.3217147945554, + "ewc_loss": 0.008391140028834343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391139999730512e-05, + "grad_norm": 4.1444268226623535, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 
0.8842500448226929, + "num_tokens": 696453232.0, + "step": 18251 + }, + { + "epoch": 2.3218420048339907, + "ewc_loss": 0.008346203714609146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346203685505316e-05, + "grad_norm": 4.193273544311523, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8643271923065186, + "num_tokens": 696487930.0, + "step": 18252 + }, + { + "epoch": 2.3219692151125813, + "ewc_loss": 0.008412491530179977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.412491297349334e-05, + "grad_norm": 4.216858386993408, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8730846643447876, + "num_tokens": 696523824.0, + "step": 18253 + }, + { + "epoch": 2.322096425391172, + "ewc_loss": 0.008420508354902267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.420507947448641e-05, + "grad_norm": 4.101172924041748, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8907687664031982, + "num_tokens": 696567007.0, + "step": 18254 + }, + { + "epoch": 2.3222236356697623, + "ewc_loss": 0.008368953131139278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.368953422177583e-05, + "grad_norm": 4.108585834503174, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.896736204624176, + "num_tokens": 696610055.0, + "step": 18255 + }, + { + "epoch": 2.3223508459483524, + "ewc_loss": 0.008409093134105206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40909342514351e-05, + "grad_norm": 4.2609992027282715, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8689791560173035, + "num_tokens": 696644058.0, + "step": 18256 + }, + { + "epoch": 2.3224780562269434, + "ewc_loss": 0.00848428625613451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.484286081511527e-05, + "grad_norm": 4.177253723144531, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8878077864646912, + "num_tokens": 696686381.0, + "step": 18257 + }, + { + "epoch": 2.3226052665055335, + "ewc_loss": 
0.00837433896958828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.374339086003602e-05, + "grad_norm": 4.223418235778809, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8742389678955078, + "num_tokens": 696727567.0, + "step": 18258 + }, + { + "epoch": 2.322732476784124, + "ewc_loss": 0.008431497029960155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.431496826233342e-05, + "grad_norm": 4.178862571716309, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8719780445098877, + "num_tokens": 696767266.0, + "step": 18259 + }, + { + "epoch": 2.3228596870627145, + "ewc_loss": 0.008390619419515133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390619768761098e-05, + "grad_norm": 4.0975422859191895, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8944013118743896, + "num_tokens": 696806059.0, + "step": 18260 + }, + { + "epoch": 2.322986897341305, + "ewc_loss": 0.008355900645256042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3559010818135e-05, + "grad_norm": 4.1799187660217285, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8800180554389954, + "num_tokens": 696844232.0, + "step": 18261 + }, + { + "epoch": 2.3231141076198956, + "ewc_loss": 0.00844426080584526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.444261038675904e-05, + "grad_norm": 4.13884162902832, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8827670812606812, + "num_tokens": 696886223.0, + "step": 18262 + }, + { + "epoch": 2.323241317898486, + "ewc_loss": 0.008398431353271008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.398431236855686e-05, + "grad_norm": 4.194498538970947, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8683838844299316, + "num_tokens": 696922625.0, + "step": 18263 + }, + { + "epoch": 2.3233685281770766, + "ewc_loss": 0.00841661635786295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.416616765316576e-05, + "grad_norm": 
4.1506171226501465, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8709803819656372, + "num_tokens": 696965160.0, + "step": 18264 + }, + { + "epoch": 2.323495738455667, + "ewc_loss": 0.00839082058519125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39082058519125e-05, + "grad_norm": 4.206172466278076, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8865844011306763, + "num_tokens": 697000854.0, + "step": 18265 + }, + { + "epoch": 2.3236229487342577, + "ewc_loss": 0.008424750529229641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.424750558333471e-05, + "grad_norm": 4.116405487060547, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8956760168075562, + "num_tokens": 697041264.0, + "step": 18266 + }, + { + "epoch": 2.323750159012848, + "ewc_loss": 0.008340148255228996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.340148633578792e-05, + "grad_norm": 4.193458080291748, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8792139887809753, + "num_tokens": 697080079.0, + "step": 18267 + }, + { + "epoch": 2.3238773692914387, + "ewc_loss": 0.008414254523813725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41425426187925e-05, + "grad_norm": 4.123743534088135, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8784518241882324, + "num_tokens": 697123755.0, + "step": 18268 + }, + { + "epoch": 2.3240045795700293, + "ewc_loss": 0.008354291319847107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35429091239348e-05, + "grad_norm": 4.248006343841553, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8550711870193481, + "num_tokens": 697162012.0, + "step": 18269 + }, + { + "epoch": 2.32413178984862, + "ewc_loss": 0.00844867154955864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448671724181622e-05, + "grad_norm": 4.146730422973633, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8903710246086121, + "num_tokens": 
697200188.0, + "step": 18270 + }, + { + "epoch": 2.3242590001272103, + "ewc_loss": 0.008344229310750961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344229718204588e-05, + "grad_norm": 4.203341484069824, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8846191167831421, + "num_tokens": 697240536.0, + "step": 18271 + }, + { + "epoch": 2.324386210405801, + "ewc_loss": 0.008382920175790787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38292035041377e-05, + "grad_norm": 4.128706455230713, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8911992311477661, + "num_tokens": 697279459.0, + "step": 18272 + }, + { + "epoch": 2.3245134206843914, + "ewc_loss": 0.008351102471351624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.351102587766945e-05, + "grad_norm": 4.3963751792907715, + "learning_rate": 1e-06, + "loss": 0.2902, + "mean_token_accuracy": 0.8976198434829712, + "num_tokens": 697312594.0, + "step": 18273 + }, + { + "epoch": 2.324640630962982, + "ewc_loss": 0.008517962880432606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517962851328775e-05, + "grad_norm": 4.126265525817871, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8688902258872986, + "num_tokens": 697357834.0, + "step": 18274 + }, + { + "epoch": 2.3247678412415724, + "ewc_loss": 0.008270081132650375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.270081161754206e-05, + "grad_norm": 4.150287628173828, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8811166286468506, + "num_tokens": 697401522.0, + "step": 18275 + }, + { + "epoch": 2.324895051520163, + "ewc_loss": 0.00839805044233799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.398049976676702e-05, + "grad_norm": 4.176293849945068, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8794386982917786, + "num_tokens": 697440415.0, + "step": 18276 + }, + { + "epoch": 2.3250222617987535, + "ewc_loss": 0.008374499157071114, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.374499157071114e-05, + "grad_norm": 4.227888107299805, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8671519160270691, + "num_tokens": 697477609.0, + "step": 18277 + }, + { + "epoch": 2.325149472077344, + "ewc_loss": 0.008407100103795528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407099812757224e-05, + "grad_norm": 4.6208977699279785, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8773833513259888, + "num_tokens": 697515566.0, + "step": 18278 + }, + { + "epoch": 2.3252766823559345, + "ewc_loss": 0.008589995093643665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589994831709191e-05, + "grad_norm": 4.138022422790527, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8855953216552734, + "num_tokens": 697549537.0, + "step": 18279 + }, + { + "epoch": 2.325403892634525, + "ewc_loss": 0.008238949812948704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.23895024950616e-05, + "grad_norm": 4.17677640914917, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8859919309616089, + "num_tokens": 697582618.0, + "step": 18280 + }, + { + "epoch": 2.325531102913115, + "ewc_loss": 0.008443017490208149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44301757751964e-05, + "grad_norm": 4.121993541717529, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8938924074172974, + "num_tokens": 697622234.0, + "step": 18281 + }, + { + "epoch": 2.325658313191706, + "ewc_loss": 0.008386729285120964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.386729314224795e-05, + "grad_norm": 4.216770648956299, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8727588653564453, + "num_tokens": 697657504.0, + "step": 18282 + }, + { + "epoch": 2.325785523470296, + "ewc_loss": 0.008457668125629425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45766844577156e-05, + "grad_norm": 4.163233280181885, + "learning_rate": 
1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8692293763160706, + "num_tokens": 697699496.0, + "step": 18283 + }, + { + "epoch": 2.3259127337488867, + "ewc_loss": 0.008410797454416752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410797454416752e-05, + "grad_norm": 4.136473655700684, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8918141722679138, + "num_tokens": 697739969.0, + "step": 18284 + }, + { + "epoch": 2.3260399440274773, + "ewc_loss": 0.008408262394368649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40826251078397e-05, + "grad_norm": 4.202542781829834, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8721402287483215, + "num_tokens": 697774346.0, + "step": 18285 + }, + { + "epoch": 2.326167154306068, + "ewc_loss": 0.00844658724963665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446587162325159e-05, + "grad_norm": 4.1736674308776855, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8675010204315186, + "num_tokens": 697813013.0, + "step": 18286 + }, + { + "epoch": 2.3262943645846583, + "ewc_loss": 0.008432179689407349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432180038653314e-05, + "grad_norm": 4.160708904266357, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8761303424835205, + "num_tokens": 697854163.0, + "step": 18287 + }, + { + "epoch": 2.326421574863249, + "ewc_loss": 0.008439081721007824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439081284450367e-05, + "grad_norm": 4.127171516418457, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8876539468765259, + "num_tokens": 697898049.0, + "step": 18288 + }, + { + "epoch": 2.3265487851418394, + "ewc_loss": 0.008432578295469284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432578761130571e-05, + "grad_norm": 4.170067310333252, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8837414979934692, + "num_tokens": 697936086.0, + "step": 18289 + }, 
+ { + "epoch": 2.32667599542043, + "ewc_loss": 0.008444823324680328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44482347019948e-05, + "grad_norm": 4.179177284240723, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8585548400878906, + "num_tokens": 697981529.0, + "step": 18290 + }, + { + "epoch": 2.3268032056990204, + "ewc_loss": 0.008422181941568851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.422182145295665e-05, + "grad_norm": 4.159433364868164, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8969617486000061, + "num_tokens": 698016047.0, + "step": 18291 + }, + { + "epoch": 2.326930415977611, + "ewc_loss": 0.008428326807916164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428326691500843e-05, + "grad_norm": 4.1832966804504395, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8933340311050415, + "num_tokens": 698052408.0, + "step": 18292 + }, + { + "epoch": 2.3270576262562015, + "ewc_loss": 0.008437185548245907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437185897491872e-05, + "grad_norm": 4.208887100219727, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.875207781791687, + "num_tokens": 698089751.0, + "step": 18293 + }, + { + "epoch": 2.327184836534792, + "ewc_loss": 0.008436190895736217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.436190546490252e-05, + "grad_norm": 4.212951183319092, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8779901266098022, + "num_tokens": 698121274.0, + "step": 18294 + }, + { + "epoch": 2.3273120468133826, + "ewc_loss": 0.008427996188402176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.427996363025159e-05, + "grad_norm": 4.167268753051758, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8897571563720703, + "num_tokens": 698155880.0, + "step": 18295 + }, + { + "epoch": 2.327439257091973, + "ewc_loss": 0.00839992891997099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.399928628932685e-05, + "grad_norm": 4.185672760009766, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.883316695690155, + "num_tokens": 698191448.0, + "step": 18296 + }, + { + "epoch": 2.3275664673705636, + "ewc_loss": 0.008438434451818466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438434451818466e-05, + "grad_norm": 4.172636032104492, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8755326271057129, + "num_tokens": 698228437.0, + "step": 18297 + }, + { + "epoch": 2.327693677649154, + "ewc_loss": 0.008412448689341545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41244836919941e-05, + "grad_norm": 4.150002479553223, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8746241331100464, + "num_tokens": 698268714.0, + "step": 18298 + }, + { + "epoch": 2.3278208879277447, + "ewc_loss": 0.008419446647167206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.419446385232732e-05, + "grad_norm": 4.150996685028076, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8842630982398987, + "num_tokens": 698305575.0, + "step": 18299 + }, + { + "epoch": 2.327948098206335, + "ewc_loss": 0.008444094099104404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44409441924654e-05, + "grad_norm": 4.160150051116943, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8764481544494629, + "num_tokens": 698347701.0, + "step": 18300 + }, + { + "epoch": 2.3280753084849257, + "ewc_loss": 0.008413702249526978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413702016696334e-05, + "grad_norm": 4.11539888381958, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8963825702667236, + "num_tokens": 698385906.0, + "step": 18301 + }, + { + "epoch": 2.3282025187635162, + "ewc_loss": 0.008408276364207268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.408276335103437e-05, + "grad_norm": 4.269321441650391, + "learning_rate": 1e-06, + "loss": 0.35, + 
"mean_token_accuracy": 0.8798825740814209, + "num_tokens": 698415511.0, + "step": 18302 + }, + { + "epoch": 2.3283297290421068, + "ewc_loss": 0.00850772950798273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.507729944540188e-05, + "grad_norm": 4.113006591796875, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8761122226715088, + "num_tokens": 698460528.0, + "step": 18303 + }, + { + "epoch": 2.3284569393206973, + "ewc_loss": 0.008371795527637005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371795411221683e-05, + "grad_norm": 4.170889854431152, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.867645263671875, + "num_tokens": 698500800.0, + "step": 18304 + }, + { + "epoch": 2.328584149599288, + "ewc_loss": 0.00847585964947939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.475859794998541e-05, + "grad_norm": 4.165683269500732, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8800806999206543, + "num_tokens": 698539373.0, + "step": 18305 + }, + { + "epoch": 2.328711359877878, + "ewc_loss": 0.008453094400465488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.453094051219523e-05, + "grad_norm": 4.140291690826416, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.9008752703666687, + "num_tokens": 698577067.0, + "step": 18306 + }, + { + "epoch": 2.328838570156469, + "ewc_loss": 0.008437777869403362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437778160441667e-05, + "grad_norm": 4.171567916870117, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8803126811981201, + "num_tokens": 698613515.0, + "step": 18307 + }, + { + "epoch": 2.328965780435059, + "ewc_loss": 0.008469066582620144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469066233374178e-05, + "grad_norm": 4.195247650146484, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8822769522666931, + "num_tokens": 698647278.0, + "step": 18308 + }, + { + "epoch": 
2.3290929907136495, + "ewc_loss": 0.008483782410621643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483782585244626e-05, + "grad_norm": 4.118438243865967, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8877576589584351, + "num_tokens": 698688738.0, + "step": 18309 + }, + { + "epoch": 2.32922020099224, + "ewc_loss": 0.00843413919210434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.434138726443052e-05, + "grad_norm": 4.143246173858643, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.885492205619812, + "num_tokens": 698725003.0, + "step": 18310 + }, + { + "epoch": 2.3293474112708306, + "ewc_loss": 0.008481004275381565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48100462462753e-05, + "grad_norm": 4.1743621826171875, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8644098043441772, + "num_tokens": 698768205.0, + "step": 18311 + }, + { + "epoch": 2.329474621549421, + "ewc_loss": 0.008465386927127838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.465386781608686e-05, + "grad_norm": 4.201481342315674, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8638798594474792, + "num_tokens": 698805352.0, + "step": 18312 + }, + { + "epoch": 2.3296018318280116, + "ewc_loss": 0.008485749363899231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48574927658774e-05, + "grad_norm": 4.223485946655273, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8915195465087891, + "num_tokens": 698836610.0, + "step": 18313 + }, + { + "epoch": 2.329729042106602, + "ewc_loss": 0.008482365868985653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482365956297144e-05, + "grad_norm": 4.1387553215026855, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8890972137451172, + "num_tokens": 698877514.0, + "step": 18314 + }, + { + "epoch": 2.3298562523851927, + "ewc_loss": 0.00842086412012577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.420864469371736e-05, + "grad_norm": 4.162539958953857, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8841542601585388, + "num_tokens": 698916699.0, + "step": 18315 + }, + { + "epoch": 2.329983462663783, + "ewc_loss": 0.008477585390210152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477585652144626e-05, + "grad_norm": 4.180377006530762, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8805316090583801, + "num_tokens": 698953568.0, + "step": 18316 + }, + { + "epoch": 2.3301106729423737, + "ewc_loss": 0.008471297100186348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471297041978687e-05, + "grad_norm": 4.165045738220215, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.877063512802124, + "num_tokens": 698990562.0, + "step": 18317 + }, + { + "epoch": 2.3302378832209643, + "ewc_loss": 0.008455830626189709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455830538878217e-05, + "grad_norm": 4.195642948150635, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8821601867675781, + "num_tokens": 699024095.0, + "step": 18318 + }, + { + "epoch": 2.330365093499555, + "ewc_loss": 0.008475881069898605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.475880895275623e-05, + "grad_norm": 4.164238452911377, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8632451295852661, + "num_tokens": 699062327.0, + "step": 18319 + }, + { + "epoch": 2.3304923037781453, + "ewc_loss": 0.008449722081422806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44972237246111e-05, + "grad_norm": 4.160435199737549, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8863407969474792, + "num_tokens": 699101591.0, + "step": 18320 + }, + { + "epoch": 2.330619514056736, + "ewc_loss": 0.00846913643181324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469136810163036e-05, + "grad_norm": 4.262936592102051, + "learning_rate": 1e-06, + "loss": 0.3602, + 
"mean_token_accuracy": 0.8743749856948853, + "num_tokens": 699134836.0, + "step": 18321 + }, + { + "epoch": 2.3307467243353264, + "ewc_loss": 0.008526920340955257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.526920282747597e-05, + "grad_norm": 4.211633205413818, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8718359470367432, + "num_tokens": 699171218.0, + "step": 18322 + }, + { + "epoch": 2.330873934613917, + "ewc_loss": 0.008460022509098053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46002294565551e-05, + "grad_norm": 4.223031044006348, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8985462784767151, + "num_tokens": 699200891.0, + "step": 18323 + }, + { + "epoch": 2.3310011448925074, + "ewc_loss": 0.008509275503456593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50927535793744e-05, + "grad_norm": 4.151292324066162, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8953995108604431, + "num_tokens": 699238080.0, + "step": 18324 + }, + { + "epoch": 2.331128355171098, + "ewc_loss": 0.008468424901366234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.468425221508369e-05, + "grad_norm": 4.14570426940918, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8695436120033264, + "num_tokens": 699276051.0, + "step": 18325 + }, + { + "epoch": 2.3312555654496885, + "ewc_loss": 0.008502746932208538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502746641170233e-05, + "grad_norm": 4.133009910583496, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8879348635673523, + "num_tokens": 699315721.0, + "step": 18326 + }, + { + "epoch": 2.331382775728279, + "ewc_loss": 0.008489605039358139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489604806527495e-05, + "grad_norm": 4.1789774894714355, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8933213353157043, + "num_tokens": 699350342.0, + "step": 18327 + }, + { + "epoch": 
2.3315099860068695, + "ewc_loss": 0.008506795391440392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.506795711582527e-05, + "grad_norm": 4.117715358734131, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8862829804420471, + "num_tokens": 699390301.0, + "step": 18328 + }, + { + "epoch": 2.3316371962854596, + "ewc_loss": 0.008468137122690678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.468137093586847e-05, + "grad_norm": 4.219141960144043, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8659523725509644, + "num_tokens": 699427135.0, + "step": 18329 + }, + { + "epoch": 2.3317644065640506, + "ewc_loss": 0.008540534414350986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540534327039495e-05, + "grad_norm": 4.1444878578186035, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8877412676811218, + "num_tokens": 699466461.0, + "step": 18330 + }, + { + "epoch": 2.3318916168426407, + "ewc_loss": 0.008458537049591541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.458537195110694e-05, + "grad_norm": 4.171966552734375, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8747397661209106, + "num_tokens": 699502446.0, + "step": 18331 + }, + { + "epoch": 2.332018827121231, + "ewc_loss": 0.008511299267411232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511299529345706e-05, + "grad_norm": 4.117430686950684, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8813656568527222, + "num_tokens": 699546357.0, + "step": 18332 + }, + { + "epoch": 2.3321460373998217, + "ewc_loss": 0.00845448486506939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454484486719593e-05, + "grad_norm": 4.19504976272583, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8810650110244751, + "num_tokens": 699581879.0, + "step": 18333 + }, + { + "epoch": 2.3322732476784123, + "ewc_loss": 0.008490548469126225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.490548498230055e-05, + "grad_norm": 4.181994915008545, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8939733505249023, + "num_tokens": 699616287.0, + "step": 18334 + }, + { + "epoch": 2.332400457957003, + "ewc_loss": 0.0084675382822752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4675382822752e-05, + "grad_norm": 4.175817966461182, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8798514604568481, + "num_tokens": 699655409.0, + "step": 18335 + }, + { + "epoch": 2.3325276682355933, + "ewc_loss": 0.008439990691840649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43999077915214e-05, + "grad_norm": 4.162790298461914, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8571810722351074, + "num_tokens": 699701420.0, + "step": 18336 + }, + { + "epoch": 2.332654878514184, + "ewc_loss": 0.008428219705820084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428219734923914e-05, + "grad_norm": 4.215654373168945, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8848621249198914, + "num_tokens": 699736151.0, + "step": 18337 + }, + { + "epoch": 2.3327820887927744, + "ewc_loss": 0.008461127988994122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.461128163617104e-05, + "grad_norm": 4.252574443817139, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8761372566223145, + "num_tokens": 699770111.0, + "step": 18338 + }, + { + "epoch": 2.332909299071365, + "ewc_loss": 0.008434333838522434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.434333722107112e-05, + "grad_norm": 4.142141819000244, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8785006999969482, + "num_tokens": 699807442.0, + "step": 18339 + }, + { + "epoch": 2.3330365093499554, + "ewc_loss": 0.008362824097275734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362824155483395e-05, + "grad_norm": 4.185226917266846, + "learning_rate": 1e-06, + "loss": 0.3426, + 
"mean_token_accuracy": 0.8792794942855835, + "num_tokens": 699849364.0, + "step": 18340 + }, + { + "epoch": 2.333163719628546, + "ewc_loss": 0.008433208800852299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43320885905996e-05, + "grad_norm": 4.134028434753418, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8953708410263062, + "num_tokens": 699886234.0, + "step": 18341 + }, + { + "epoch": 2.3332909299071365, + "ewc_loss": 0.00836186669766903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.361866639461368e-05, + "grad_norm": 4.201215744018555, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8649272322654724, + "num_tokens": 699925660.0, + "step": 18342 + }, + { + "epoch": 2.333418140185727, + "ewc_loss": 0.008437523618340492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43752350192517e-05, + "grad_norm": 4.2274980545043945, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8561310768127441, + "num_tokens": 699967894.0, + "step": 18343 + }, + { + "epoch": 2.3335453504643175, + "ewc_loss": 0.008422700688242912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.422700921073556e-05, + "grad_norm": 4.251176834106445, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8760862350463867, + "num_tokens": 699999211.0, + "step": 18344 + }, + { + "epoch": 2.333672560742908, + "ewc_loss": 0.008439285680651665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439285738859326e-05, + "grad_norm": 4.137064456939697, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8890771865844727, + "num_tokens": 700037639.0, + "step": 18345 + }, + { + "epoch": 2.3337997710214986, + "ewc_loss": 0.008366787806153297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.366788097191602e-05, + "grad_norm": 4.178402423858643, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8760371208190918, + "num_tokens": 700075677.0, + "step": 18346 + }, + { + "epoch": 
2.333926981300089, + "ewc_loss": 0.008411879651248455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.411879389313981e-05, + "grad_norm": 4.108717441558838, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8870634436607361, + "num_tokens": 700115697.0, + "step": 18347 + }, + { + "epoch": 2.3340541915786797, + "ewc_loss": 0.008364888839423656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3648890722543e-05, + "grad_norm": 4.218568801879883, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8656750917434692, + "num_tokens": 700152553.0, + "step": 18348 + }, + { + "epoch": 2.33418140185727, + "ewc_loss": 0.008432707749307156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432707545580342e-05, + "grad_norm": 4.134768486022949, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8880804777145386, + "num_tokens": 700190958.0, + "step": 18349 + }, + { + "epoch": 2.3343086121358607, + "ewc_loss": 0.008364925161004066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36492472444661e-05, + "grad_norm": 4.188622951507568, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.879991352558136, + "num_tokens": 700224244.0, + "step": 18350 + }, + { + "epoch": 2.3344358224144512, + "ewc_loss": 0.00843127816915512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.431278547504917e-05, + "grad_norm": 4.147758483886719, + "learning_rate": 1e-06, + "loss": 0.2712, + "mean_token_accuracy": 0.905454158782959, + "num_tokens": 700259736.0, + "step": 18351 + }, + { + "epoch": 2.3345630326930418, + "ewc_loss": 0.008404308930039406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.404308755416423e-05, + "grad_norm": 4.167434215545654, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8810800313949585, + "num_tokens": 700293624.0, + "step": 18352 + }, + { + "epoch": 2.3346902429716323, + "ewc_loss": 0.008441494777798653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.441494719590992e-05, 
+ "grad_norm": 4.173434257507324, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.885358452796936, + "num_tokens": 700328052.0, + "step": 18353 + }, + { + "epoch": 2.3348174532502224, + "ewc_loss": 0.008443173952400684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443174010608345e-05, + "grad_norm": 4.153635025024414, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8749150037765503, + "num_tokens": 700364652.0, + "step": 18354 + }, + { + "epoch": 2.3349446635288134, + "ewc_loss": 0.008421323262155056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421323582297191e-05, + "grad_norm": 4.163371562957764, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8918584585189819, + "num_tokens": 700403698.0, + "step": 18355 + }, + { + "epoch": 2.3350718738074034, + "ewc_loss": 0.008449614979326725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.449615415884182e-05, + "grad_norm": 4.18716287612915, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8945668935775757, + "num_tokens": 700439391.0, + "step": 18356 + }, + { + "epoch": 2.335199084085994, + "ewc_loss": 0.008442588150501251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442588296020404e-05, + "grad_norm": 4.187962055206299, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8839214444160461, + "num_tokens": 700475095.0, + "step": 18357 + }, + { + "epoch": 2.3353262943645845, + "ewc_loss": 0.008441979065537453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.441979298368096e-05, + "grad_norm": 4.127886772155762, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8854405879974365, + "num_tokens": 700514167.0, + "step": 18358 + }, + { + "epoch": 2.335453504643175, + "ewc_loss": 0.008418790064752102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418790093855932e-05, + "grad_norm": 4.1721038818359375, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 
0.8884906768798828, + "num_tokens": 700548333.0, + "step": 18359 + }, + { + "epoch": 2.3355807149217656, + "ewc_loss": 0.008460897952318192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460897515760735e-05, + "grad_norm": 4.125111103057861, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8854473233222961, + "num_tokens": 700594261.0, + "step": 18360 + }, + { + "epoch": 2.335707925200356, + "ewc_loss": 0.00841008871793747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41008877614513e-05, + "grad_norm": 4.171994686126709, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8856675624847412, + "num_tokens": 700632513.0, + "step": 18361 + }, + { + "epoch": 2.3358351354789466, + "ewc_loss": 0.008451510220766068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451510075246915e-05, + "grad_norm": 4.1605448722839355, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8877583146095276, + "num_tokens": 700672430.0, + "step": 18362 + }, + { + "epoch": 2.335962345757537, + "ewc_loss": 0.008418935351073742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418935613008216e-05, + "grad_norm": 4.160340785980225, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8863846063613892, + "num_tokens": 700714321.0, + "step": 18363 + }, + { + "epoch": 2.3360895560361277, + "ewc_loss": 0.00840998999774456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409989823121578e-05, + "grad_norm": 4.231682300567627, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8816924691200256, + "num_tokens": 700751714.0, + "step": 18364 + }, + { + "epoch": 2.336216766314718, + "ewc_loss": 0.008451547473669052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451547182630748e-05, + "grad_norm": 4.119679927825928, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8787214159965515, + "num_tokens": 700794087.0, + "step": 18365 + }, + { + "epoch": 2.3363439765933087, + "ewc_loss": 
0.008351607248187065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35160753922537e-05, + "grad_norm": 4.181661605834961, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8769400119781494, + "num_tokens": 700832031.0, + "step": 18366 + }, + { + "epoch": 2.3364711868718993, + "ewc_loss": 0.008421535603702068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421535312663764e-05, + "grad_norm": 4.232039451599121, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8839609622955322, + "num_tokens": 700867175.0, + "step": 18367 + }, + { + "epoch": 2.33659839715049, + "ewc_loss": 0.008419114165008068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.419113873969764e-05, + "grad_norm": 4.154597759246826, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8869968056678772, + "num_tokens": 700906762.0, + "step": 18368 + }, + { + "epoch": 2.3367256074290803, + "ewc_loss": 0.008359083905816078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.359084313269705e-05, + "grad_norm": 4.157737731933594, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8843021392822266, + "num_tokens": 700944316.0, + "step": 18369 + }, + { + "epoch": 2.336852817707671, + "ewc_loss": 0.008385147899389267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.385148248635232e-05, + "grad_norm": 4.192078590393066, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8775316476821899, + "num_tokens": 700979369.0, + "step": 18370 + }, + { + "epoch": 2.3369800279862614, + "ewc_loss": 0.008394058793783188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.394058386329561e-05, + "grad_norm": 4.191446304321289, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8783623576164246, + "num_tokens": 701016213.0, + "step": 18371 + }, + { + "epoch": 2.337107238264852, + "ewc_loss": 0.008382580243051052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382580563193187e-05, + "grad_norm": 
4.134502410888672, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8800808787345886, + "num_tokens": 701055270.0, + "step": 18372 + }, + { + "epoch": 2.3372344485434424, + "ewc_loss": 0.0083580007776618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358000923180953e-05, + "grad_norm": 4.195959091186523, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8802403211593628, + "num_tokens": 701091058.0, + "step": 18373 + }, + { + "epoch": 2.337361658822033, + "ewc_loss": 0.008426256477832794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.426256681559607e-05, + "grad_norm": 4.16842794418335, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8894554376602173, + "num_tokens": 701129446.0, + "step": 18374 + }, + { + "epoch": 2.3374888691006235, + "ewc_loss": 0.008378937840461731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378937491215765e-05, + "grad_norm": 4.160994052886963, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8856303095817566, + "num_tokens": 701168726.0, + "step": 18375 + }, + { + "epoch": 2.337616079379214, + "ewc_loss": 0.0083992388099432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39923886815086e-05, + "grad_norm": 4.169439792633057, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8760283589363098, + "num_tokens": 701204041.0, + "step": 18376 + }, + { + "epoch": 2.3377432896578045, + "ewc_loss": 0.00840754620730877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407546556554735e-05, + "grad_norm": 4.158818244934082, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.860934853553772, + "num_tokens": 701244660.0, + "step": 18377 + }, + { + "epoch": 2.337870499936395, + "ewc_loss": 0.008403965272009373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.403965330217034e-05, + "grad_norm": 4.168674468994141, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8739093542098999, + "num_tokens": 
701284830.0, + "step": 18378 + }, + { + "epoch": 2.337997710214985, + "ewc_loss": 0.008396935649216175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39693529997021e-05, + "grad_norm": 4.125207424163818, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8885327577590942, + "num_tokens": 701319720.0, + "step": 18379 + }, + { + "epoch": 2.338124920493576, + "ewc_loss": 0.008397586643695831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.397586498176679e-05, + "grad_norm": 4.1453328132629395, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8612666130065918, + "num_tokens": 701361770.0, + "step": 18380 + }, + { + "epoch": 2.338252130772166, + "ewc_loss": 0.00843045487999916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.430454909102991e-05, + "grad_norm": 4.173971176147461, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8789609670639038, + "num_tokens": 701399777.0, + "step": 18381 + }, + { + "epoch": 2.3383793410507567, + "ewc_loss": 0.0084200669080019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.420067024417222e-05, + "grad_norm": 4.2122321128845215, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8818188309669495, + "num_tokens": 701432680.0, + "step": 18382 + }, + { + "epoch": 2.3385065513293473, + "ewc_loss": 0.008449714630842209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.449714368907735e-05, + "grad_norm": 4.245333671569824, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8901270031929016, + "num_tokens": 701474101.0, + "step": 18383 + }, + { + "epoch": 2.338633761607938, + "ewc_loss": 0.008437007665634155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437007636530325e-05, + "grad_norm": 4.101469993591309, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8929401636123657, + "num_tokens": 701517568.0, + "step": 18384 + }, + { + "epoch": 2.3387609718865283, + "ewc_loss": 0.008357441052794456, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35744067444466e-05, + "grad_norm": 4.171321392059326, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8748340606689453, + "num_tokens": 701558466.0, + "step": 18385 + }, + { + "epoch": 2.338888182165119, + "ewc_loss": 0.008417471311986446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.417471690336242e-05, + "grad_norm": 4.171854496002197, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8774123191833496, + "num_tokens": 701591104.0, + "step": 18386 + }, + { + "epoch": 2.3390153924437094, + "ewc_loss": 0.008399666287004948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399665966862813e-05, + "grad_norm": 4.120728969573975, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8747479915618896, + "num_tokens": 701633242.0, + "step": 18387 + }, + { + "epoch": 2.3391426027223, + "ewc_loss": 0.008385570719838142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.385570254176855e-05, + "grad_norm": 4.212916374206543, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8795616626739502, + "num_tokens": 701668864.0, + "step": 18388 + }, + { + "epoch": 2.3392698130008904, + "ewc_loss": 0.008447508327662945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447508298559114e-05, + "grad_norm": 4.169511318206787, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8862946033477783, + "num_tokens": 701703648.0, + "step": 18389 + }, + { + "epoch": 2.339397023279481, + "ewc_loss": 0.008390458300709724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390458242502064e-05, + "grad_norm": 4.0833563804626465, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.879300057888031, + "num_tokens": 701744417.0, + "step": 18390 + }, + { + "epoch": 2.3395242335580715, + "ewc_loss": 0.008389116264879704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389115828322247e-05, + "grad_norm": 4.197380542755127, + "learning_rate": 
1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.850858747959137, + "num_tokens": 701783541.0, + "step": 18391 + }, + { + "epoch": 2.339651443836662, + "ewc_loss": 0.008482187986373901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482187695335597e-05, + "grad_norm": 4.121633052825928, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8755851984024048, + "num_tokens": 701825054.0, + "step": 18392 + }, + { + "epoch": 2.3397786541152525, + "ewc_loss": 0.008401408791542053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40140855871141e-05, + "grad_norm": 4.1118083000183105, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8879298567771912, + "num_tokens": 701866173.0, + "step": 18393 + }, + { + "epoch": 2.339905864393843, + "ewc_loss": 0.008451882749795914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451882604276761e-05, + "grad_norm": 4.177107334136963, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8777965307235718, + "num_tokens": 701904812.0, + "step": 18394 + }, + { + "epoch": 2.3400330746724336, + "ewc_loss": 0.00847984291613102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479842654196545e-05, + "grad_norm": 4.162626266479492, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8708348274230957, + "num_tokens": 701942590.0, + "step": 18395 + }, + { + "epoch": 2.340160284951024, + "ewc_loss": 0.008425307460129261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.425307896686718e-05, + "grad_norm": 4.172543048858643, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8842751979827881, + "num_tokens": 701980773.0, + "step": 18396 + }, + { + "epoch": 2.3402874952296147, + "ewc_loss": 0.008450264111161232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450263703707606e-05, + "grad_norm": 4.120528697967529, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8936706185340881, + "num_tokens": 702014508.0, + "step": 18397 + }, 
+ { + "epoch": 2.340414705508205, + "ewc_loss": 0.008420848287642002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.420848462264985e-05, + "grad_norm": 4.094781398773193, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8930829763412476, + "num_tokens": 702055754.0, + "step": 18398 + }, + { + "epoch": 2.3405419157867957, + "ewc_loss": 0.008430042304098606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.430042362306267e-05, + "grad_norm": 4.0959343910217285, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8802816867828369, + "num_tokens": 702100022.0, + "step": 18399 + }, + { + "epoch": 2.3406691260653862, + "ewc_loss": 0.008429458364844322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.429458102909848e-05, + "grad_norm": 4.22987699508667, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.873458981513977, + "num_tokens": 702133217.0, + "step": 18400 + }, + { + "epoch": 2.3407963363439768, + "ewc_loss": 0.00851481407880783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514813816873357e-05, + "grad_norm": 4.118148326873779, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8928841948509216, + "num_tokens": 702176998.0, + "step": 18401 + }, + { + "epoch": 2.3409235466225673, + "ewc_loss": 0.008379560895264149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.379561040783301e-05, + "grad_norm": 4.160270690917969, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8885420560836792, + "num_tokens": 702216219.0, + "step": 18402 + }, + { + "epoch": 2.341050756901158, + "ewc_loss": 0.008446373045444489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446373249171302e-05, + "grad_norm": 4.330050945281982, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8754180669784546, + "num_tokens": 702246502.0, + "step": 18403 + }, + { + "epoch": 2.341177967179748, + "ewc_loss": 0.008514286018908024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.514286309946328e-05, + "grad_norm": 4.191445827484131, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8880996704101562, + "num_tokens": 702277990.0, + "step": 18404 + }, + { + "epoch": 2.341305177458339, + "ewc_loss": 0.008376370184123516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.376370533369482e-05, + "grad_norm": 4.219287395477295, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8848650455474854, + "num_tokens": 702312742.0, + "step": 18405 + }, + { + "epoch": 2.341432387736929, + "ewc_loss": 0.00845407322049141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454073395114392e-05, + "grad_norm": 4.155424118041992, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8941488265991211, + "num_tokens": 702348575.0, + "step": 18406 + }, + { + "epoch": 2.3415595980155195, + "ewc_loss": 0.008408145047724247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40814464027062e-05, + "grad_norm": 4.199334144592285, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8710131645202637, + "num_tokens": 702385956.0, + "step": 18407 + }, + { + "epoch": 2.34168680829411, + "ewc_loss": 0.008455798029899597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455797797068954e-05, + "grad_norm": 4.149639129638672, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8856096863746643, + "num_tokens": 702423973.0, + "step": 18408 + }, + { + "epoch": 2.3418140185727006, + "ewc_loss": 0.008424678817391396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.424678526353091e-05, + "grad_norm": 4.181895732879639, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8785414099693298, + "num_tokens": 702455508.0, + "step": 18409 + }, + { + "epoch": 2.341941228851291, + "ewc_loss": 0.008467593230307102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467593579553068e-05, + "grad_norm": 4.168800354003906, + "learning_rate": 1e-06, + "loss": 0.3229, + 
"mean_token_accuracy": 0.8906964063644409, + "num_tokens": 702493365.0, + "step": 18410 + }, + { + "epoch": 2.3420684391298816, + "ewc_loss": 0.008438381366431713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438381337327883e-05, + "grad_norm": 4.199828624725342, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8838301301002502, + "num_tokens": 702528036.0, + "step": 18411 + }, + { + "epoch": 2.342195649408472, + "ewc_loss": 0.008460739627480507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460739627480507e-05, + "grad_norm": 4.134069442749023, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8763864040374756, + "num_tokens": 702571238.0, + "step": 18412 + }, + { + "epoch": 2.3423228596870627, + "ewc_loss": 0.008427556604146957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4275561675895e-05, + "grad_norm": 4.162163257598877, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8771708607673645, + "num_tokens": 702611247.0, + "step": 18413 + }, + { + "epoch": 2.342450069965653, + "ewc_loss": 0.008458705618977547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.458705997327343e-05, + "grad_norm": 4.138772010803223, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8812132477760315, + "num_tokens": 702649661.0, + "step": 18414 + }, + { + "epoch": 2.3425772802442437, + "ewc_loss": 0.008419414050877094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.419414371019229e-05, + "grad_norm": 4.151559352874756, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8754861950874329, + "num_tokens": 702693573.0, + "step": 18415 + }, + { + "epoch": 2.3427044905228342, + "ewc_loss": 0.008419565856456757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.419565710937604e-05, + "grad_norm": 4.105065822601318, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8909066319465637, + "num_tokens": 702735351.0, + "step": 18416 + }, + { + "epoch": 
2.3428317008014248, + "ewc_loss": 0.008406246080994606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.406245615333319e-05, + "grad_norm": 4.161984443664551, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8887777328491211, + "num_tokens": 702773406.0, + "step": 18417 + }, + { + "epoch": 2.3429589110800153, + "ewc_loss": 0.008420384488999844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4203842561692e-05, + "grad_norm": 4.165567398071289, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8877755403518677, + "num_tokens": 702809939.0, + "step": 18418 + }, + { + "epoch": 2.343086121358606, + "ewc_loss": 0.008399124257266521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399124635616317e-05, + "grad_norm": 4.122917175292969, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8790122270584106, + "num_tokens": 702851120.0, + "step": 18419 + }, + { + "epoch": 2.3432133316371964, + "ewc_loss": 0.008382787927985191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382787927985191e-05, + "grad_norm": 4.143372058868408, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8899827003479004, + "num_tokens": 702889492.0, + "step": 18420 + }, + { + "epoch": 2.343340541915787, + "ewc_loss": 0.008393595926463604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.393596363021061e-05, + "grad_norm": 4.163771629333496, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8749029636383057, + "num_tokens": 702930486.0, + "step": 18421 + }, + { + "epoch": 2.3434677521943774, + "ewc_loss": 0.00839411374181509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.394113683607429e-05, + "grad_norm": 4.169926166534424, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8746833801269531, + "num_tokens": 702969866.0, + "step": 18422 + }, + { + "epoch": 2.343594962472968, + "ewc_loss": 0.008374551311135292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.374551543965936e-05, + "grad_norm": 4.131407260894775, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8735251426696777, + "num_tokens": 703013703.0, + "step": 18423 + }, + { + "epoch": 2.3437221727515585, + "ewc_loss": 0.008355695754289627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35569589980878e-05, + "grad_norm": 4.172758102416992, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8716989755630493, + "num_tokens": 703054849.0, + "step": 18424 + }, + { + "epoch": 2.343849383030149, + "ewc_loss": 0.00838439166545868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.384391549043357e-05, + "grad_norm": 4.229095935821533, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8779913783073425, + "num_tokens": 703088250.0, + "step": 18425 + }, + { + "epoch": 2.3439765933087395, + "ewc_loss": 0.008416282944381237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.416282798862085e-05, + "grad_norm": 4.192358493804932, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8761518001556396, + "num_tokens": 703124072.0, + "step": 18426 + }, + { + "epoch": 2.3441038035873296, + "ewc_loss": 0.008373954333364964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373954187845811e-05, + "grad_norm": 4.202910900115967, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8657964468002319, + "num_tokens": 703158869.0, + "step": 18427 + }, + { + "epoch": 2.3442310138659206, + "ewc_loss": 0.008405404165387154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.405404514633119e-05, + "grad_norm": 4.159224510192871, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8823962807655334, + "num_tokens": 703193342.0, + "step": 18428 + }, + { + "epoch": 2.3443582241445107, + "ewc_loss": 0.008382340893149376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382340456591919e-05, + "grad_norm": 4.145959854125977, + "learning_rate": 1e-06, + "loss": 0.3025, + 
"mean_token_accuracy": 0.8938829898834229, + "num_tokens": 703229899.0, + "step": 18429 + }, + { + "epoch": 2.344485434423101, + "ewc_loss": 0.008409497328102589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409497240791097e-05, + "grad_norm": 4.151276111602783, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8812466859817505, + "num_tokens": 703266735.0, + "step": 18430 + }, + { + "epoch": 2.3446126447016917, + "ewc_loss": 0.008423538878560066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.42353911139071e-05, + "grad_norm": 4.143134593963623, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.877254068851471, + "num_tokens": 703307515.0, + "step": 18431 + }, + { + "epoch": 2.3447398549802823, + "ewc_loss": 0.008415326476097107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415326738031581e-05, + "grad_norm": 4.1568098068237305, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.892056405544281, + "num_tokens": 703341237.0, + "step": 18432 + }, + { + "epoch": 2.344867065258873, + "ewc_loss": 0.008426892571151257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.426892600255087e-05, + "grad_norm": 4.133764743804932, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8901716470718384, + "num_tokens": 703378900.0, + "step": 18433 + }, + { + "epoch": 2.3449942755374633, + "ewc_loss": 0.00841103307902813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.411033195443451e-05, + "grad_norm": 4.174679756164551, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8908085823059082, + "num_tokens": 703415260.0, + "step": 18434 + }, + { + "epoch": 2.345121485816054, + "ewc_loss": 0.00845490861684084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4549086750485e-05, + "grad_norm": 4.144313335418701, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.886980414390564, + "num_tokens": 703452399.0, + "step": 18435 + }, + { + "epoch": 
2.3452486960946444, + "ewc_loss": 0.008413749746978283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413750038016587e-05, + "grad_norm": 4.14841365814209, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8829999566078186, + "num_tokens": 703492700.0, + "step": 18436 + }, + { + "epoch": 2.345375906373235, + "ewc_loss": 0.008420848287642002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.420848462264985e-05, + "grad_norm": 4.157951354980469, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8928935527801514, + "num_tokens": 703530805.0, + "step": 18437 + }, + { + "epoch": 2.3455031166518254, + "ewc_loss": 0.008425145410001278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.425145642831922e-05, + "grad_norm": 4.167257308959961, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8905112743377686, + "num_tokens": 703571894.0, + "step": 18438 + }, + { + "epoch": 2.345630326930416, + "ewc_loss": 0.008410166949033737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410166628891602e-05, + "grad_norm": 4.142763614654541, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8871471881866455, + "num_tokens": 703612583.0, + "step": 18439 + }, + { + "epoch": 2.3457575372090065, + "ewc_loss": 0.008414567448198795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41456712805666e-05, + "grad_norm": 4.2251973152160645, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.874380350112915, + "num_tokens": 703649100.0, + "step": 18440 + }, + { + "epoch": 2.345884747487597, + "ewc_loss": 0.00845290720462799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.452907059108838e-05, + "grad_norm": 4.227392673492432, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.8964478969573975, + "num_tokens": 703680425.0, + "step": 18441 + }, + { + "epoch": 2.3460119577661875, + "ewc_loss": 0.00839924719184637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.399246871704236e-05, + "grad_norm": 4.128009796142578, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.887405276298523, + "num_tokens": 703721819.0, + "step": 18442 + }, + { + "epoch": 2.346139168044778, + "ewc_loss": 0.008348377421498299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348377741640434e-05, + "grad_norm": 4.214863300323486, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8804140090942383, + "num_tokens": 703755432.0, + "step": 18443 + }, + { + "epoch": 2.3462663783233686, + "ewc_loss": 0.00844498910009861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.444989362033084e-05, + "grad_norm": 4.143625736236572, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8799805641174316, + "num_tokens": 703795903.0, + "step": 18444 + }, + { + "epoch": 2.346393588601959, + "ewc_loss": 0.008367909118533134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.367909322259948e-05, + "grad_norm": 4.171323299407959, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8930391073226929, + "num_tokens": 703836074.0, + "step": 18445 + }, + { + "epoch": 2.3465207988805497, + "ewc_loss": 0.008402833715081215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.402833918808028e-05, + "grad_norm": 4.227315425872803, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8858898878097534, + "num_tokens": 703870609.0, + "step": 18446 + }, + { + "epoch": 2.34664800915914, + "ewc_loss": 0.008432065136730671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43206507852301e-05, + "grad_norm": 4.19251823425293, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8866922855377197, + "num_tokens": 703909034.0, + "step": 18447 + }, + { + "epoch": 2.3467752194377307, + "ewc_loss": 0.008365945890545845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.365946268895641e-05, + "grad_norm": 4.185630798339844, + "learning_rate": 1e-06, + "loss": 0.3565, + 
"mean_token_accuracy": 0.8744834661483765, + "num_tokens": 703946976.0, + "step": 18448 + }, + { + "epoch": 2.3469024297163212, + "ewc_loss": 0.008389695547521114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389695722144097e-05, + "grad_norm": 4.150012493133545, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8839709162712097, + "num_tokens": 703986379.0, + "step": 18449 + }, + { + "epoch": 2.3470296399949118, + "ewc_loss": 0.008349484764039516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.349485142389312e-05, + "grad_norm": 4.145643711090088, + "learning_rate": 1e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.8985376358032227, + "num_tokens": 704019461.0, + "step": 18450 + }, + { + "epoch": 2.3471568502735023, + "ewc_loss": 0.008370165713131428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.370165596716106e-05, + "grad_norm": 4.213937282562256, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8681563138961792, + "num_tokens": 704053411.0, + "step": 18451 + }, + { + "epoch": 2.3472840605520924, + "ewc_loss": 0.008416966535151005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.416966738877818e-05, + "grad_norm": 4.152679920196533, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8855935335159302, + "num_tokens": 704092633.0, + "step": 18452 + }, + { + "epoch": 2.3474112708306833, + "ewc_loss": 0.008366354741156101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.366355177713558e-05, + "grad_norm": 4.161970615386963, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8870055675506592, + "num_tokens": 704126955.0, + "step": 18453 + }, + { + "epoch": 2.3475384811092734, + "ewc_loss": 0.008412571623921394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41257133288309e-05, + "grad_norm": 4.193286418914795, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8977227807044983, + "num_tokens": 704162222.0, + "step": 18454 + }, + { + "epoch": 
2.347665691387864, + "ewc_loss": 0.008413822390139103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41382279759273e-05, + "grad_norm": 4.190333843231201, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.882105827331543, + "num_tokens": 704198949.0, + "step": 18455 + }, + { + "epoch": 2.3477929016664545, + "ewc_loss": 0.008405176922678947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.405176777159795e-05, + "grad_norm": 4.207798957824707, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8775513768196106, + "num_tokens": 704235327.0, + "step": 18456 + }, + { + "epoch": 2.347920111945045, + "ewc_loss": 0.008446977473795414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44697788124904e-05, + "grad_norm": 4.181282997131348, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.888560950756073, + "num_tokens": 704272783.0, + "step": 18457 + }, + { + "epoch": 2.3480473222236355, + "ewc_loss": 0.008391018025577068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391017763642594e-05, + "grad_norm": 4.148250579833984, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8642939329147339, + "num_tokens": 704314396.0, + "step": 18458 + }, + { + "epoch": 2.348174532502226, + "ewc_loss": 0.00839141570031643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39141575852409e-05, + "grad_norm": 4.187386512756348, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8968178629875183, + "num_tokens": 704346903.0, + "step": 18459 + }, + { + "epoch": 2.3483017427808166, + "ewc_loss": 0.00844320934265852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443209662800655e-05, + "grad_norm": 4.1540961265563965, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8884129524230957, + "num_tokens": 704380456.0, + "step": 18460 + }, + { + "epoch": 2.348428953059407, + "ewc_loss": 0.00841151736676693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.411517774220556e-05, 
+ "grad_norm": 4.162414073944092, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8943073153495789, + "num_tokens": 704416403.0, + "step": 18461 + }, + { + "epoch": 2.3485561633379977, + "ewc_loss": 0.008442244492471218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442244870821014e-05, + "grad_norm": 4.142885684967041, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8834652900695801, + "num_tokens": 704460431.0, + "step": 18462 + }, + { + "epoch": 2.348683373616588, + "ewc_loss": 0.008416504599153996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.416504715569317e-05, + "grad_norm": 4.173303604125977, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8886525630950928, + "num_tokens": 704497872.0, + "step": 18463 + }, + { + "epoch": 2.3488105838951787, + "ewc_loss": 0.008453543297946453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45354370540008e-05, + "grad_norm": 4.168424129486084, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8844965100288391, + "num_tokens": 704538180.0, + "step": 18464 + }, + { + "epoch": 2.3489377941737692, + "ewc_loss": 0.008432981558144093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432981849182397e-05, + "grad_norm": 4.15200138092041, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8931604027748108, + "num_tokens": 704578342.0, + "step": 18465 + }, + { + "epoch": 2.3490650044523598, + "ewc_loss": 0.008414830081164837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.414830517722294e-05, + "grad_norm": 4.172535419464111, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8927191495895386, + "num_tokens": 704611792.0, + "step": 18466 + }, + { + "epoch": 2.3491922147309503, + "ewc_loss": 0.008426721207797527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.426720887655392e-05, + "grad_norm": 4.185111045837402, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 
0.8834567070007324, + "num_tokens": 704649898.0, + "step": 18467 + }, + { + "epoch": 2.349319425009541, + "ewc_loss": 0.00843909103423357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439091470791027e-05, + "grad_norm": 4.165698051452637, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8895590305328369, + "num_tokens": 704690632.0, + "step": 18468 + }, + { + "epoch": 2.3494466352881314, + "ewc_loss": 0.008390460163354874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390460425289348e-05, + "grad_norm": 4.264118671417236, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8732435703277588, + "num_tokens": 704722955.0, + "step": 18469 + }, + { + "epoch": 2.349573845566722, + "ewc_loss": 0.008459929376840591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.459929813398048e-05, + "grad_norm": 4.198578834533691, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.881753146648407, + "num_tokens": 704762678.0, + "step": 18470 + }, + { + "epoch": 2.3497010558453124, + "ewc_loss": 0.00838425476104021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.384254761040211e-05, + "grad_norm": 4.059525489807129, + "learning_rate": 1e-06, + "loss": 0.2689, + "mean_token_accuracy": 0.9056271910667419, + "num_tokens": 704805440.0, + "step": 18471 + }, + { + "epoch": 2.349828266123903, + "ewc_loss": 0.008328140713274479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328141120728105e-05, + "grad_norm": 4.206263542175293, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8635964393615723, + "num_tokens": 704848153.0, + "step": 18472 + }, + { + "epoch": 2.3499554764024935, + "ewc_loss": 0.008440392091870308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.440391684416682e-05, + "grad_norm": 4.161348342895508, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8813751935958862, + "num_tokens": 704886739.0, + "step": 18473 + }, + { + "epoch": 2.350082686681084, + "ewc_loss": 
0.008327871561050415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.327871182700619e-05, + "grad_norm": 4.1367950439453125, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8823657631874084, + "num_tokens": 704926145.0, + "step": 18474 + }, + { + "epoch": 2.3502098969596745, + "ewc_loss": 0.00836104154586792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36104154586792e-05, + "grad_norm": 4.16032600402832, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8676644563674927, + "num_tokens": 704964135.0, + "step": 18475 + }, + { + "epoch": 2.350337107238265, + "ewc_loss": 0.008375179022550583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375178731512278e-05, + "grad_norm": 4.218395709991455, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8716635704040527, + "num_tokens": 705002171.0, + "step": 18476 + }, + { + "epoch": 2.350464317516855, + "ewc_loss": 0.008393562398850918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.393562166020274e-05, + "grad_norm": 4.1157026290893555, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.8996713161468506, + "num_tokens": 705040660.0, + "step": 18477 + }, + { + "epoch": 2.350591527795446, + "ewc_loss": 0.008320944383740425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.320944471051916e-05, + "grad_norm": 4.209731578826904, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8896059989929199, + "num_tokens": 705073655.0, + "step": 18478 + }, + { + "epoch": 2.350718738074036, + "ewc_loss": 0.008402454666793346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.402454841416329e-05, + "grad_norm": 4.191293239593506, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8801007270812988, + "num_tokens": 705115497.0, + "step": 18479 + }, + { + "epoch": 2.3508459483526267, + "ewc_loss": 0.00834124255925417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.341242937603965e-05, + "grad_norm": 
4.135013580322266, + "learning_rate": 1e-06, + "loss": 0.2884, + "mean_token_accuracy": 0.899719774723053, + "num_tokens": 705152642.0, + "step": 18480 + }, + { + "epoch": 2.3509731586312173, + "ewc_loss": 0.008311872370541096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.311872079502791e-05, + "grad_norm": 4.247674465179443, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8683377504348755, + "num_tokens": 705187282.0, + "step": 18481 + }, + { + "epoch": 2.351100368909808, + "ewc_loss": 0.00840922724455595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409227302763611e-05, + "grad_norm": 4.162578105926514, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.885656476020813, + "num_tokens": 705225635.0, + "step": 18482 + }, + { + "epoch": 2.3512275791883983, + "ewc_loss": 0.008310084231197834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.310084376716986e-05, + "grad_norm": 4.187548637390137, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.891290545463562, + "num_tokens": 705260376.0, + "step": 18483 + }, + { + "epoch": 2.351354789466989, + "ewc_loss": 0.008360080420970917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360080391867086e-05, + "grad_norm": 4.156102180480957, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8825729489326477, + "num_tokens": 705298799.0, + "step": 18484 + }, + { + "epoch": 2.3514819997455794, + "ewc_loss": 0.008343419060111046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.343419176526368e-05, + "grad_norm": 4.232710838317871, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8659024238586426, + "num_tokens": 705332570.0, + "step": 18485 + }, + { + "epoch": 2.35160921002417, + "ewc_loss": 0.00840723980218172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407240238739178e-05, + "grad_norm": 4.156347274780273, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.884014368057251, + "num_tokens": 
705371214.0, + "step": 18486 + }, + { + "epoch": 2.3517364203027604, + "ewc_loss": 0.00833488255739212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.334882295457646e-05, + "grad_norm": 4.217351913452148, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8802664875984192, + "num_tokens": 705405051.0, + "step": 18487 + }, + { + "epoch": 2.351863630581351, + "ewc_loss": 0.008407756686210632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407756831729785e-05, + "grad_norm": 4.146907329559326, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8795790672302246, + "num_tokens": 705442460.0, + "step": 18488 + }, + { + "epoch": 2.3519908408599415, + "ewc_loss": 0.008361153304576874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.361152868019417e-05, + "grad_norm": 4.158092498779297, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8825640678405762, + "num_tokens": 705482929.0, + "step": 18489 + }, + { + "epoch": 2.352118051138532, + "ewc_loss": 0.008402124978601933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.402125240536407e-05, + "grad_norm": 4.123327732086182, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8769109845161438, + "num_tokens": 705525313.0, + "step": 18490 + }, + { + "epoch": 2.3522452614171225, + "ewc_loss": 0.008387137204408646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387136767851189e-05, + "grad_norm": 4.200494289398193, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8772842884063721, + "num_tokens": 705563730.0, + "step": 18491 + }, + { + "epoch": 2.352372471695713, + "ewc_loss": 0.008445458486676216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.445458661299199e-05, + "grad_norm": 4.194237232208252, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8846112489700317, + "num_tokens": 705602459.0, + "step": 18492 + }, + { + "epoch": 2.3524996819743036, + "ewc_loss": 0.008391857147216797, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391857409151271e-05, + "grad_norm": 4.1986002922058105, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8809171915054321, + "num_tokens": 705636223.0, + "step": 18493 + }, + { + "epoch": 2.352626892252894, + "ewc_loss": 0.008414884097874165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.414884359808639e-05, + "grad_norm": 4.167707443237305, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8807207345962524, + "num_tokens": 705676178.0, + "step": 18494 + }, + { + "epoch": 2.3527541025314846, + "ewc_loss": 0.008399155922234058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399155922234058e-05, + "grad_norm": 4.1617889404296875, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.889388918876648, + "num_tokens": 705716687.0, + "step": 18495 + }, + { + "epoch": 2.352881312810075, + "ewc_loss": 0.008411440998315811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.411441376665607e-05, + "grad_norm": 4.0861029624938965, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8882717490196228, + "num_tokens": 705759163.0, + "step": 18496 + }, + { + "epoch": 2.3530085230886657, + "ewc_loss": 0.008386587724089622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.386587433051318e-05, + "grad_norm": 4.200338840484619, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8827839493751526, + "num_tokens": 705795982.0, + "step": 18497 + }, + { + "epoch": 2.3531357333672562, + "ewc_loss": 0.008449845016002655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.449844608549029e-05, + "grad_norm": 4.149105072021484, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8855575323104858, + "num_tokens": 705835601.0, + "step": 18498 + }, + { + "epoch": 2.3532629436458468, + "ewc_loss": 0.008375356905162334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375356992473826e-05, + "grad_norm": 4.168879985809326, + 
"learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8789989948272705, + "num_tokens": 705876858.0, + "step": 18499 + }, + { + "epoch": 2.353390153924437, + "ewc_loss": 0.00841316394507885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413164323428646e-05, + "grad_norm": 4.1510396003723145, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8733015060424805, + "num_tokens": 705921944.0, + "step": 18500 + }, + { + "epoch": 2.353517364203028, + "ewc_loss": 0.008405269123613834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.405269181821495e-05, + "grad_norm": 4.147571563720703, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8828442096710205, + "num_tokens": 705967999.0, + "step": 18501 + }, + { + "epoch": 2.353644574481618, + "ewc_loss": 0.008384744636714458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.384744432987645e-05, + "grad_norm": 4.166463851928711, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8746249675750732, + "num_tokens": 706003370.0, + "step": 18502 + }, + { + "epoch": 2.353771784760209, + "ewc_loss": 0.008378769271075726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378769416594878e-05, + "grad_norm": 4.234927654266357, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8715742230415344, + "num_tokens": 706036551.0, + "step": 18503 + }, + { + "epoch": 2.353898995038799, + "ewc_loss": 0.008418239653110504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4182400314603e-05, + "grad_norm": 4.114321231842041, + "learning_rate": 1e-06, + "loss": 0.2883, + "mean_token_accuracy": 0.8995875120162964, + "num_tokens": 706075965.0, + "step": 18504 + }, + { + "epoch": 2.3540262053173895, + "ewc_loss": 0.00831630453467369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.316304592881352e-05, + "grad_norm": 4.17759895324707, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8701842427253723, + "num_tokens": 706117943.0, + "step": 
18505 + }, + { + "epoch": 2.35415341559598, + "ewc_loss": 0.008402237668633461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.402237290283665e-05, + "grad_norm": 4.1781005859375, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.884249746799469, + "num_tokens": 706157183.0, + "step": 18506 + }, + { + "epoch": 2.3542806258745705, + "ewc_loss": 0.008371390402317047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371390140382573e-05, + "grad_norm": 4.1495680809021, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8884084820747375, + "num_tokens": 706196878.0, + "step": 18507 + }, + { + "epoch": 2.354407836153161, + "ewc_loss": 0.008349256590008736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.349256677320227e-05, + "grad_norm": 4.216291904449463, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8694614171981812, + "num_tokens": 706231102.0, + "step": 18508 + }, + { + "epoch": 2.3545350464317516, + "ewc_loss": 0.008396388031542301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396388147957623e-05, + "grad_norm": 4.1384172439575195, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8890897631645203, + "num_tokens": 706273372.0, + "step": 18509 + }, + { + "epoch": 2.354662256710342, + "ewc_loss": 0.008339625783264637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.339625492226332e-05, + "grad_norm": 4.284139633178711, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8762797117233276, + "num_tokens": 706308106.0, + "step": 18510 + }, + { + "epoch": 2.3547894669889327, + "ewc_loss": 0.008440963923931122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.440963574685156e-05, + "grad_norm": 4.125111103057861, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8902848958969116, + "num_tokens": 706344267.0, + "step": 18511 + }, + { + "epoch": 2.354916677267523, + "ewc_loss": 0.00831684097647667, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.316840830957517e-05, + "grad_norm": 4.137916088104248, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8977894186973572, + "num_tokens": 706385019.0, + "step": 18512 + }, + { + "epoch": 2.3550438875461137, + "ewc_loss": 0.00837520882487297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375209290534258e-05, + "grad_norm": 4.124555587768555, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8812710046768188, + "num_tokens": 706424913.0, + "step": 18513 + }, + { + "epoch": 2.3551710978247042, + "ewc_loss": 0.008366746827960014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.366746624233201e-05, + "grad_norm": 4.189407825469971, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.882276177406311, + "num_tokens": 706466064.0, + "step": 18514 + }, + { + "epoch": 2.3552983081032948, + "ewc_loss": 0.008401765488088131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401765080634505e-05, + "grad_norm": 4.143204212188721, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8903064727783203, + "num_tokens": 706504572.0, + "step": 18515 + }, + { + "epoch": 2.3554255183818853, + "ewc_loss": 0.008355848491191864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355848694918677e-05, + "grad_norm": 4.186671733856201, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8773589134216309, + "num_tokens": 706543901.0, + "step": 18516 + }, + { + "epoch": 2.355552728660476, + "ewc_loss": 0.008405978791415691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.405978587688878e-05, + "grad_norm": 4.209076404571533, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8768343329429626, + "num_tokens": 706579095.0, + "step": 18517 + }, + { + "epoch": 2.3556799389390664, + "ewc_loss": 0.008397850207984447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.397850615438074e-05, + "grad_norm": 4.194756031036377, + "learning_rate": 1e-06, + "loss": 
0.3532, + "mean_token_accuracy": 0.8762214183807373, + "num_tokens": 706614078.0, + "step": 18518 + }, + { + "epoch": 2.355807149217657, + "ewc_loss": 0.008388430811464787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38843043311499e-05, + "grad_norm": 4.191038131713867, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8879591226577759, + "num_tokens": 706652596.0, + "step": 18519 + }, + { + "epoch": 2.3559343594962474, + "ewc_loss": 0.00840162392705679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40162392705679e-05, + "grad_norm": 4.125025272369385, + "learning_rate": 1e-06, + "loss": 0.2745, + "mean_token_accuracy": 0.9051637649536133, + "num_tokens": 706691297.0, + "step": 18520 + }, + { + "epoch": 2.356061569774838, + "ewc_loss": 0.008373003453016281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373003220185637e-05, + "grad_norm": 4.207394123077393, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8721632361412048, + "num_tokens": 706727344.0, + "step": 18521 + }, + { + "epoch": 2.3561887800534285, + "ewc_loss": 0.008448922075331211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44892201712355e-05, + "grad_norm": 4.123293399810791, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8843134045600891, + "num_tokens": 706770411.0, + "step": 18522 + }, + { + "epoch": 2.356315990332019, + "ewc_loss": 0.008358530700206757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358530612895265e-05, + "grad_norm": 4.150086879730225, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8854820728302002, + "num_tokens": 706816215.0, + "step": 18523 + }, + { + "epoch": 2.3564432006106095, + "ewc_loss": 0.008395561948418617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395561599172652e-05, + "grad_norm": 4.135347366333008, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8774223923683167, + "num_tokens": 706857400.0, + "step": 18524 + }, + { + "epoch": 
2.3565704108891996, + "ewc_loss": 0.008386333473026752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.386333502130583e-05, + "grad_norm": 4.18658447265625, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8756924867630005, + "num_tokens": 706892715.0, + "step": 18525 + }, + { + "epoch": 2.3566976211677906, + "ewc_loss": 0.008392730727791786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.392730524064973e-05, + "grad_norm": 4.165802478790283, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8840638399124146, + "num_tokens": 706934373.0, + "step": 18526 + }, + { + "epoch": 2.3568248314463807, + "ewc_loss": 0.008389783091843128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389783033635467e-05, + "grad_norm": 4.14354944229126, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.883739709854126, + "num_tokens": 706976885.0, + "step": 18527 + }, + { + "epoch": 2.356952041724971, + "ewc_loss": 0.00835154578089714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.351545693585649e-05, + "grad_norm": 4.139069080352783, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8886167407035828, + "num_tokens": 707016957.0, + "step": 18528 + }, + { + "epoch": 2.3570792520035617, + "ewc_loss": 0.00834907777607441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.349077688762918e-05, + "grad_norm": 4.155976295471191, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8914694786071777, + "num_tokens": 707053687.0, + "step": 18529 + }, + { + "epoch": 2.3572064622821522, + "ewc_loss": 0.008374427445232868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.374427125090733e-05, + "grad_norm": 4.244647979736328, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8754206895828247, + "num_tokens": 707087327.0, + "step": 18530 + }, + { + "epoch": 2.3573336725607428, + "ewc_loss": 0.008411383256316185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.411383169004694e-05, + "grad_norm": 4.155455589294434, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8933520317077637, + "num_tokens": 707124479.0, + "step": 18531 + }, + { + "epoch": 2.3574608828393333, + "ewc_loss": 0.00832725502550602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.327254909090698e-05, + "grad_norm": 4.162069797515869, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8866857290267944, + "num_tokens": 707162452.0, + "step": 18532 + }, + { + "epoch": 2.357588093117924, + "ewc_loss": 0.008387131616473198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387131674680859e-05, + "grad_norm": 4.2265095710754395, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8916304111480713, + "num_tokens": 707196765.0, + "step": 18533 + }, + { + "epoch": 2.3577153033965144, + "ewc_loss": 0.008411066606640816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.411066664848477e-05, + "grad_norm": 4.177337646484375, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8901030421257019, + "num_tokens": 707230151.0, + "step": 18534 + }, + { + "epoch": 2.357842513675105, + "ewc_loss": 0.00836253073066473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362530934391543e-05, + "grad_norm": 4.202089309692383, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.865649938583374, + "num_tokens": 707264373.0, + "step": 18535 + }, + { + "epoch": 2.3579697239536954, + "ewc_loss": 0.008418112993240356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418113429797813e-05, + "grad_norm": 4.183791637420654, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.877623975276947, + "num_tokens": 707304844.0, + "step": 18536 + }, + { + "epoch": 2.358096934232286, + "ewc_loss": 0.008412348106503487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.412347960984334e-05, + "grad_norm": 4.174901485443115, + "learning_rate": 1e-06, + "loss": 0.3282, + 
"mean_token_accuracy": 0.8843486309051514, + "num_tokens": 707347081.0, + "step": 18537 + }, + { + "epoch": 2.3582241445108765, + "ewc_loss": 0.008403114974498749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.403114770771936e-05, + "grad_norm": 4.179821491241455, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8773419260978699, + "num_tokens": 707388243.0, + "step": 18538 + }, + { + "epoch": 2.358351354789467, + "ewc_loss": 0.008393650874495506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.393650932703167e-05, + "grad_norm": 4.217135429382324, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8702120780944824, + "num_tokens": 707427109.0, + "step": 18539 + }, + { + "epoch": 2.3584785650680575, + "ewc_loss": 0.008434943854808807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.434944174950942e-05, + "grad_norm": 4.201968669891357, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8834685683250427, + "num_tokens": 707462502.0, + "step": 18540 + }, + { + "epoch": 2.358605775346648, + "ewc_loss": 0.008409949019551277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409949077758938e-05, + "grad_norm": 4.1703081130981445, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8784103393554688, + "num_tokens": 707502861.0, + "step": 18541 + }, + { + "epoch": 2.3587329856252386, + "ewc_loss": 0.008409899659454823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409899601247162e-05, + "grad_norm": 4.253229141235352, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8643008470535278, + "num_tokens": 707535868.0, + "step": 18542 + }, + { + "epoch": 2.358860195903829, + "ewc_loss": 0.008480561897158623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480561518808827e-05, + "grad_norm": 4.25832986831665, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8854141235351562, + "num_tokens": 707566128.0, + "step": 18543 + }, + { + "epoch": 
2.3589874061824196, + "ewc_loss": 0.00844377838075161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443777915090322e-05, + "grad_norm": 4.1188788414001465, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8892835378646851, + "num_tokens": 707605113.0, + "step": 18544 + }, + { + "epoch": 2.35911461646101, + "ewc_loss": 0.008394434116780758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.394433825742453e-05, + "grad_norm": 4.20711612701416, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8824390172958374, + "num_tokens": 707647968.0, + "step": 18545 + }, + { + "epoch": 2.3592418267396007, + "ewc_loss": 0.008484757505357265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.484757563564926e-05, + "grad_norm": 4.217321395874023, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8893865346908569, + "num_tokens": 707682183.0, + "step": 18546 + }, + { + "epoch": 2.3593690370181912, + "ewc_loss": 0.008452977985143661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.452977635897696e-05, + "grad_norm": 4.094664096832275, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8923020362854004, + "num_tokens": 707720938.0, + "step": 18547 + }, + { + "epoch": 2.3594962472967818, + "ewc_loss": 0.008391360752284527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391360461246222e-05, + "grad_norm": 4.155632495880127, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8698450326919556, + "num_tokens": 707762624.0, + "step": 18548 + }, + { + "epoch": 2.3596234575753723, + "ewc_loss": 0.008490167558193207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490167965646833e-05, + "grad_norm": 4.178123474121094, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.875900387763977, + "num_tokens": 707798406.0, + "step": 18549 + }, + { + "epoch": 2.3597506678539624, + "ewc_loss": 0.00845848023891449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.458480442641303e-05, + "grad_norm": 4.176519870758057, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8842893838882446, + "num_tokens": 707835935.0, + "step": 18550 + }, + { + "epoch": 2.3598778781325533, + "ewc_loss": 0.00844637956470251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446379797533154e-05, + "grad_norm": 4.132511615753174, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8930885791778564, + "num_tokens": 707872971.0, + "step": 18551 + }, + { + "epoch": 2.3600050884111434, + "ewc_loss": 0.008417661301791668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.417660865234211e-05, + "grad_norm": 4.175892353057861, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8934013843536377, + "num_tokens": 707909798.0, + "step": 18552 + }, + { + "epoch": 2.360132298689734, + "ewc_loss": 0.008454172872006893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454173075733706e-05, + "grad_norm": 4.134993553161621, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.887711226940155, + "num_tokens": 707951130.0, + "step": 18553 + }, + { + "epoch": 2.3602595089683245, + "ewc_loss": 0.0084229726344347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.422973041888326e-05, + "grad_norm": 4.2050557136535645, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8705500364303589, + "num_tokens": 707989502.0, + "step": 18554 + }, + { + "epoch": 2.360386719246915, + "ewc_loss": 0.008475632406771183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.475632785120979e-05, + "grad_norm": 4.190160274505615, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8803658485412598, + "num_tokens": 708023521.0, + "step": 18555 + }, + { + "epoch": 2.3605139295255055, + "ewc_loss": 0.008444424718618393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.444424747722223e-05, + "grad_norm": 4.216967582702637, + "learning_rate": 1e-06, + "loss": 0.3124, + 
"mean_token_accuracy": 0.8879680037498474, + "num_tokens": 708057594.0, + "step": 18556 + }, + { + "epoch": 2.360641139804096, + "ewc_loss": 0.00846120249480009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.461202378384769e-05, + "grad_norm": 4.1358642578125, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8809770345687866, + "num_tokens": 708101393.0, + "step": 18557 + }, + { + "epoch": 2.3607683500826866, + "ewc_loss": 0.008403346873819828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.403346873819828e-05, + "grad_norm": 4.220588207244873, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8926808834075928, + "num_tokens": 708137141.0, + "step": 18558 + }, + { + "epoch": 2.360895560361277, + "ewc_loss": 0.0084551265463233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455126953776926e-05, + "grad_norm": 4.1928229331970215, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8859720826148987, + "num_tokens": 708173497.0, + "step": 18559 + }, + { + "epoch": 2.3610227706398677, + "ewc_loss": 0.008408579975366592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.408579742535949e-05, + "grad_norm": 4.204029083251953, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8822726011276245, + "num_tokens": 708209412.0, + "step": 18560 + }, + { + "epoch": 2.361149980918458, + "ewc_loss": 0.008408517576754093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.408517169300467e-05, + "grad_norm": 4.212738037109375, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8890918493270874, + "num_tokens": 708239675.0, + "step": 18561 + }, + { + "epoch": 2.3612771911970487, + "ewc_loss": 0.00843526516109705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.435265044681728e-05, + "grad_norm": 4.189492225646973, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8766125440597534, + "num_tokens": 708274918.0, + "step": 18562 + }, + { + "epoch": 
2.3614044014756392, + "ewc_loss": 0.008425207808613777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.425207488471642e-05, + "grad_norm": 4.171064853668213, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8676421046257019, + "num_tokens": 708313645.0, + "step": 18563 + }, + { + "epoch": 2.3615316117542298, + "ewc_loss": 0.008443320170044899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44332025735639e-05, + "grad_norm": 4.26595401763916, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8833584785461426, + "num_tokens": 708357985.0, + "step": 18564 + }, + { + "epoch": 2.3616588220328203, + "ewc_loss": 0.008499999530613422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499999239575118e-05, + "grad_norm": 4.222451210021973, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8640727996826172, + "num_tokens": 708393606.0, + "step": 18565 + }, + { + "epoch": 2.361786032311411, + "ewc_loss": 0.008446318097412586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446317951893434e-05, + "grad_norm": 4.183077812194824, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8877294063568115, + "num_tokens": 708428941.0, + "step": 18566 + }, + { + "epoch": 2.3619132425900013, + "ewc_loss": 0.008471079170703888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471079490846023e-05, + "grad_norm": 4.169286251068115, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8853772878646851, + "num_tokens": 708467847.0, + "step": 18567 + }, + { + "epoch": 2.362040452868592, + "ewc_loss": 0.008460728451609612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460727985948324e-05, + "grad_norm": 4.146264553070068, + "learning_rate": 1e-06, + "loss": 0.2866, + "mean_token_accuracy": 0.8973451852798462, + "num_tokens": 708503212.0, + "step": 18568 + }, + { + "epoch": 2.3621676631471824, + "ewc_loss": 0.008455225266516209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.455225179204717e-05, + "grad_norm": 4.14315938949585, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8840078115463257, + "num_tokens": 708542897.0, + "step": 18569 + }, + { + "epoch": 2.362294873425773, + "ewc_loss": 0.008473493158817291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473492925986648e-05, + "grad_norm": 4.2624030113220215, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8728879690170288, + "num_tokens": 708574725.0, + "step": 18570 + }, + { + "epoch": 2.3624220837043635, + "ewc_loss": 0.008533665910363197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533665823051706e-05, + "grad_norm": 4.120453834533691, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8925020694732666, + "num_tokens": 708613557.0, + "step": 18571 + }, + { + "epoch": 2.362549293982954, + "ewc_loss": 0.008418153040111065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418153447564691e-05, + "grad_norm": 4.190572738647461, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.875325083732605, + "num_tokens": 708651872.0, + "step": 18572 + }, + { + "epoch": 2.3626765042615445, + "ewc_loss": 0.008518174290657043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518174581695348e-05, + "grad_norm": 4.169872760772705, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8720377683639526, + "num_tokens": 708688654.0, + "step": 18573 + }, + { + "epoch": 2.362803714540135, + "ewc_loss": 0.008479330688714981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479331154376268e-05, + "grad_norm": 4.16474723815918, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8897386193275452, + "num_tokens": 708723285.0, + "step": 18574 + }, + { + "epoch": 2.362930924818725, + "ewc_loss": 0.00849069282412529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490692562190816e-05, + "grad_norm": 4.151604652404785, + "learning_rate": 1e-06, + "loss": 0.3051, + 
"mean_token_accuracy": 0.8914080858230591, + "num_tokens": 708760617.0, + "step": 18575 + }, + { + "epoch": 2.363058135097316, + "ewc_loss": 0.008499219082295895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499219256918877e-05, + "grad_norm": 4.219650745391846, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8870501518249512, + "num_tokens": 708790792.0, + "step": 18576 + }, + { + "epoch": 2.363185345375906, + "ewc_loss": 0.008531458675861359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531459025107324e-05, + "grad_norm": 4.166656494140625, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8739798665046692, + "num_tokens": 708829935.0, + "step": 18577 + }, + { + "epoch": 2.3633125556544967, + "ewc_loss": 0.00845818966627121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.458190131932497e-05, + "grad_norm": 4.132198810577393, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8780770301818848, + "num_tokens": 708870578.0, + "step": 18578 + }, + { + "epoch": 2.3634397659330872, + "ewc_loss": 0.008490667678415775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490667823934928e-05, + "grad_norm": 4.16524076461792, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8815171718597412, + "num_tokens": 708907044.0, + "step": 18579 + }, + { + "epoch": 2.3635669762116778, + "ewc_loss": 0.008503222838044167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.503222488798201e-05, + "grad_norm": 4.186854839324951, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8659483194351196, + "num_tokens": 708947443.0, + "step": 18580 + }, + { + "epoch": 2.3636941864902683, + "ewc_loss": 0.008501982316374779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501981938024983e-05, + "grad_norm": 4.199240684509277, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8836315870285034, + "num_tokens": 708983174.0, + "step": 18581 + }, + { + "epoch": 
2.363821396768859, + "ewc_loss": 0.008498473092913628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498473471263424e-05, + "grad_norm": 4.143601894378662, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8857568502426147, + "num_tokens": 709021950.0, + "step": 18582 + }, + { + "epoch": 2.3639486070474494, + "ewc_loss": 0.008457477204501629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.457477088086307e-05, + "grad_norm": 4.168142318725586, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8899341821670532, + "num_tokens": 709057236.0, + "step": 18583 + }, + { + "epoch": 2.36407581732604, + "ewc_loss": 0.008482844568789005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482844714308158e-05, + "grad_norm": 4.242640972137451, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8825589418411255, + "num_tokens": 709093376.0, + "step": 18584 + }, + { + "epoch": 2.3642030276046304, + "ewc_loss": 0.00851534679532051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515347144566476e-05, + "grad_norm": 4.1935553550720215, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8784070014953613, + "num_tokens": 709129502.0, + "step": 18585 + }, + { + "epoch": 2.364330237883221, + "ewc_loss": 0.008472250774502754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.472250920021906e-05, + "grad_norm": 4.2160115242004395, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8841749429702759, + "num_tokens": 709162538.0, + "step": 18586 + }, + { + "epoch": 2.3644574481618115, + "ewc_loss": 0.00847999844700098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479998359689489e-05, + "grad_norm": 4.177003383636475, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8722509145736694, + "num_tokens": 709201690.0, + "step": 18587 + }, + { + "epoch": 2.364584658440402, + "ewc_loss": 0.008450284600257874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.450284803984687e-05, + "grad_norm": 4.174652576446533, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8785359263420105, + "num_tokens": 709240593.0, + "step": 18588 + }, + { + "epoch": 2.3647118687189925, + "ewc_loss": 0.00846914853900671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469148451695219e-05, + "grad_norm": 4.150992393493652, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8938145637512207, + "num_tokens": 709278192.0, + "step": 18589 + }, + { + "epoch": 2.364839078997583, + "ewc_loss": 0.008434205316007137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.434205665253103e-05, + "grad_norm": 4.129432678222656, + "learning_rate": 1e-06, + "loss": 0.2813, + "mean_token_accuracy": 0.8993529081344604, + "num_tokens": 709311797.0, + "step": 18590 + }, + { + "epoch": 2.3649662892761736, + "ewc_loss": 0.008449981920421124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.449982124147937e-05, + "grad_norm": 4.175650596618652, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8801834583282471, + "num_tokens": 709350379.0, + "step": 18591 + }, + { + "epoch": 2.365093499554764, + "ewc_loss": 0.008487184531986713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.487184823025018e-05, + "grad_norm": 4.148155212402344, + "learning_rate": 1e-06, + "loss": 0.2685, + "mean_token_accuracy": 0.9056673645973206, + "num_tokens": 709389030.0, + "step": 18592 + }, + { + "epoch": 2.3652207098333546, + "ewc_loss": 0.008436695672571659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.436695497948676e-05, + "grad_norm": 4.177112102508545, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8746426105499268, + "num_tokens": 709433692.0, + "step": 18593 + }, + { + "epoch": 2.365347920111945, + "ewc_loss": 0.008461506105959415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46150578581728e-05, + "grad_norm": 4.181676387786865, + "learning_rate": 1e-06, + "loss": 0.331, + 
"mean_token_accuracy": 0.8827815055847168, + "num_tokens": 709470773.0, + "step": 18594 + }, + { + "epoch": 2.3654751303905357, + "ewc_loss": 0.008447258733212948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447258733212948e-05, + "grad_norm": 4.208168983459473, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8783246874809265, + "num_tokens": 709506764.0, + "step": 18595 + }, + { + "epoch": 2.3656023406691262, + "ewc_loss": 0.00844775140285492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447751315543428e-05, + "grad_norm": 4.18834114074707, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8819709420204163, + "num_tokens": 709546931.0, + "step": 18596 + }, + { + "epoch": 2.3657295509477168, + "ewc_loss": 0.00843771081417799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437711221631616e-05, + "grad_norm": 4.212465286254883, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8847366571426392, + "num_tokens": 709582842.0, + "step": 18597 + }, + { + "epoch": 2.365856761226307, + "ewc_loss": 0.008461624383926392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.461624383926392e-05, + "grad_norm": 4.149521350860596, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8824148178100586, + "num_tokens": 709621574.0, + "step": 18598 + }, + { + "epoch": 2.365983971504898, + "ewc_loss": 0.008417835459113121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.417835488216951e-05, + "grad_norm": 4.138422012329102, + "learning_rate": 1e-06, + "loss": 0.2848, + "mean_token_accuracy": 0.8996078968048096, + "num_tokens": 709662437.0, + "step": 18599 + }, + { + "epoch": 2.366111181783488, + "ewc_loss": 0.0084172822535038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.417282515438274e-05, + "grad_norm": 4.110927581787109, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8955994248390198, + "num_tokens": 709701318.0, + "step": 18600 + }, + { + "epoch": 
2.3662383920620784, + "ewc_loss": 0.00840175710618496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401757077081129e-05, + "grad_norm": 4.127185344696045, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8820652365684509, + "num_tokens": 709745156.0, + "step": 18601 + }, + { + "epoch": 2.366365602340669, + "ewc_loss": 0.008421785198152065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421785605605692e-05, + "grad_norm": 4.181193828582764, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8775060772895813, + "num_tokens": 709785849.0, + "step": 18602 + }, + { + "epoch": 2.3664928126192595, + "ewc_loss": 0.00843217596411705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432175673078746e-05, + "grad_norm": 4.139206409454346, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8851557970046997, + "num_tokens": 709830038.0, + "step": 18603 + }, + { + "epoch": 2.36662002289785, + "ewc_loss": 0.008378932252526283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378932398045436e-05, + "grad_norm": 4.165804386138916, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8809296488761902, + "num_tokens": 709871174.0, + "step": 18604 + }, + { + "epoch": 2.3667472331764405, + "ewc_loss": 0.008407429791986942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407429413637146e-05, + "grad_norm": 4.168638706207275, + "learning_rate": 1e-06, + "loss": 0.2773, + "mean_token_accuracy": 0.9015554189682007, + "num_tokens": 709905921.0, + "step": 18605 + }, + { + "epoch": 2.366874443455031, + "ewc_loss": 0.00838027149438858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.380271174246445e-05, + "grad_norm": 4.159224510192871, + "learning_rate": 1e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8971548676490784, + "num_tokens": 709939856.0, + "step": 18606 + }, + { + "epoch": 2.3670016537336216, + "ewc_loss": 0.00838469434529543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.384694228880107e-05, + "grad_norm": 4.116388320922852, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8951035141944885, + "num_tokens": 709978805.0, + "step": 18607 + }, + { + "epoch": 2.367128864012212, + "ewc_loss": 0.0083575788885355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35757891763933e-05, + "grad_norm": 4.158019065856934, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8953986167907715, + "num_tokens": 710015015.0, + "step": 18608 + }, + { + "epoch": 2.3672560742908026, + "ewc_loss": 0.008386890403926373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38689084048383e-05, + "grad_norm": 4.1760687828063965, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.875758171081543, + "num_tokens": 710054887.0, + "step": 18609 + }, + { + "epoch": 2.367383284569393, + "ewc_loss": 0.008377304300665855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377304766327143e-05, + "grad_norm": 4.126465320587158, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8833855986595154, + "num_tokens": 710097012.0, + "step": 18610 + }, + { + "epoch": 2.3675104948479837, + "ewc_loss": 0.008350945077836514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350944699486718e-05, + "grad_norm": 4.256251335144043, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8683661818504333, + "num_tokens": 710130912.0, + "step": 18611 + }, + { + "epoch": 2.3676377051265742, + "ewc_loss": 0.008433694951236248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433694893028587e-05, + "grad_norm": 4.16962194442749, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8799252510070801, + "num_tokens": 710168880.0, + "step": 18612 + }, + { + "epoch": 2.3677649154051648, + "ewc_loss": 0.008342278189957142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.342278306372464e-05, + "grad_norm": 4.1348114013671875, + "learning_rate": 1e-06, + "loss": 0.3186, + 
"mean_token_accuracy": 0.8912085294723511, + "num_tokens": 710205619.0, + "step": 18613 + }, + { + "epoch": 2.3678921256837553, + "ewc_loss": 0.0083723533898592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.372353477170691e-05, + "grad_norm": 4.228423118591309, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.888452410697937, + "num_tokens": 710239475.0, + "step": 18614 + }, + { + "epoch": 2.368019335962346, + "ewc_loss": 0.008435890078544617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.435890049440786e-05, + "grad_norm": 4.172834873199463, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8861602544784546, + "num_tokens": 710276533.0, + "step": 18615 + }, + { + "epoch": 2.3681465462409363, + "ewc_loss": 0.008369682356715202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369682473130524e-05, + "grad_norm": 4.166454315185547, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.898525595664978, + "num_tokens": 710316731.0, + "step": 18616 + }, + { + "epoch": 2.368273756519527, + "ewc_loss": 0.008385016582906246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.385016553802416e-05, + "grad_norm": 4.166359901428223, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.878507673740387, + "num_tokens": 710352310.0, + "step": 18617 + }, + { + "epoch": 2.3684009667981174, + "ewc_loss": 0.008410437032580376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41043729451485e-05, + "grad_norm": 4.152933597564697, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.87938392162323, + "num_tokens": 710400203.0, + "step": 18618 + }, + { + "epoch": 2.368528177076708, + "ewc_loss": 0.008382109925150871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38210980873555e-05, + "grad_norm": 4.202448844909668, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8870946168899536, + "num_tokens": 710433054.0, + "step": 18619 + }, + { + "epoch": 2.3686553873552985, 
+ "ewc_loss": 0.008434465155005455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.434464689344168e-05, + "grad_norm": 4.194960594177246, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8871472477912903, + "num_tokens": 710466879.0, + "step": 18620 + }, + { + "epoch": 2.368782597633889, + "ewc_loss": 0.008415676653385162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415676711592823e-05, + "grad_norm": 4.201717376708984, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8768842816352844, + "num_tokens": 710502808.0, + "step": 18621 + }, + { + "epoch": 2.3689098079124795, + "ewc_loss": 0.008439919911324978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439919474767521e-05, + "grad_norm": 4.170045852661133, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8732683658599854, + "num_tokens": 710546190.0, + "step": 18622 + }, + { + "epoch": 2.3690370181910696, + "ewc_loss": 0.00841860007494688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418600191362202e-05, + "grad_norm": 4.177892684936523, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8853003978729248, + "num_tokens": 710585556.0, + "step": 18623 + }, + { + "epoch": 2.3691642284696606, + "ewc_loss": 0.008423751220107079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.423750841757283e-05, + "grad_norm": 4.189868450164795, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8865399360656738, + "num_tokens": 710621366.0, + "step": 18624 + }, + { + "epoch": 2.3692914387482507, + "ewc_loss": 0.008427855558693409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.427855209447443e-05, + "grad_norm": 4.154204368591309, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8861024379730225, + "num_tokens": 710660217.0, + "step": 18625 + }, + { + "epoch": 2.369418649026841, + "ewc_loss": 0.008404352702200413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.404352411162108e-05, + 
"grad_norm": 4.222161293029785, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8796966671943665, + "num_tokens": 710695798.0, + "step": 18626 + }, + { + "epoch": 2.3695458593054317, + "ewc_loss": 0.008477085269987583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47708506626077e-05, + "grad_norm": 4.25851583480835, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8682714104652405, + "num_tokens": 710732739.0, + "step": 18627 + }, + { + "epoch": 2.3696730695840222, + "ewc_loss": 0.008474295027554035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474295464111492e-05, + "grad_norm": 4.222054958343506, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8768516182899475, + "num_tokens": 710769651.0, + "step": 18628 + }, + { + "epoch": 2.3698002798626128, + "ewc_loss": 0.00842520222067833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.425202395301312e-05, + "grad_norm": 4.127627372741699, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8822270631790161, + "num_tokens": 710809487.0, + "step": 18629 + }, + { + "epoch": 2.3699274901412033, + "ewc_loss": 0.008407596498727798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407596033066511e-05, + "grad_norm": 4.135809421539307, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.885134756565094, + "num_tokens": 710849152.0, + "step": 18630 + }, + { + "epoch": 2.370054700419794, + "ewc_loss": 0.008455936796963215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455936767859384e-05, + "grad_norm": 4.190423965454102, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8713054060935974, + "num_tokens": 710890933.0, + "step": 18631 + }, + { + "epoch": 2.3701819106983844, + "ewc_loss": 0.008488454855978489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.488454477628693e-05, + "grad_norm": 4.224677085876465, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.886717677116394, 
+ "num_tokens": 710928262.0, + "step": 18632 + }, + { + "epoch": 2.370309120976975, + "ewc_loss": 0.00846677552908659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.466775761917233e-05, + "grad_norm": 4.1876397132873535, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8884955644607544, + "num_tokens": 710962969.0, + "step": 18633 + }, + { + "epoch": 2.3704363312555654, + "ewc_loss": 0.008430736139416695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.430735761066899e-05, + "grad_norm": 4.1838531494140625, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.880545437335968, + "num_tokens": 710998717.0, + "step": 18634 + }, + { + "epoch": 2.370563541534156, + "ewc_loss": 0.008438842371106148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43884190544486e-05, + "grad_norm": 4.217016220092773, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8682198524475098, + "num_tokens": 711032819.0, + "step": 18635 + }, + { + "epoch": 2.3706907518127465, + "ewc_loss": 0.008470240980386734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470240572933108e-05, + "grad_norm": 4.203378677368164, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8737519979476929, + "num_tokens": 711069395.0, + "step": 18636 + }, + { + "epoch": 2.370817962091337, + "ewc_loss": 0.008459342643618584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.459342643618584e-05, + "grad_norm": 4.141142845153809, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8855715394020081, + "num_tokens": 711105754.0, + "step": 18637 + }, + { + "epoch": 2.3709451723699275, + "ewc_loss": 0.008425075560808182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.425075793638825e-05, + "grad_norm": 4.170935153961182, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8780043125152588, + "num_tokens": 711145291.0, + "step": 18638 + }, + { + "epoch": 2.371072382648518, + "ewc_loss": 0.00847232062369585, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.472320769215003e-05, + "grad_norm": 4.145399570465088, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8836525678634644, + "num_tokens": 711183431.0, + "step": 18639 + }, + { + "epoch": 2.3711995929271086, + "ewc_loss": 0.008462194353342056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462194819003344e-05, + "grad_norm": 4.154604434967041, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8689243197441101, + "num_tokens": 711229754.0, + "step": 18640 + }, + { + "epoch": 2.371326803205699, + "ewc_loss": 0.008477868512272835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477868686895818e-05, + "grad_norm": 4.1614251136779785, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8841221332550049, + "num_tokens": 711269071.0, + "step": 18641 + }, + { + "epoch": 2.3714540134842896, + "ewc_loss": 0.008464932441711426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.464932761853561e-05, + "grad_norm": 4.166867256164551, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8787404298782349, + "num_tokens": 711312036.0, + "step": 18642 + }, + { + "epoch": 2.37158122376288, + "ewc_loss": 0.008467691019177437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467691077385098e-05, + "grad_norm": 4.175421237945557, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.890430212020874, + "num_tokens": 711347945.0, + "step": 18643 + }, + { + "epoch": 2.3717084340414707, + "ewc_loss": 0.008460013195872307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46001275931485e-05, + "grad_norm": 4.224396228790283, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8623790144920349, + "num_tokens": 711388433.0, + "step": 18644 + }, + { + "epoch": 2.371835644320061, + "ewc_loss": 0.008466769941151142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.466769941151142e-05, + "grad_norm": 4.1928205490112305, + 
"learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8857330083847046, + "num_tokens": 711425821.0, + "step": 18645 + }, + { + "epoch": 2.3719628545986517, + "ewc_loss": 0.00842626765370369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.426267595496029e-05, + "grad_norm": 4.221559524536133, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8776161670684814, + "num_tokens": 711462725.0, + "step": 18646 + }, + { + "epoch": 2.3720900648772423, + "ewc_loss": 0.008467606268823147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467605948681012e-05, + "grad_norm": 4.177596092224121, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8948183059692383, + "num_tokens": 711499987.0, + "step": 18647 + }, + { + "epoch": 2.3722172751558324, + "ewc_loss": 0.008401446975767612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401447121286765e-05, + "grad_norm": 4.131303310394287, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8680654168128967, + "num_tokens": 711543658.0, + "step": 18648 + }, + { + "epoch": 2.3723444854344233, + "ewc_loss": 0.008411779999732971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.411780436290428e-05, + "grad_norm": 4.142454624176025, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8947978019714355, + "num_tokens": 711583875.0, + "step": 18649 + }, + { + "epoch": 2.3724716957130134, + "ewc_loss": 0.00841565802693367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415657794103026e-05, + "grad_norm": 4.1684489250183105, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8675261735916138, + "num_tokens": 711625543.0, + "step": 18650 + }, + { + "epoch": 2.372598905991604, + "ewc_loss": 0.008430182002484798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43018206069246e-05, + "grad_norm": 4.188570499420166, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8867694139480591, + "num_tokens": 711660528.0, + 
"step": 18651 + }, + { + "epoch": 2.3727261162701945, + "ewc_loss": 0.008419664576649666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.419664663961157e-05, + "grad_norm": 4.141347408294678, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8986743688583374, + "num_tokens": 711695646.0, + "step": 18652 + }, + { + "epoch": 2.372853326548785, + "ewc_loss": 0.00839314516633749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.393145253648981e-05, + "grad_norm": 4.187885761260986, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8687193989753723, + "num_tokens": 711734691.0, + "step": 18653 + }, + { + "epoch": 2.3729805368273755, + "ewc_loss": 0.008428754284977913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428754517808557e-05, + "grad_norm": 4.233245372772217, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8842734098434448, + "num_tokens": 711771518.0, + "step": 18654 + }, + { + "epoch": 2.373107747105966, + "ewc_loss": 0.00843166746199131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.431667083641514e-05, + "grad_norm": 4.214752197265625, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8731766939163208, + "num_tokens": 711808968.0, + "step": 18655 + }, + { + "epoch": 2.3732349573845566, + "ewc_loss": 0.00841604359447956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.416043419856578e-05, + "grad_norm": 4.187914848327637, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8890049457550049, + "num_tokens": 711842944.0, + "step": 18656 + }, + { + "epoch": 2.373362167663147, + "ewc_loss": 0.008441591635346413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44159148982726e-05, + "grad_norm": 4.222362518310547, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8800768852233887, + "num_tokens": 711882740.0, + "step": 18657 + }, + { + "epoch": 2.3734893779417376, + "ewc_loss": 0.008467388339340687, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.467388397548348e-05, + "grad_norm": 4.167087078094482, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8753104209899902, + "num_tokens": 711925362.0, + "step": 18658 + }, + { + "epoch": 2.373616588220328, + "ewc_loss": 0.008418134413659573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418134530074894e-05, + "grad_norm": 4.138608455657959, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8866649270057678, + "num_tokens": 711971285.0, + "step": 18659 + }, + { + "epoch": 2.3737437984989187, + "ewc_loss": 0.008415909484028816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415909542236477e-05, + "grad_norm": 4.142892360687256, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8786872029304504, + "num_tokens": 712013306.0, + "step": 18660 + }, + { + "epoch": 2.3738710087775092, + "ewc_loss": 0.008428437635302544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428437286056578e-05, + "grad_norm": 4.182833671569824, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8835241794586182, + "num_tokens": 712050837.0, + "step": 18661 + }, + { + "epoch": 2.3739982190560998, + "ewc_loss": 0.008433614857494831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433614857494831e-05, + "grad_norm": 4.207264423370361, + "learning_rate": 1e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.8980966806411743, + "num_tokens": 712085140.0, + "step": 18662 + }, + { + "epoch": 2.3741254293346903, + "ewc_loss": 0.00844606850296259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446068386547267e-05, + "grad_norm": 4.125637054443359, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8946444392204285, + "num_tokens": 712122495.0, + "step": 18663 + }, + { + "epoch": 2.374252639613281, + "ewc_loss": 0.008395264856517315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395264740101993e-05, + "grad_norm": 4.155132293701172, + "learning_rate": 1e-06, + "loss": 
0.3502, + "mean_token_accuracy": 0.8771820664405823, + "num_tokens": 712161312.0, + "step": 18664 + }, + { + "epoch": 2.3743798498918713, + "ewc_loss": 0.008433889597654343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433889161096886e-05, + "grad_norm": 4.186919212341309, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8813198208808899, + "num_tokens": 712197275.0, + "step": 18665 + }, + { + "epoch": 2.374507060170462, + "ewc_loss": 0.008443761616945267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443761907983571e-05, + "grad_norm": 4.242209434509277, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8699560761451721, + "num_tokens": 712231309.0, + "step": 18666 + }, + { + "epoch": 2.3746342704490524, + "ewc_loss": 0.0084513109177351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451310714008287e-05, + "grad_norm": 4.208855628967285, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8792685866355896, + "num_tokens": 712264864.0, + "step": 18667 + }, + { + "epoch": 2.374761480727643, + "ewc_loss": 0.008437003009021282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437003270955756e-05, + "grad_norm": 4.191784858703613, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8821557760238647, + "num_tokens": 712303564.0, + "step": 18668 + }, + { + "epoch": 2.3748886910062335, + "ewc_loss": 0.008439311757683754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439311932306737e-05, + "grad_norm": 4.145453453063965, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8920290470123291, + "num_tokens": 712341872.0, + "step": 18669 + }, + { + "epoch": 2.375015901284824, + "ewc_loss": 0.008412132039666176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.412131865043193e-05, + "grad_norm": 4.209977626800537, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8720762133598328, + "num_tokens": 712379835.0, + "step": 18670 + }, + { + "epoch": 
2.3751431115634145, + "ewc_loss": 0.008475998416543007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.475998765788972e-05, + "grad_norm": 4.181211948394775, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8767359256744385, + "num_tokens": 712417462.0, + "step": 18671 + }, + { + "epoch": 2.375270321842005, + "ewc_loss": 0.00842820294201374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428203000221401e-05, + "grad_norm": 4.173825740814209, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8604055643081665, + "num_tokens": 712461581.0, + "step": 18672 + }, + { + "epoch": 2.375397532120595, + "ewc_loss": 0.00844429712742567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.444296690868214e-05, + "grad_norm": 4.182796001434326, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8821424245834351, + "num_tokens": 712499615.0, + "step": 18673 + }, + { + "epoch": 2.375524742399186, + "ewc_loss": 0.008461163379251957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.461163815809414e-05, + "grad_norm": 4.153957843780518, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8929733633995056, + "num_tokens": 712533264.0, + "step": 18674 + }, + { + "epoch": 2.375651952677776, + "ewc_loss": 0.00844540074467659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.445401181234047e-05, + "grad_norm": 4.23336124420166, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8748239874839783, + "num_tokens": 712571496.0, + "step": 18675 + }, + { + "epoch": 2.3757791629563667, + "ewc_loss": 0.008498767390847206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498767419951037e-05, + "grad_norm": 4.142142295837402, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8909910917282104, + "num_tokens": 712606510.0, + "step": 18676 + }, + { + "epoch": 2.3759063732349572, + "ewc_loss": 0.008434242568910122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.434242772636935e-05, + "grad_norm": 4.164834022521973, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8914068937301636, + "num_tokens": 712646218.0, + "step": 18677 + }, + { + "epoch": 2.3760335835135478, + "ewc_loss": 0.008481842465698719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481842814944685e-05, + "grad_norm": 4.250416278839111, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8527587652206421, + "num_tokens": 712681905.0, + "step": 18678 + }, + { + "epoch": 2.3761607937921383, + "ewc_loss": 0.008543106727302074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.543106378056109e-05, + "grad_norm": 4.123055458068848, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8950575590133667, + "num_tokens": 712719916.0, + "step": 18679 + }, + { + "epoch": 2.376288004070729, + "ewc_loss": 0.008418586105108261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418585639446974e-05, + "grad_norm": 4.126830101013184, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8820303678512573, + "num_tokens": 712760849.0, + "step": 18680 + }, + { + "epoch": 2.3764152143493193, + "ewc_loss": 0.008484508842229843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.484508725814521e-05, + "grad_norm": 4.155488014221191, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8937495946884155, + "num_tokens": 712801199.0, + "step": 18681 + }, + { + "epoch": 2.37654242462791, + "ewc_loss": 0.008498867973685265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498867828166112e-05, + "grad_norm": 4.175378322601318, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8796258568763733, + "num_tokens": 712843471.0, + "step": 18682 + }, + { + "epoch": 2.3766696349065004, + "ewc_loss": 0.008469559252262115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469558815704659e-05, + "grad_norm": 4.183610916137695, + "learning_rate": 1e-06, + "loss": 0.3281, + 
"mean_token_accuracy": 0.8832601308822632, + "num_tokens": 712880649.0, + "step": 18683 + }, + { + "epoch": 2.376796845185091, + "ewc_loss": 0.008481764234602451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481764234602451e-05, + "grad_norm": 4.181107521057129, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8591800928115845, + "num_tokens": 712920574.0, + "step": 18684 + }, + { + "epoch": 2.3769240554636815, + "ewc_loss": 0.008463575504720211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.463575068162754e-05, + "grad_norm": 4.160715579986572, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.890669584274292, + "num_tokens": 712962544.0, + "step": 18685 + }, + { + "epoch": 2.377051265742272, + "ewc_loss": 0.008461816236376762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.461816469207406e-05, + "grad_norm": 4.178758144378662, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8862736225128174, + "num_tokens": 712999138.0, + "step": 18686 + }, + { + "epoch": 2.3771784760208625, + "ewc_loss": 0.008466934785246849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.466935105388984e-05, + "grad_norm": 4.152684688568115, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8848052024841309, + "num_tokens": 713041799.0, + "step": 18687 + }, + { + "epoch": 2.377305686299453, + "ewc_loss": 0.00845074187964201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450741734122857e-05, + "grad_norm": 4.125700950622559, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8748379945755005, + "num_tokens": 713088248.0, + "step": 18688 + }, + { + "epoch": 2.3774328965780436, + "ewc_loss": 0.008428378030657768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428378350799903e-05, + "grad_norm": 4.203563690185547, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8896874189376831, + "num_tokens": 713127239.0, + "step": 18689 + }, + { + "epoch": 
2.377560106856634, + "ewc_loss": 0.008477170020341873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477170194964856e-05, + "grad_norm": 4.183516025543213, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8833798766136169, + "num_tokens": 713168389.0, + "step": 18690 + }, + { + "epoch": 2.3776873171352246, + "ewc_loss": 0.008410322479903698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410322334384546e-05, + "grad_norm": 4.167831897735596, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8737373948097229, + "num_tokens": 713208137.0, + "step": 18691 + }, + { + "epoch": 2.377814527413815, + "ewc_loss": 0.008388380520045757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388380229007453e-05, + "grad_norm": 4.176464080810547, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8736342787742615, + "num_tokens": 713249344.0, + "step": 18692 + }, + { + "epoch": 2.3779417376924057, + "ewc_loss": 0.008388099260628223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388099377043545e-05, + "grad_norm": 4.179130554199219, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8785275220870972, + "num_tokens": 713291376.0, + "step": 18693 + }, + { + "epoch": 2.378068947970996, + "ewc_loss": 0.008375617675483227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375617471756414e-05, + "grad_norm": 4.191604137420654, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8798863291740417, + "num_tokens": 713329925.0, + "step": 18694 + }, + { + "epoch": 2.3781961582495867, + "ewc_loss": 0.008391052484512329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391052688239142e-05, + "grad_norm": 4.124865531921387, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8855702877044678, + "num_tokens": 713372509.0, + "step": 18695 + }, + { + "epoch": 2.378323368528177, + "ewc_loss": 0.008319724351167679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.319724292960018e-05, + "grad_norm": 4.197812080383301, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8777080774307251, + "num_tokens": 713410055.0, + "step": 18696 + }, + { + "epoch": 2.378450578806768, + "ewc_loss": 0.008407744579017162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40774446260184e-05, + "grad_norm": 4.175134658813477, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8792194724082947, + "num_tokens": 713453958.0, + "step": 18697 + }, + { + "epoch": 2.378577789085358, + "ewc_loss": 0.00835550669580698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35550672491081e-05, + "grad_norm": 4.205014705657959, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8832501769065857, + "num_tokens": 713485683.0, + "step": 18698 + }, + { + "epoch": 2.3787049993639484, + "ewc_loss": 0.008378899656236172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378899656236172e-05, + "grad_norm": 4.14758825302124, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8847581148147583, + "num_tokens": 713528565.0, + "step": 18699 + }, + { + "epoch": 2.378832209642539, + "ewc_loss": 0.008347324095666409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.3473241829779e-05, + "grad_norm": 4.29939603805542, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8781977295875549, + "num_tokens": 713560932.0, + "step": 18700 + }, + { + "epoch": 2.3789594199211295, + "ewc_loss": 0.00844205729663372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442057151114568e-05, + "grad_norm": 4.294613838195801, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8790910243988037, + "num_tokens": 713592891.0, + "step": 18701 + }, + { + "epoch": 2.37908663019972, + "ewc_loss": 0.00838954746723175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389547292608768e-05, + "grad_norm": 4.142861366271973, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 
0.8792121410369873, + "num_tokens": 713631285.0, + "step": 18702 + }, + { + "epoch": 2.3792138404783105, + "ewc_loss": 0.008324858732521534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.324858936248347e-05, + "grad_norm": 4.19451904296875, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.865395724773407, + "num_tokens": 713675143.0, + "step": 18703 + }, + { + "epoch": 2.379341050756901, + "ewc_loss": 0.008418846875429153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418846846325323e-05, + "grad_norm": 4.1237897872924805, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8867706060409546, + "num_tokens": 713717556.0, + "step": 18704 + }, + { + "epoch": 2.3794682610354916, + "ewc_loss": 0.008350405842065811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350405551027507e-05, + "grad_norm": 4.197852611541748, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8750104308128357, + "num_tokens": 713763515.0, + "step": 18705 + }, + { + "epoch": 2.379595471314082, + "ewc_loss": 0.008429856970906258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.429856825387105e-05, + "grad_norm": 4.2137451171875, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.887414813041687, + "num_tokens": 713801064.0, + "step": 18706 + }, + { + "epoch": 2.3797226815926726, + "ewc_loss": 0.008400351740419865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400351362070069e-05, + "grad_norm": 4.1396331787109375, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8792511224746704, + "num_tokens": 713841662.0, + "step": 18707 + }, + { + "epoch": 2.379849891871263, + "ewc_loss": 0.008354630321264267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354630699614063e-05, + "grad_norm": 4.120408535003662, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8844996094703674, + "num_tokens": 713883646.0, + "step": 18708 + }, + { + "epoch": 2.3799771021498537, + "ewc_loss": 
0.008386967703700066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38696796563454e-05, + "grad_norm": 4.128354549407959, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.9015622138977051, + "num_tokens": 713925365.0, + "step": 18709 + }, + { + "epoch": 2.3801043124284442, + "ewc_loss": 0.00837952084839344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.379521023016423e-05, + "grad_norm": 4.183574676513672, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8807195425033569, + "num_tokens": 713961142.0, + "step": 18710 + }, + { + "epoch": 2.3802315227070348, + "ewc_loss": 0.008404831402003765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.404831169173121e-05, + "grad_norm": 4.178014755249023, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8901073932647705, + "num_tokens": 714001652.0, + "step": 18711 + }, + { + "epoch": 2.3803587329856253, + "ewc_loss": 0.008382986299693584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382986561628059e-05, + "grad_norm": 4.117422103881836, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8839551210403442, + "num_tokens": 714043023.0, + "step": 18712 + }, + { + "epoch": 2.380485943264216, + "ewc_loss": 0.008349180221557617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.349180279765278e-05, + "grad_norm": 4.194089412689209, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8815584778785706, + "num_tokens": 714080389.0, + "step": 18713 + }, + { + "epoch": 2.3806131535428063, + "ewc_loss": 0.008405137807130814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.405137486988679e-05, + "grad_norm": 4.159552097320557, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.885033905506134, + "num_tokens": 714117695.0, + "step": 18714 + }, + { + "epoch": 2.380740363821397, + "ewc_loss": 0.00833807047456503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.338070620084181e-05, + "grad_norm": 
4.1852707862854, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8891634941101074, + "num_tokens": 714154204.0, + "step": 18715 + }, + { + "epoch": 2.3808675740999874, + "ewc_loss": 0.008378206752240658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.378206985071301e-05, + "grad_norm": 4.16584587097168, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8666552901268005, + "num_tokens": 714194857.0, + "step": 18716 + }, + { + "epoch": 2.380994784378578, + "ewc_loss": 0.008363252505660057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36325270938687e-05, + "grad_norm": 4.285299777984619, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8929201364517212, + "num_tokens": 714228034.0, + "step": 18717 + }, + { + "epoch": 2.3811219946571685, + "ewc_loss": 0.008419904857873917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.419904770562425e-05, + "grad_norm": 4.093845367431641, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8889084458351135, + "num_tokens": 714269688.0, + "step": 18718 + }, + { + "epoch": 2.381249204935759, + "ewc_loss": 0.008282149210572243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.28214906505309e-05, + "grad_norm": 4.208352565765381, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8759535551071167, + "num_tokens": 714309039.0, + "step": 18719 + }, + { + "epoch": 2.3813764152143495, + "ewc_loss": 0.00840782467275858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407824498135597e-05, + "grad_norm": 4.16414213180542, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8875932693481445, + "num_tokens": 714347453.0, + "step": 18720 + }, + { + "epoch": 2.3815036254929396, + "ewc_loss": 0.008339419960975647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.339419582625851e-05, + "grad_norm": 4.159745216369629, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8808090090751648, + "num_tokens": 
714385850.0, + "step": 18721 + }, + { + "epoch": 2.3816308357715306, + "ewc_loss": 0.008349391631782055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34939128253609e-05, + "grad_norm": 4.15214729309082, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8772984743118286, + "num_tokens": 714426735.0, + "step": 18722 + }, + { + "epoch": 2.3817580460501206, + "ewc_loss": 0.008358068764209747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358068589586765e-05, + "grad_norm": 4.191985130310059, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8936281204223633, + "num_tokens": 714465279.0, + "step": 18723 + }, + { + "epoch": 2.381885256328711, + "ewc_loss": 0.008381140418350697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38114065118134e-05, + "grad_norm": 4.1760172843933105, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8718479871749878, + "num_tokens": 714505703.0, + "step": 18724 + }, + { + "epoch": 2.3820124666073017, + "ewc_loss": 0.008349700830876827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.349701238330454e-05, + "grad_norm": 4.170414924621582, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8770752549171448, + "num_tokens": 714543642.0, + "step": 18725 + }, + { + "epoch": 2.3821396768858922, + "ewc_loss": 0.008359474129974842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.359474304597825e-05, + "grad_norm": 4.11945915222168, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8979278802871704, + "num_tokens": 714583533.0, + "step": 18726 + }, + { + "epoch": 2.3822668871644828, + "ewc_loss": 0.008354035206139088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354035526281223e-05, + "grad_norm": 4.148479461669922, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8873902559280396, + "num_tokens": 714625577.0, + "step": 18727 + }, + { + "epoch": 2.3823940974430733, + "ewc_loss": 0.008384749293327332, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.384749526157975e-05, + "grad_norm": 4.159157752990723, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8793503642082214, + "num_tokens": 714664987.0, + "step": 18728 + }, + { + "epoch": 2.382521307721664, + "ewc_loss": 0.008380923420190811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.380923100048676e-05, + "grad_norm": 4.175759792327881, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8757002353668213, + "num_tokens": 714705881.0, + "step": 18729 + }, + { + "epoch": 2.3826485180002543, + "ewc_loss": 0.008370825089514256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.370825526071712e-05, + "grad_norm": 4.192126750946045, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8741832971572876, + "num_tokens": 714746012.0, + "step": 18730 + }, + { + "epoch": 2.382775728278845, + "ewc_loss": 0.008367728441953659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.367728150915354e-05, + "grad_norm": 4.131962776184082, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8850822448730469, + "num_tokens": 714790001.0, + "step": 18731 + }, + { + "epoch": 2.3829029385574354, + "ewc_loss": 0.008333454839885235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.333454752573743e-05, + "grad_norm": 4.203722953796387, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8890888690948486, + "num_tokens": 714824515.0, + "step": 18732 + }, + { + "epoch": 2.383030148836026, + "ewc_loss": 0.008401223458349705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401223749388009e-05, + "grad_norm": 4.200077533721924, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.875930905342102, + "num_tokens": 714860841.0, + "step": 18733 + }, + { + "epoch": 2.3831573591146165, + "ewc_loss": 0.008368533104658127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.368532871827483e-05, + "grad_norm": 4.183951377868652, + 
"learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8839010000228882, + "num_tokens": 714896373.0, + "step": 18734 + }, + { + "epoch": 2.383284569393207, + "ewc_loss": 0.008350849151611328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350849384441972e-05, + "grad_norm": 4.250431537628174, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8839128017425537, + "num_tokens": 714926684.0, + "step": 18735 + }, + { + "epoch": 2.3834117796717975, + "ewc_loss": 0.008404281921684742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40428183437325e-05, + "grad_norm": 4.1677069664001465, + "learning_rate": 1e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.895403265953064, + "num_tokens": 714961057.0, + "step": 18736 + }, + { + "epoch": 2.383538989950388, + "ewc_loss": 0.008345292881131172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34529273561202e-05, + "grad_norm": 4.1817402839660645, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8895280957221985, + "num_tokens": 715001215.0, + "step": 18737 + }, + { + "epoch": 2.3836662002289786, + "ewc_loss": 0.008365786634385586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.365786197828129e-05, + "grad_norm": 4.211420059204102, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8631225824356079, + "num_tokens": 715038252.0, + "step": 18738 + }, + { + "epoch": 2.383793410507569, + "ewc_loss": 0.008395695127546787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395695476792753e-05, + "grad_norm": 4.134993553161621, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8810704350471497, + "num_tokens": 715078320.0, + "step": 18739 + }, + { + "epoch": 2.3839206207861596, + "ewc_loss": 0.00834655947983265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.346559479832649e-05, + "grad_norm": 4.175469398498535, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8780049681663513, + "num_tokens": 715115628.0, + 
"step": 18740 + }, + { + "epoch": 2.38404783106475, + "ewc_loss": 0.008399728685617447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399728540098295e-05, + "grad_norm": 4.228095054626465, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8910887241363525, + "num_tokens": 715153294.0, + "step": 18741 + }, + { + "epoch": 2.3841750413433407, + "ewc_loss": 0.008407742716372013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407742279814556e-05, + "grad_norm": 4.155141353607178, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8845473527908325, + "num_tokens": 715193736.0, + "step": 18742 + }, + { + "epoch": 2.384302251621931, + "ewc_loss": 0.008362788707017899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362788503291085e-05, + "grad_norm": 4.166301250457764, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8840109705924988, + "num_tokens": 715234539.0, + "step": 18743 + }, + { + "epoch": 2.3844294619005217, + "ewc_loss": 0.008389057591557503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389057620661333e-05, + "grad_norm": 4.16812801361084, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8953205347061157, + "num_tokens": 715271567.0, + "step": 18744 + }, + { + "epoch": 2.3845566721791123, + "ewc_loss": 0.008393626660108566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39362692204304e-05, + "grad_norm": 4.137494087219238, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.893251359462738, + "num_tokens": 715314850.0, + "step": 18745 + }, + { + "epoch": 2.3846838824577024, + "ewc_loss": 0.008369374088943005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369373972527683e-05, + "grad_norm": 4.181434631347656, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8890788555145264, + "num_tokens": 715353353.0, + "step": 18746 + }, + { + "epoch": 2.3848110927362933, + "ewc_loss": 0.008387735113501549, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.387734851567075e-05, + "grad_norm": 4.252295017242432, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8932250142097473, + "num_tokens": 715387843.0, + "step": 18747 + }, + { + "epoch": 2.3849383030148834, + "ewc_loss": 0.00840925145894289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409251313423738e-05, + "grad_norm": 4.1707024574279785, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8708720803260803, + "num_tokens": 715428037.0, + "step": 18748 + }, + { + "epoch": 2.385065513293474, + "ewc_loss": 0.008340292610228062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.340292697539553e-05, + "grad_norm": 4.202366828918457, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8958984613418579, + "num_tokens": 715464403.0, + "step": 18749 + }, + { + "epoch": 2.3851927235720645, + "ewc_loss": 0.00838543102145195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.385431283386424e-05, + "grad_norm": 4.1248459815979, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8922923803329468, + "num_tokens": 715501820.0, + "step": 18750 + }, + { + "epoch": 2.385319933850655, + "ewc_loss": 0.00835005659610033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.350056305062026e-05, + "grad_norm": 4.170551300048828, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8905684947967529, + "num_tokens": 715543287.0, + "step": 18751 + }, + { + "epoch": 2.3854471441292455, + "ewc_loss": 0.008374025113880634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.374024764634669e-05, + "grad_norm": 4.180945873260498, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8918551206588745, + "num_tokens": 715578189.0, + "step": 18752 + }, + { + "epoch": 2.385574354407836, + "ewc_loss": 0.008371125906705856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371126023121178e-05, + "grad_norm": 4.227999687194824, + "learning_rate": 1e-06, + "loss": 0.3569, 
+ "mean_token_accuracy": 0.8754465579986572, + "num_tokens": 715616447.0, + "step": 18753 + }, + { + "epoch": 2.3857015646864266, + "ewc_loss": 0.008391411043703556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391411392949522e-05, + "grad_norm": 4.198505401611328, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8859162330627441, + "num_tokens": 715651692.0, + "step": 18754 + }, + { + "epoch": 2.385828774965017, + "ewc_loss": 0.008373722434043884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37372281239368e-05, + "grad_norm": 4.235710620880127, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8759213089942932, + "num_tokens": 715687733.0, + "step": 18755 + }, + { + "epoch": 2.3859559852436076, + "ewc_loss": 0.008413376286625862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413376053795218e-05, + "grad_norm": 4.164441108703613, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8624323010444641, + "num_tokens": 715727481.0, + "step": 18756 + }, + { + "epoch": 2.386083195522198, + "ewc_loss": 0.008373377844691277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373377932002768e-05, + "grad_norm": 4.214193820953369, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8668116927146912, + "num_tokens": 715767549.0, + "step": 18757 + }, + { + "epoch": 2.3862104058007887, + "ewc_loss": 0.008426399901509285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.426400017924607e-05, + "grad_norm": 4.191216945648193, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8784183263778687, + "num_tokens": 715807251.0, + "step": 18758 + }, + { + "epoch": 2.386337616079379, + "ewc_loss": 0.008393334224820137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39333442854695e-05, + "grad_norm": 4.201234340667725, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8683598041534424, + "num_tokens": 715846893.0, + "step": 18759 + }, + { + "epoch": 
2.3864648263579697, + "ewc_loss": 0.008403990417718887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.403990068472922e-05, + "grad_norm": 4.258709907531738, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8707609176635742, + "num_tokens": 715883611.0, + "step": 18760 + }, + { + "epoch": 2.3865920366365603, + "ewc_loss": 0.00845128670334816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45128670334816e-05, + "grad_norm": 4.215350151062012, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8833452463150024, + "num_tokens": 715915598.0, + "step": 18761 + }, + { + "epoch": 2.386719246915151, + "ewc_loss": 0.008417351171374321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.417350909439847e-05, + "grad_norm": 4.140317916870117, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8875398635864258, + "num_tokens": 715956647.0, + "step": 18762 + }, + { + "epoch": 2.3868464571937413, + "ewc_loss": 0.00840908382087946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409083966398612e-05, + "grad_norm": 4.162890911102295, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8886712789535522, + "num_tokens": 715996627.0, + "step": 18763 + }, + { + "epoch": 2.386973667472332, + "ewc_loss": 0.008450170047581196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450169843854383e-05, + "grad_norm": 4.220975399017334, + "learning_rate": 1e-06, + "loss": 0.2766, + "mean_token_accuracy": 0.9016997814178467, + "num_tokens": 716027823.0, + "step": 18764 + }, + { + "epoch": 2.3871008777509224, + "ewc_loss": 0.008460146375000477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460146636934951e-05, + "grad_norm": 4.218960285186768, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8714989423751831, + "num_tokens": 716067430.0, + "step": 18765 + }, + { + "epoch": 2.387228088029513, + "ewc_loss": 0.008445918560028076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.445918501820415e-05, + "grad_norm": 4.201333045959473, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8743937611579895, + "num_tokens": 716109436.0, + "step": 18766 + }, + { + "epoch": 2.3873552983081034, + "ewc_loss": 0.008438549935817719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438550139544532e-05, + "grad_norm": 4.162247180938721, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8781114816665649, + "num_tokens": 716148157.0, + "step": 18767 + }, + { + "epoch": 2.387482508586694, + "ewc_loss": 0.008414601907134056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.414602052653208e-05, + "grad_norm": 4.187347888946533, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8849606513977051, + "num_tokens": 716184662.0, + "step": 18768 + }, + { + "epoch": 2.3876097188652845, + "ewc_loss": 0.008456151001155376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.456150681013241e-05, + "grad_norm": 4.176321983337402, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8763644099235535, + "num_tokens": 716227382.0, + "step": 18769 + }, + { + "epoch": 2.387736929143875, + "ewc_loss": 0.008454711176455021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454710769001395e-05, + "grad_norm": 4.193656921386719, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.885187029838562, + "num_tokens": 716262236.0, + "step": 18770 + }, + { + "epoch": 2.387864139422465, + "ewc_loss": 0.00845446065068245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454460476059467e-05, + "grad_norm": 4.190042972564697, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.890076756477356, + "num_tokens": 716295541.0, + "step": 18771 + }, + { + "epoch": 2.387991349701056, + "ewc_loss": 0.008455758914351463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455759234493598e-05, + "grad_norm": 4.177262306213379, + "learning_rate": 1e-06, + "loss": 0.3293, + 
"mean_token_accuracy": 0.8851537108421326, + "num_tokens": 716331796.0, + "step": 18772 + }, + { + "epoch": 2.388118559979646, + "ewc_loss": 0.008435639552772045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.435639756498858e-05, + "grad_norm": 4.16229772567749, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8747454285621643, + "num_tokens": 716369539.0, + "step": 18773 + }, + { + "epoch": 2.3882457702582367, + "ewc_loss": 0.008445804938673973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.445804996881634e-05, + "grad_norm": 4.127028465270996, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8986107110977173, + "num_tokens": 716409806.0, + "step": 18774 + }, + { + "epoch": 2.3883729805368272, + "ewc_loss": 0.008422896265983582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.422895916737616e-05, + "grad_norm": 4.170935153961182, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8762916922569275, + "num_tokens": 716448191.0, + "step": 18775 + }, + { + "epoch": 2.3885001908154178, + "ewc_loss": 0.00845316145569086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.453161717625335e-05, + "grad_norm": 4.172656059265137, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8845975399017334, + "num_tokens": 716486522.0, + "step": 18776 + }, + { + "epoch": 2.3886274010940083, + "ewc_loss": 0.00844880472868681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448804874205962e-05, + "grad_norm": 4.229432106018066, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8850951790809631, + "num_tokens": 716519383.0, + "step": 18777 + }, + { + "epoch": 2.388754611372599, + "ewc_loss": 0.008468071930110455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46807160996832e-05, + "grad_norm": 4.233096122741699, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8845292329788208, + "num_tokens": 716556136.0, + "step": 18778 + }, + { + "epoch": 
2.3888818216511893, + "ewc_loss": 0.008455623872578144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455623901681975e-05, + "grad_norm": 4.193314552307129, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8757538795471191, + "num_tokens": 716592346.0, + "step": 18779 + }, + { + "epoch": 2.38900903192978, + "ewc_loss": 0.008443066850304604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443067054031417e-05, + "grad_norm": 4.174949645996094, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8757730722427368, + "num_tokens": 716633996.0, + "step": 18780 + }, + { + "epoch": 2.3891362422083704, + "ewc_loss": 0.008442563936114311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442563557764515e-05, + "grad_norm": 4.16672420501709, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8788464069366455, + "num_tokens": 716669638.0, + "step": 18781 + }, + { + "epoch": 2.389263452486961, + "ewc_loss": 0.008463853038847446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.463853009743616e-05, + "grad_norm": 4.181301593780518, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8741346001625061, + "num_tokens": 716711194.0, + "step": 18782 + }, + { + "epoch": 2.3893906627655515, + "ewc_loss": 0.008454544469714165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454544877167791e-05, + "grad_norm": 4.177334308624268, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8885135054588318, + "num_tokens": 716753402.0, + "step": 18783 + }, + { + "epoch": 2.389517873044142, + "ewc_loss": 0.00844684150069952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446841093245894e-05, + "grad_norm": 4.138926029205322, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8846362829208374, + "num_tokens": 716794638.0, + "step": 18784 + }, + { + "epoch": 2.3896450833227325, + "ewc_loss": 0.008441389538347721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.441389218205586e-05, + "grad_norm": 4.2093400955200195, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8564797639846802, + "num_tokens": 716834003.0, + "step": 18785 + }, + { + "epoch": 2.389772293601323, + "ewc_loss": 0.008482111617922783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482112025376409e-05, + "grad_norm": 4.211348056793213, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8866870403289795, + "num_tokens": 716867349.0, + "step": 18786 + }, + { + "epoch": 2.3898995038799136, + "ewc_loss": 0.008469670079648495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469670137856156e-05, + "grad_norm": 4.206774711608887, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8817559480667114, + "num_tokens": 716904271.0, + "step": 18787 + }, + { + "epoch": 2.390026714158504, + "ewc_loss": 0.008455359376966953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45535978442058e-05, + "grad_norm": 4.238523006439209, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8867306709289551, + "num_tokens": 716936165.0, + "step": 18788 + }, + { + "epoch": 2.3901539244370946, + "ewc_loss": 0.008485697209835052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485696889692917e-05, + "grad_norm": 4.188501358032227, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8828237652778625, + "num_tokens": 716974011.0, + "step": 18789 + }, + { + "epoch": 2.390281134715685, + "ewc_loss": 0.008447856642305851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447856816928834e-05, + "grad_norm": 4.1825761795043945, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8829041719436646, + "num_tokens": 717012438.0, + "step": 18790 + }, + { + "epoch": 2.3904083449942757, + "ewc_loss": 0.00846741534769535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46741531859152e-05, + "grad_norm": 4.173134803771973, + "learning_rate": 1e-06, + "loss": 0.3789, + 
"mean_token_accuracy": 0.8735595941543579, + "num_tokens": 717051755.0, + "step": 18791 + }, + { + "epoch": 2.390535555272866, + "ewc_loss": 0.008456218987703323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.456219075014815e-05, + "grad_norm": 4.148294448852539, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8810721635818481, + "num_tokens": 717090740.0, + "step": 18792 + }, + { + "epoch": 2.3906627655514567, + "ewc_loss": 0.008462139405310154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462139521725476e-05, + "grad_norm": 4.2435784339904785, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8584566712379456, + "num_tokens": 717132033.0, + "step": 18793 + }, + { + "epoch": 2.390789975830047, + "ewc_loss": 0.008518056944012642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518056711181998e-05, + "grad_norm": 4.166304588317871, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8876940011978149, + "num_tokens": 717168255.0, + "step": 18794 + }, + { + "epoch": 2.390917186108638, + "ewc_loss": 0.0084408363327384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.440836245426908e-05, + "grad_norm": 4.181982040405273, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8809723258018494, + "num_tokens": 717206863.0, + "step": 18795 + }, + { + "epoch": 2.391044396387228, + "ewc_loss": 0.008480064570903778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480064570903778e-05, + "grad_norm": 4.185995101928711, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8633739352226257, + "num_tokens": 717245374.0, + "step": 18796 + }, + { + "epoch": 2.3911716066658184, + "ewc_loss": 0.008485455065965652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485455327900127e-05, + "grad_norm": 4.157175540924072, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.8988162279129028, + "num_tokens": 717279821.0, + "step": 18797 + }, + { + "epoch": 
2.391298816944409, + "ewc_loss": 0.008460285142064095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46028488012962e-05, + "grad_norm": 4.183443069458008, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8755180835723877, + "num_tokens": 717318896.0, + "step": 18798 + }, + { + "epoch": 2.3914260272229995, + "ewc_loss": 0.008483144454658031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483144483761862e-05, + "grad_norm": 4.1652092933654785, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.87269526720047, + "num_tokens": 717361493.0, + "step": 18799 + }, + { + "epoch": 2.39155323750159, + "ewc_loss": 0.008459929376840591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.459929085802287e-05, + "grad_norm": 4.175320625305176, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8753458857536316, + "num_tokens": 717402801.0, + "step": 18800 + }, + { + "epoch": 2.3916804477801805, + "ewc_loss": 0.008478786796331406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.478786912746727e-05, + "grad_norm": 4.143350124359131, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.898406982421875, + "num_tokens": 717441275.0, + "step": 18801 + }, + { + "epoch": 2.391807658058771, + "ewc_loss": 0.008454547263681889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454547059955075e-05, + "grad_norm": 4.226644992828369, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8696244955062866, + "num_tokens": 717477423.0, + "step": 18802 + }, + { + "epoch": 2.3919348683373616, + "ewc_loss": 0.008498918265104294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49891803227365e-05, + "grad_norm": 4.178762912750244, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8754400014877319, + "num_tokens": 717512978.0, + "step": 18803 + }, + { + "epoch": 2.392062078615952, + "ewc_loss": 0.00843940768390894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439407974947244e-05, + 
"grad_norm": 4.141790390014648, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.8995893597602844, + "num_tokens": 717549710.0, + "step": 18804 + }, + { + "epoch": 2.3921892888945426, + "ewc_loss": 0.00844587106257677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.445871208095923e-05, + "grad_norm": 4.197682857513428, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8768097758293152, + "num_tokens": 717586588.0, + "step": 18805 + }, + { + "epoch": 2.392316499173133, + "ewc_loss": 0.008499309420585632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499309478793293e-05, + "grad_norm": 4.194650173187256, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8945320248603821, + "num_tokens": 717621507.0, + "step": 18806 + }, + { + "epoch": 2.3924437094517237, + "ewc_loss": 0.008449072949588299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.449072629446164e-05, + "grad_norm": 4.194085597991943, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8837951421737671, + "num_tokens": 717655915.0, + "step": 18807 + }, + { + "epoch": 2.392570919730314, + "ewc_loss": 0.008457126095890999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.457125659333542e-05, + "grad_norm": 4.153282642364502, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8783661127090454, + "num_tokens": 717696357.0, + "step": 18808 + }, + { + "epoch": 2.3926981300089047, + "ewc_loss": 0.008408363908529282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.408363646594808e-05, + "grad_norm": 4.134424209594727, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8946707248687744, + "num_tokens": 717735397.0, + "step": 18809 + }, + { + "epoch": 2.3928253402874953, + "ewc_loss": 0.00844512227922678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.445122512057424e-05, + "grad_norm": 4.219412326812744, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8851996064186096, 
+ "num_tokens": 717770081.0, + "step": 18810 + }, + { + "epoch": 2.392952550566086, + "ewc_loss": 0.008487972430884838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.487972081638873e-05, + "grad_norm": 4.141535758972168, + "learning_rate": 1e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.8983780145645142, + "num_tokens": 717805262.0, + "step": 18811 + }, + { + "epoch": 2.3930797608446763, + "ewc_loss": 0.008401613682508469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40161374071613e-05, + "grad_norm": 4.21051025390625, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8827723264694214, + "num_tokens": 717841877.0, + "step": 18812 + }, + { + "epoch": 2.393206971123267, + "ewc_loss": 0.008468118496239185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.468118903692812e-05, + "grad_norm": 4.14837646484375, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8679327964782715, + "num_tokens": 717881435.0, + "step": 18813 + }, + { + "epoch": 2.3933341814018574, + "ewc_loss": 0.008429186418652534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.429186709690839e-05, + "grad_norm": 4.1264848709106445, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.900378942489624, + "num_tokens": 717922240.0, + "step": 18814 + }, + { + "epoch": 2.393461391680448, + "ewc_loss": 0.008427410386502743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.427410648437217e-05, + "grad_norm": 4.182112216949463, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.879326343536377, + "num_tokens": 717958438.0, + "step": 18815 + }, + { + "epoch": 2.3935886019590384, + "ewc_loss": 0.008462752215564251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46275215735659e-05, + "grad_norm": 4.239249229431152, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8744101524353027, + "num_tokens": 717994451.0, + "step": 18816 + }, + { + "epoch": 2.393715812237629, + "ewc_loss": 0.008462519384920597, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462519326712936e-05, + "grad_norm": 4.153939723968506, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8845093846321106, + "num_tokens": 718034318.0, + "step": 18817 + }, + { + "epoch": 2.3938430225162195, + "ewc_loss": 0.008392512798309326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.392512972932309e-05, + "grad_norm": 4.130911350250244, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8803962469100952, + "num_tokens": 718074994.0, + "step": 18818 + }, + { + "epoch": 2.3939702327948096, + "ewc_loss": 0.00842295028269291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.422950486419722e-05, + "grad_norm": 4.161361217498779, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8807398080825806, + "num_tokens": 718118849.0, + "step": 18819 + }, + { + "epoch": 2.3940974430734006, + "ewc_loss": 0.008421207778155804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421207894571126e-05, + "grad_norm": 4.211042881011963, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.887856662273407, + "num_tokens": 718152874.0, + "step": 18820 + }, + { + "epoch": 2.3942246533519906, + "ewc_loss": 0.008454546332359314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454546332359314e-05, + "grad_norm": 4.192424297332764, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.874704122543335, + "num_tokens": 718194206.0, + "step": 18821 + }, + { + "epoch": 2.394351863630581, + "ewc_loss": 0.008404630236327648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40463035274297e-05, + "grad_norm": 4.188783168792725, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8850005865097046, + "num_tokens": 718227408.0, + "step": 18822 + }, + { + "epoch": 2.3944790739091717, + "ewc_loss": 0.00843210518360138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432105096289888e-05, + "grad_norm": 4.187577247619629, + "learning_rate": 
1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8759263753890991, + "num_tokens": 718266962.0, + "step": 18823 + }, + { + "epoch": 2.3946062841877622, + "ewc_loss": 0.00840787310153246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407873247051612e-05, + "grad_norm": 4.146640300750732, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8865793347358704, + "num_tokens": 718307873.0, + "step": 18824 + }, + { + "epoch": 2.3947334944663528, + "ewc_loss": 0.008386805653572083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.386805711779743e-05, + "grad_norm": 4.139284133911133, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.888057291507721, + "num_tokens": 718350296.0, + "step": 18825 + }, + { + "epoch": 2.3948607047449433, + "ewc_loss": 0.008408574387431145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.408573921769857e-05, + "grad_norm": 4.189306259155273, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8860803842544556, + "num_tokens": 718390734.0, + "step": 18826 + }, + { + "epoch": 2.394987915023534, + "ewc_loss": 0.008421612903475761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421612437814474e-05, + "grad_norm": 4.0876054763793945, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8956506252288818, + "num_tokens": 718435630.0, + "step": 18827 + }, + { + "epoch": 2.3951151253021243, + "ewc_loss": 0.00834121648222208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.341216744156554e-05, + "grad_norm": 4.150134563446045, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8891729116439819, + "num_tokens": 718474791.0, + "step": 18828 + }, + { + "epoch": 2.395242335580715, + "ewc_loss": 0.008387095294892788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387095294892788e-05, + "grad_norm": 4.205446243286133, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8657044172286987, + "num_tokens": 718514957.0, + "step": 18829 + }, 
+ { + "epoch": 2.3953695458593054, + "ewc_loss": 0.008415170013904572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415169577347115e-05, + "grad_norm": 4.194693088531494, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8929705619812012, + "num_tokens": 718551761.0, + "step": 18830 + }, + { + "epoch": 2.395496756137896, + "ewc_loss": 0.008356163278222084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35616301628761e-05, + "grad_norm": 4.187994003295898, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8817960023880005, + "num_tokens": 718586445.0, + "step": 18831 + }, + { + "epoch": 2.3956239664164865, + "ewc_loss": 0.00837676040828228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.376760524697602e-05, + "grad_norm": 4.132553577423096, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8762431144714355, + "num_tokens": 718631105.0, + "step": 18832 + }, + { + "epoch": 2.395751176695077, + "ewc_loss": 0.00836950447410345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369504212168977e-05, + "grad_norm": 4.18471622467041, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8779255151748657, + "num_tokens": 718672278.0, + "step": 18833 + }, + { + "epoch": 2.3958783869736675, + "ewc_loss": 0.00841133389621973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.411333692492917e-05, + "grad_norm": 4.146510124206543, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8880935907363892, + "num_tokens": 718713191.0, + "step": 18834 + }, + { + "epoch": 2.396005597252258, + "ewc_loss": 0.008372674696147442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.372675074497238e-05, + "grad_norm": 4.232513427734375, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8759533166885376, + "num_tokens": 718745012.0, + "step": 18835 + }, + { + "epoch": 2.3961328075308486, + "ewc_loss": 0.008424723520874977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.424723637290299e-05, + "grad_norm": 4.183694362640381, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8821666240692139, + "num_tokens": 718780149.0, + "step": 18836 + }, + { + "epoch": 2.396260017809439, + "ewc_loss": 0.008384771645069122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.384771354030818e-05, + "grad_norm": 4.283016681671143, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8771911859512329, + "num_tokens": 718812831.0, + "step": 18837 + }, + { + "epoch": 2.3963872280880296, + "ewc_loss": 0.008464123122394085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.464122947771102e-05, + "grad_norm": 4.173100471496582, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.891220211982727, + "num_tokens": 718847650.0, + "step": 18838 + }, + { + "epoch": 2.39651443836662, + "ewc_loss": 0.008357114158570766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357113983947784e-05, + "grad_norm": 4.165576934814453, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8847471475601196, + "num_tokens": 718883417.0, + "step": 18839 + }, + { + "epoch": 2.3966416486452107, + "ewc_loss": 0.008409494534134865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409494330408052e-05, + "grad_norm": 4.1636061668396, + "learning_rate": 1e-06, + "loss": 0.2655, + "mean_token_accuracy": 0.9048315286636353, + "num_tokens": 718915501.0, + "step": 18840 + }, + { + "epoch": 2.396768858923801, + "ewc_loss": 0.008402523584663868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.402523235417902e-05, + "grad_norm": 4.16458797454834, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8958529233932495, + "num_tokens": 718954485.0, + "step": 18841 + }, + { + "epoch": 2.3968960692023917, + "ewc_loss": 0.0084103187546134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410318696405739e-05, + "grad_norm": 4.149068355560303, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 
0.8672379851341248, + "num_tokens": 718995815.0, + "step": 18842 + }, + { + "epoch": 2.3970232794809823, + "ewc_loss": 0.008410247042775154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410246664425358e-05, + "grad_norm": 4.125234603881836, + "learning_rate": 1e-06, + "loss": 0.282, + "mean_token_accuracy": 0.8960263729095459, + "num_tokens": 719032820.0, + "step": 18843 + }, + { + "epoch": 2.3971504897595723, + "ewc_loss": 0.008390809409320354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390809671254829e-05, + "grad_norm": 4.220587253570557, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8807273507118225, + "num_tokens": 719066966.0, + "step": 18844 + }, + { + "epoch": 2.3972777000381633, + "ewc_loss": 0.00845001358538866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450013410765678e-05, + "grad_norm": 4.187690258026123, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.882413923740387, + "num_tokens": 719108768.0, + "step": 18845 + }, + { + "epoch": 2.3974049103167534, + "ewc_loss": 0.008398409001529217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.398409408982843e-05, + "grad_norm": 4.160306453704834, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8796746730804443, + "num_tokens": 719151451.0, + "step": 18846 + }, + { + "epoch": 2.397532120595344, + "ewc_loss": 0.008389675058424473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389674621867016e-05, + "grad_norm": 4.194148540496826, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8746371269226074, + "num_tokens": 719192103.0, + "step": 18847 + }, + { + "epoch": 2.3976593308739345, + "ewc_loss": 0.008406545035541058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.406544657191262e-05, + "grad_norm": 4.223349571228027, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8792333602905273, + "num_tokens": 719226514.0, + "step": 18848 + }, + { + "epoch": 2.397786541152525, + "ewc_loss": 
0.008406889624893665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.406889537582174e-05, + "grad_norm": 4.132140636444092, + "learning_rate": 1e-06, + "loss": 0.273, + "mean_token_accuracy": 0.9029160737991333, + "num_tokens": 719262944.0, + "step": 18849 + }, + { + "epoch": 2.3979137514311155, + "ewc_loss": 0.00835795421153307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357954357052222e-05, + "grad_norm": 4.158572196960449, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8882921934127808, + "num_tokens": 719306707.0, + "step": 18850 + }, + { + "epoch": 2.398040961709706, + "ewc_loss": 0.008402722887694836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.402722596656531e-05, + "grad_norm": 4.17190408706665, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8875071406364441, + "num_tokens": 719347523.0, + "step": 18851 + }, + { + "epoch": 2.3981681719882966, + "ewc_loss": 0.008374745026230812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.374745084438473e-05, + "grad_norm": 4.218761444091797, + "learning_rate": 1e-06, + "loss": 0.274, + "mean_token_accuracy": 0.901591420173645, + "num_tokens": 719379474.0, + "step": 18852 + }, + { + "epoch": 2.398295382266887, + "ewc_loss": 0.00840392429381609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.403924584854394e-05, + "grad_norm": 4.181402206420898, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8676022887229919, + "num_tokens": 719421624.0, + "step": 18853 + }, + { + "epoch": 2.3984225925454776, + "ewc_loss": 0.0083827655762434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382765372516587e-05, + "grad_norm": 4.236775875091553, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8788834810256958, + "num_tokens": 719455570.0, + "step": 18854 + }, + { + "epoch": 2.398549802824068, + "ewc_loss": 0.008425773121416569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.425772830378264e-05, + "grad_norm": 4.091928958892822, 
+ "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8984189033508301, + "num_tokens": 719500300.0, + "step": 18855 + }, + { + "epoch": 2.3986770131026587, + "ewc_loss": 0.008335827849805355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.33582816994749e-05, + "grad_norm": 4.227662563323975, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8676387667655945, + "num_tokens": 719538119.0, + "step": 18856 + }, + { + "epoch": 2.398804223381249, + "ewc_loss": 0.008457429707050323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.457429794361815e-05, + "grad_norm": 4.203273773193359, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8741198778152466, + "num_tokens": 719574012.0, + "step": 18857 + }, + { + "epoch": 2.3989314336598397, + "ewc_loss": 0.008395812474191189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395812619710341e-05, + "grad_norm": 4.1627726554870605, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8927119970321655, + "num_tokens": 719609548.0, + "step": 18858 + }, + { + "epoch": 2.3990586439384303, + "ewc_loss": 0.008404280990362167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.404281106777489e-05, + "grad_norm": 4.157537937164307, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8769327402114868, + "num_tokens": 719647280.0, + "step": 18859 + }, + { + "epoch": 2.399185854217021, + "ewc_loss": 0.008413739502429962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413739124080166e-05, + "grad_norm": 4.1134772300720215, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8976490497589111, + "num_tokens": 719686987.0, + "step": 18860 + }, + { + "epoch": 2.3993130644956113, + "ewc_loss": 0.008389538154006004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389538561459631e-05, + "grad_norm": 4.120343208312988, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8999572396278381, + "num_tokens": 
719727974.0, + "step": 18861 + }, + { + "epoch": 2.399440274774202, + "ewc_loss": 0.008418846875429153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418846846325323e-05, + "grad_norm": 4.2031426429748535, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8843262791633606, + "num_tokens": 719764308.0, + "step": 18862 + }, + { + "epoch": 2.3995674850527924, + "ewc_loss": 0.00845405738800764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454057388007641e-05, + "grad_norm": 4.183961391448975, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8728702068328857, + "num_tokens": 719801872.0, + "step": 18863 + }, + { + "epoch": 2.399694695331383, + "ewc_loss": 0.008415902964770794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415902993874624e-05, + "grad_norm": 4.135198593139648, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8833308219909668, + "num_tokens": 719842504.0, + "step": 18864 + }, + { + "epoch": 2.3998219056099734, + "ewc_loss": 0.008387639187276363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387638808926567e-05, + "grad_norm": 4.186229228973389, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8767731785774231, + "num_tokens": 719878885.0, + "step": 18865 + }, + { + "epoch": 2.399949115888564, + "ewc_loss": 0.008449619635939598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44961978145875e-05, + "grad_norm": 4.169117450714111, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.865960955619812, + "num_tokens": 719922891.0, + "step": 18866 + }, + { + "epoch": 2.4000763261671545, + "ewc_loss": 0.008400261402130127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400261867791414e-05, + "grad_norm": 4.193285942077637, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8798694610595703, + "num_tokens": 719960113.0, + "step": 18867 + }, + { + "epoch": 2.400203536445745, + "ewc_loss": 0.008416368626058102, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.416368655161932e-05, + "grad_norm": 4.117062091827393, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8811886310577393, + "num_tokens": 720006079.0, + "step": 18868 + }, + { + "epoch": 2.400330746724335, + "ewc_loss": 0.008356739766895771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356739999726415e-05, + "grad_norm": 4.284780502319336, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.869773805141449, + "num_tokens": 720042219.0, + "step": 18869 + }, + { + "epoch": 2.400457957002926, + "ewc_loss": 0.008492290042340755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49229036248289e-05, + "grad_norm": 4.155820846557617, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8840187788009644, + "num_tokens": 720082574.0, + "step": 18870 + }, + { + "epoch": 2.400585167281516, + "ewc_loss": 0.00832853652536869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.328536205226555e-05, + "grad_norm": 4.165641784667969, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8796170949935913, + "num_tokens": 720115583.0, + "step": 18871 + }, + { + "epoch": 2.4007123775601067, + "ewc_loss": 0.008393635973334312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.393635653192177e-05, + "grad_norm": 4.219550609588623, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8742181062698364, + "num_tokens": 720151350.0, + "step": 18872 + }, + { + "epoch": 2.400839587838697, + "ewc_loss": 0.008418122306466103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41812216094695e-05, + "grad_norm": 4.190129280090332, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8758485913276672, + "num_tokens": 720187072.0, + "step": 18873 + }, + { + "epoch": 2.4009667981172877, + "ewc_loss": 0.008387945592403412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387945126742125e-05, + "grad_norm": 4.189507007598877, + "learning_rate": 
1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8853026628494263, + "num_tokens": 720222249.0, + "step": 18874 + }, + { + "epoch": 2.4010940083958783, + "ewc_loss": 0.00838761031627655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387610432691872e-05, + "grad_norm": 4.100236892700195, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.881974458694458, + "num_tokens": 720264682.0, + "step": 18875 + }, + { + "epoch": 2.401221218674469, + "ewc_loss": 0.00835280679166317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352806617040187e-05, + "grad_norm": 4.141750812530518, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.878474235534668, + "num_tokens": 720309935.0, + "step": 18876 + }, + { + "epoch": 2.4013484289530593, + "ewc_loss": 0.00842139683663845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421397069469094e-05, + "grad_norm": 4.231445789337158, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8780359029769897, + "num_tokens": 720343992.0, + "step": 18877 + }, + { + "epoch": 2.40147563923165, + "ewc_loss": 0.008442269638180733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442269609076902e-05, + "grad_norm": 4.198583602905273, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.88335120677948, + "num_tokens": 720379707.0, + "step": 18878 + }, + { + "epoch": 2.4016028495102404, + "ewc_loss": 0.008417550474405289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.417550270678475e-05, + "grad_norm": 4.141071796417236, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8787259459495544, + "num_tokens": 720419796.0, + "step": 18879 + }, + { + "epoch": 2.401730059788831, + "ewc_loss": 0.00839842576533556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.398426143685356e-05, + "grad_norm": 4.123991012573242, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8842035531997681, + "num_tokens": 720460919.0, + "step": 18880 + }, + { + 
"epoch": 2.4018572700674214, + "ewc_loss": 0.008415807038545609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415806951234117e-05, + "grad_norm": 4.183228492736816, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8796432018280029, + "num_tokens": 720498403.0, + "step": 18881 + }, + { + "epoch": 2.401984480346012, + "ewc_loss": 0.008458017371594906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.458017691737041e-05, + "grad_norm": 4.167662143707275, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8837987184524536, + "num_tokens": 720534931.0, + "step": 18882 + }, + { + "epoch": 2.4021116906246025, + "ewc_loss": 0.008421061560511589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.42106164782308e-05, + "grad_norm": 4.1791887283325195, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.885624885559082, + "num_tokens": 720574062.0, + "step": 18883 + }, + { + "epoch": 2.402238900903193, + "ewc_loss": 0.008448038250207901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448037988273427e-05, + "grad_norm": 4.174696445465088, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.875590443611145, + "num_tokens": 720611552.0, + "step": 18884 + }, + { + "epoch": 2.4023661111817836, + "ewc_loss": 0.008437827229499817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437826909357682e-05, + "grad_norm": 4.215551376342773, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8801708221435547, + "num_tokens": 720647886.0, + "step": 18885 + }, + { + "epoch": 2.402493321460374, + "ewc_loss": 0.008448327891528606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448328298982233e-05, + "grad_norm": 4.126892566680908, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8878940939903259, + "num_tokens": 720688034.0, + "step": 18886 + }, + { + "epoch": 2.4026205317389646, + "ewc_loss": 0.008384880609810352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.38488049339503e-05, + "grad_norm": 4.140836238861084, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8892700672149658, + "num_tokens": 720727805.0, + "step": 18887 + }, + { + "epoch": 2.402747742017555, + "ewc_loss": 0.008441695012152195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.441694808425382e-05, + "grad_norm": 4.230064392089844, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8865844011306763, + "num_tokens": 720759717.0, + "step": 18888 + }, + { + "epoch": 2.4028749522961457, + "ewc_loss": 0.008465627208352089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.465626888209954e-05, + "grad_norm": 4.236095428466797, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8773936033248901, + "num_tokens": 720790435.0, + "step": 18889 + }, + { + "epoch": 2.403002162574736, + "ewc_loss": 0.008474546484649181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474546484649181e-05, + "grad_norm": 4.150788307189941, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8792402148246765, + "num_tokens": 720831715.0, + "step": 18890 + }, + { + "epoch": 2.4031293728533267, + "ewc_loss": 0.008417108096182346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.417107892455533e-05, + "grad_norm": 4.139182090759277, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.880413293838501, + "num_tokens": 720869598.0, + "step": 18891 + }, + { + "epoch": 2.403256583131917, + "ewc_loss": 0.008455639705061913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455639908788726e-05, + "grad_norm": 4.162039756774902, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8779909014701843, + "num_tokens": 720911122.0, + "step": 18892 + }, + { + "epoch": 2.403383793410508, + "ewc_loss": 0.00846446119248867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.464461279800162e-05, + "grad_norm": 4.186795711517334, + "learning_rate": 1e-06, + "loss": 0.3252, + 
"mean_token_accuracy": 0.8863343000411987, + "num_tokens": 720952265.0, + "step": 18893 + }, + { + "epoch": 2.403511003689098, + "ewc_loss": 0.008468549698591232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.468549640383571e-05, + "grad_norm": 4.179641246795654, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8854888081550598, + "num_tokens": 720991191.0, + "step": 18894 + }, + { + "epoch": 2.4036382139676884, + "ewc_loss": 0.008459596894681454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.459596574539319e-05, + "grad_norm": 4.1336822509765625, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8928853273391724, + "num_tokens": 721032524.0, + "step": 18895 + }, + { + "epoch": 2.403765424246279, + "ewc_loss": 0.008429758250713348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.429758599959314e-05, + "grad_norm": 4.2108588218688965, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8828799724578857, + "num_tokens": 721072699.0, + "step": 18896 + }, + { + "epoch": 2.4038926345248695, + "ewc_loss": 0.008496039547026157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.496039663441479e-05, + "grad_norm": 4.152586936950684, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.894163966178894, + "num_tokens": 721109467.0, + "step": 18897 + }, + { + "epoch": 2.40401984480346, + "ewc_loss": 0.008413557894527912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413557952735573e-05, + "grad_norm": 4.252681732177734, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8607713580131531, + "num_tokens": 721142108.0, + "step": 18898 + }, + { + "epoch": 2.4041470550820505, + "ewc_loss": 0.008494357578456402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49435746204108e-05, + "grad_norm": 4.169596195220947, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.875580370426178, + "num_tokens": 721179875.0, + "step": 18899 + }, + { + "epoch": 
2.404274265360641, + "ewc_loss": 0.008432178758084774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432178583461791e-05, + "grad_norm": 4.20407247543335, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8795956373214722, + "num_tokens": 721217141.0, + "step": 18900 + }, + { + "epoch": 2.4044014756392316, + "ewc_loss": 0.008481474593281746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481474651489407e-05, + "grad_norm": 4.148609161376953, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8768522143363953, + "num_tokens": 721256357.0, + "step": 18901 + }, + { + "epoch": 2.404528685917822, + "ewc_loss": 0.00843520276248455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.435203199042007e-05, + "grad_norm": 4.173372745513916, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8817870020866394, + "num_tokens": 721292610.0, + "step": 18902 + }, + { + "epoch": 2.4046558961964126, + "ewc_loss": 0.008490301668643951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490301843266934e-05, + "grad_norm": 4.202742099761963, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8889961242675781, + "num_tokens": 721329048.0, + "step": 18903 + }, + { + "epoch": 2.404783106475003, + "ewc_loss": 0.008500492200255394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500491821905598e-05, + "grad_norm": 4.143767356872559, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8967599868774414, + "num_tokens": 721364249.0, + "step": 18904 + }, + { + "epoch": 2.4049103167535937, + "ewc_loss": 0.00847641285508871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47641276777722e-05, + "grad_norm": 4.256538391113281, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8905594944953918, + "num_tokens": 721391154.0, + "step": 18905 + }, + { + "epoch": 2.405037527032184, + "ewc_loss": 0.008568648248910904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5686486272607e-05, 
+ "grad_norm": 4.165169715881348, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8935190439224243, + "num_tokens": 721428443.0, + "step": 18906 + }, + { + "epoch": 2.4051647373107747, + "ewc_loss": 0.008487233892083168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.487233571941033e-05, + "grad_norm": 4.136873722076416, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8717753887176514, + "num_tokens": 721474545.0, + "step": 18907 + }, + { + "epoch": 2.4052919475893653, + "ewc_loss": 0.008515751920640469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515751687809825e-05, + "grad_norm": 4.177998065948486, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8840330243110657, + "num_tokens": 721513153.0, + "step": 18908 + }, + { + "epoch": 2.405419157867956, + "ewc_loss": 0.008555304259061813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555304520996287e-05, + "grad_norm": 4.16269063949585, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8785839080810547, + "num_tokens": 721556287.0, + "step": 18909 + }, + { + "epoch": 2.4055463681465463, + "ewc_loss": 0.00851436983793974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514369983458892e-05, + "grad_norm": 4.202416896820068, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8890657424926758, + "num_tokens": 721590219.0, + "step": 18910 + }, + { + "epoch": 2.405673578425137, + "ewc_loss": 0.008532593958079815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532594074495137e-05, + "grad_norm": 4.158339977264404, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8823888301849365, + "num_tokens": 721627308.0, + "step": 18911 + }, + { + "epoch": 2.4058007887037274, + "ewc_loss": 0.008492951281368732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492951019434258e-05, + "grad_norm": 4.200314998626709, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 
0.8773348927497864, + "num_tokens": 721664218.0, + "step": 18912 + }, + { + "epoch": 2.405927998982318, + "ewc_loss": 0.0085523696616292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552369399694726e-05, + "grad_norm": 4.17122745513916, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.893444836139679, + "num_tokens": 721703511.0, + "step": 18913 + }, + { + "epoch": 2.4060552092609084, + "ewc_loss": 0.008485566824674606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485566650051624e-05, + "grad_norm": 4.161525249481201, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8941285610198975, + "num_tokens": 721741512.0, + "step": 18914 + }, + { + "epoch": 2.406182419539499, + "ewc_loss": 0.008487883023917675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48788331495598e-05, + "grad_norm": 4.159763813018799, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8843522071838379, + "num_tokens": 721778443.0, + "step": 18915 + }, + { + "epoch": 2.4063096298180895, + "ewc_loss": 0.008486718870699406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48671916173771e-05, + "grad_norm": 4.185885429382324, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8797838687896729, + "num_tokens": 721817226.0, + "step": 18916 + }, + { + "epoch": 2.4064368400966796, + "ewc_loss": 0.008491064421832561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4910643636249e-05, + "grad_norm": 4.15709924697876, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.870587170124054, + "num_tokens": 721859419.0, + "step": 18917 + }, + { + "epoch": 2.4065640503752705, + "ewc_loss": 0.008465922437608242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46592229208909e-05, + "grad_norm": 4.154128551483154, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8956973552703857, + "num_tokens": 721895183.0, + "step": 18918 + }, + { + "epoch": 2.4066912606538606, + "ewc_loss": 
0.008484816178679466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48481577122584e-05, + "grad_norm": 4.19503116607666, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8702870011329651, + "num_tokens": 721933566.0, + "step": 18919 + }, + { + "epoch": 2.406818470932451, + "ewc_loss": 0.008486617356538773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486617298331112e-05, + "grad_norm": 4.186709403991699, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8863521814346313, + "num_tokens": 721968827.0, + "step": 18920 + }, + { + "epoch": 2.4069456812110417, + "ewc_loss": 0.008473494090139866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473493653582409e-05, + "grad_norm": 4.196563720703125, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8852296471595764, + "num_tokens": 722004989.0, + "step": 18921 + }, + { + "epoch": 2.407072891489632, + "ewc_loss": 0.008489895612001419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489895844832063e-05, + "grad_norm": 4.191123008728027, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8845070600509644, + "num_tokens": 722043681.0, + "step": 18922 + }, + { + "epoch": 2.4072001017682227, + "ewc_loss": 0.008473187685012817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473188063362613e-05, + "grad_norm": 4.158073902130127, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8814199566841125, + "num_tokens": 722086541.0, + "step": 18923 + }, + { + "epoch": 2.4073273120468133, + "ewc_loss": 0.008460568264126778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460567914880812e-05, + "grad_norm": 4.173388481140137, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8839194774627686, + "num_tokens": 722125519.0, + "step": 18924 + }, + { + "epoch": 2.407454522325404, + "ewc_loss": 0.008467121049761772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467120642308146e-05, + "grad_norm": 
4.247857093811035, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8786524534225464, + "num_tokens": 722157665.0, + "step": 18925 + }, + { + "epoch": 2.4075817326039943, + "ewc_loss": 0.008507197722792625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50719734444283e-05, + "grad_norm": 4.177619457244873, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8935307264328003, + "num_tokens": 722191819.0, + "step": 18926 + }, + { + "epoch": 2.407708942882585, + "ewc_loss": 0.008441939018666744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.441939280601218e-05, + "grad_norm": 4.180849075317383, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8721877932548523, + "num_tokens": 722229755.0, + "step": 18927 + }, + { + "epoch": 2.4078361531611754, + "ewc_loss": 0.008478675037622452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.478674862999469e-05, + "grad_norm": 4.2381415367126465, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8728325366973877, + "num_tokens": 722267440.0, + "step": 18928 + }, + { + "epoch": 2.407963363439766, + "ewc_loss": 0.008488102816045284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.488103048875928e-05, + "grad_norm": 4.1291704177856445, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8774970173835754, + "num_tokens": 722310102.0, + "step": 18929 + }, + { + "epoch": 2.4080905737183564, + "ewc_loss": 0.008415856398642063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415856427745894e-05, + "grad_norm": 4.184184551239014, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8752079010009766, + "num_tokens": 722346713.0, + "step": 18930 + }, + { + "epoch": 2.408217783996947, + "ewc_loss": 0.008472357876598835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.472357876598835e-05, + "grad_norm": 4.148599147796631, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8779401183128357, + 
"num_tokens": 722387803.0, + "step": 18931 + }, + { + "epoch": 2.4083449942755375, + "ewc_loss": 0.008446386083960533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446386345895007e-05, + "grad_norm": 4.190932273864746, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8732913732528687, + "num_tokens": 722428943.0, + "step": 18932 + }, + { + "epoch": 2.408472204554128, + "ewc_loss": 0.00846297387033701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462974074063823e-05, + "grad_norm": 4.193066120147705, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8751630783081055, + "num_tokens": 722467860.0, + "step": 18933 + }, + { + "epoch": 2.4085994148327186, + "ewc_loss": 0.008462886326014996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462886034976691e-05, + "grad_norm": 4.179793357849121, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8778858184814453, + "num_tokens": 722503067.0, + "step": 18934 + }, + { + "epoch": 2.408726625111309, + "ewc_loss": 0.008438552729785442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438553049927577e-05, + "grad_norm": 4.209889888763428, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8837260007858276, + "num_tokens": 722537842.0, + "step": 18935 + }, + { + "epoch": 2.4088538353898996, + "ewc_loss": 0.008466718718409538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.466719009447843e-05, + "grad_norm": 4.197629451751709, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8878475427627563, + "num_tokens": 722574036.0, + "step": 18936 + }, + { + "epoch": 2.40898104566849, + "ewc_loss": 0.00844715815037489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447158324997872e-05, + "grad_norm": 4.2561845779418945, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.884110689163208, + "num_tokens": 722606087.0, + "step": 18937 + }, + { + "epoch": 2.4091082559470807, + "ewc_loss": 0.008474153466522694, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474153582938015e-05, + "grad_norm": 4.1343817710876465, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8825350999832153, + "num_tokens": 722648745.0, + "step": 18938 + }, + { + "epoch": 2.409235466225671, + "ewc_loss": 0.008400452323257923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400452497880906e-05, + "grad_norm": 4.158937931060791, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8759005665779114, + "num_tokens": 722688589.0, + "step": 18939 + }, + { + "epoch": 2.4093626765042617, + "ewc_loss": 0.008447193540632725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44719324959442e-05, + "grad_norm": 4.16554069519043, + "learning_rate": 1e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8991824388504028, + "num_tokens": 722723941.0, + "step": 18940 + }, + { + "epoch": 2.4094898867828523, + "ewc_loss": 0.008449194021522999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44919413793832e-05, + "grad_norm": 4.188415050506592, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8645938634872437, + "num_tokens": 722768242.0, + "step": 18941 + }, + { + "epoch": 2.4096170970614423, + "ewc_loss": 0.008435079827904701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.435079507762566e-05, + "grad_norm": 4.20358419418335, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8826361894607544, + "num_tokens": 722800970.0, + "step": 18942 + }, + { + "epoch": 2.4097443073400333, + "ewc_loss": 0.008470472879707813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470472675981e-05, + "grad_norm": 4.214871406555176, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.874427318572998, + "num_tokens": 722839168.0, + "step": 18943 + }, + { + "epoch": 2.4098715176186234, + "ewc_loss": 0.008443126454949379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443125989288092e-05, + "grad_norm": 4.1080803871154785, + "learning_rate": 
1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8930231332778931, + "num_tokens": 722883451.0, + "step": 18944 + }, + { + "epoch": 2.409998727897214, + "ewc_loss": 0.008395671844482422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395671466132626e-05, + "grad_norm": 4.144949913024902, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.882961094379425, + "num_tokens": 722928581.0, + "step": 18945 + }, + { + "epoch": 2.4101259381758044, + "ewc_loss": 0.008443833328783512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44383321236819e-05, + "grad_norm": 4.150610446929932, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8931812644004822, + "num_tokens": 722967141.0, + "step": 18946 + }, + { + "epoch": 2.410253148454395, + "ewc_loss": 0.008433792740106583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433792390860617e-05, + "grad_norm": 4.180354118347168, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8782119750976562, + "num_tokens": 723006263.0, + "step": 18947 + }, + { + "epoch": 2.4103803587329855, + "ewc_loss": 0.008430792950093746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.430793241132051e-05, + "grad_norm": 4.127805709838867, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8838664293289185, + "num_tokens": 723050843.0, + "step": 18948 + }, + { + "epoch": 2.410507569011576, + "ewc_loss": 0.008387467823922634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387467823922634e-05, + "grad_norm": 4.1789445877075195, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8812090754508972, + "num_tokens": 723092124.0, + "step": 18949 + }, + { + "epoch": 2.4106347792901666, + "ewc_loss": 0.008433505892753601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433505718130618e-05, + "grad_norm": 4.219305038452148, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8946307897567749, + "num_tokens": 723122478.0, + "step": 18950 + }, + 
{ + "epoch": 2.410761989568757, + "ewc_loss": 0.008418838493525982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418838115176186e-05, + "grad_norm": 4.1307830810546875, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8764880299568176, + "num_tokens": 723163054.0, + "step": 18951 + }, + { + "epoch": 2.4108891998473476, + "ewc_loss": 0.00837706495076418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377064659725875e-05, + "grad_norm": 4.167438983917236, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8820807933807373, + "num_tokens": 723201518.0, + "step": 18952 + }, + { + "epoch": 2.411016410125938, + "ewc_loss": 0.008416631259024143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.416631317231804e-05, + "grad_norm": 4.171872138977051, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8824682235717773, + "num_tokens": 723235685.0, + "step": 18953 + }, + { + "epoch": 2.4111436204045287, + "ewc_loss": 0.00841823872178793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418238576268777e-05, + "grad_norm": 4.282522201538086, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8842024803161621, + "num_tokens": 723266510.0, + "step": 18954 + }, + { + "epoch": 2.411270830683119, + "ewc_loss": 0.008489425294101238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489425090374425e-05, + "grad_norm": 4.136961936950684, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8847944736480713, + "num_tokens": 723302800.0, + "step": 18955 + }, + { + "epoch": 2.4113980409617097, + "ewc_loss": 0.008384316228330135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38431587908417e-05, + "grad_norm": 4.218189239501953, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8838135004043579, + "num_tokens": 723340669.0, + "step": 18956 + }, + { + "epoch": 2.4115252512403003, + "ewc_loss": 0.008494069799780846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.49407006171532e-05, + "grad_norm": 4.1626877784729, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.885282039642334, + "num_tokens": 723377556.0, + "step": 18957 + }, + { + "epoch": 2.411652461518891, + "ewc_loss": 0.008430041372776031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.430040907114744e-05, + "grad_norm": 4.222700119018555, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8872536420822144, + "num_tokens": 723410942.0, + "step": 18958 + }, + { + "epoch": 2.4117796717974813, + "ewc_loss": 0.008483662270009518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483662531943992e-05, + "grad_norm": 4.123739242553711, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8955804109573364, + "num_tokens": 723451131.0, + "step": 18959 + }, + { + "epoch": 2.411906882076072, + "ewc_loss": 0.008427733555436134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.427733700955287e-05, + "grad_norm": 4.260362148284912, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8868014812469482, + "num_tokens": 723485876.0, + "step": 18960 + }, + { + "epoch": 2.4120340923546624, + "ewc_loss": 0.008515940047800541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515940135112032e-05, + "grad_norm": 4.151015281677246, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8908766508102417, + "num_tokens": 723521252.0, + "step": 18961 + }, + { + "epoch": 2.412161302633253, + "ewc_loss": 0.008408473804593086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.408473513554782e-05, + "grad_norm": 4.217153549194336, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8691397905349731, + "num_tokens": 723561094.0, + "step": 18962 + }, + { + "epoch": 2.4122885129118434, + "ewc_loss": 0.00849919579923153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499195973854512e-05, + "grad_norm": 4.208755970001221, + "learning_rate": 1e-06, + "loss": 0.3443, + 
"mean_token_accuracy": 0.8801875114440918, + "num_tokens": 723597281.0, + "step": 18963 + }, + { + "epoch": 2.412415723190434, + "ewc_loss": 0.008469275198876858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469275053357705e-05, + "grad_norm": 4.138705253601074, + "learning_rate": 1e-06, + "loss": 0.2717, + "mean_token_accuracy": 0.9034488201141357, + "num_tokens": 723637690.0, + "step": 18964 + }, + { + "epoch": 2.4125429334690245, + "ewc_loss": 0.008428601548075676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428601722698659e-05, + "grad_norm": 4.186845302581787, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8833651542663574, + "num_tokens": 723676250.0, + "step": 18965 + }, + { + "epoch": 2.412670143747615, + "ewc_loss": 0.008484629914164543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.484630234306678e-05, + "grad_norm": 4.16854190826416, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8928313255310059, + "num_tokens": 723713157.0, + "step": 18966 + }, + { + "epoch": 2.412797354026205, + "ewc_loss": 0.00844718236476183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447182335657999e-05, + "grad_norm": 4.268925666809082, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8881603479385376, + "num_tokens": 723745877.0, + "step": 18967 + }, + { + "epoch": 2.412924564304796, + "ewc_loss": 0.008534555323421955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.534554945072159e-05, + "grad_norm": 4.199669361114502, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8832371234893799, + "num_tokens": 723784285.0, + "step": 18968 + }, + { + "epoch": 2.413051774583386, + "ewc_loss": 0.00845177099108696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451771282125264e-05, + "grad_norm": 4.258481502532959, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8716033101081848, + "num_tokens": 723817085.0, + "step": 18969 + }, + { + "epoch": 
2.4131789848619767, + "ewc_loss": 0.008515812456607819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515812078258023e-05, + "grad_norm": 4.182490348815918, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8774352073669434, + "num_tokens": 723858466.0, + "step": 18970 + }, + { + "epoch": 2.413306195140567, + "ewc_loss": 0.008446739986538887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446739957435057e-05, + "grad_norm": 4.158337116241455, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8813436031341553, + "num_tokens": 723896627.0, + "step": 18971 + }, + { + "epoch": 2.4134334054191577, + "ewc_loss": 0.008472681976854801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.472681656712666e-05, + "grad_norm": 4.239947319030762, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8783180117607117, + "num_tokens": 723930727.0, + "step": 18972 + }, + { + "epoch": 2.4135606156977483, + "ewc_loss": 0.008535283617675304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5352839960251e-05, + "grad_norm": 4.242867946624756, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8734081983566284, + "num_tokens": 723965996.0, + "step": 18973 + }, + { + "epoch": 2.413687825976339, + "ewc_loss": 0.00848834216594696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.488341700285673e-05, + "grad_norm": 4.17786979675293, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8781031966209412, + "num_tokens": 724002441.0, + "step": 18974 + }, + { + "epoch": 2.4138150362549293, + "ewc_loss": 0.008474004454910755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474004425806925e-05, + "grad_norm": 4.217321395874023, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8869122266769409, + "num_tokens": 724039238.0, + "step": 18975 + }, + { + "epoch": 2.41394224653352, + "ewc_loss": 0.008526209741830826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.526210149284452e-05, + "grad_norm": 4.175001621246338, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8874282240867615, + "num_tokens": 724077430.0, + "step": 18976 + }, + { + "epoch": 2.4140694568121104, + "ewc_loss": 0.00848739966750145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.487399463774636e-05, + "grad_norm": 4.171485900878906, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8769757747650146, + "num_tokens": 724113175.0, + "step": 18977 + }, + { + "epoch": 2.414196667090701, + "ewc_loss": 0.008519391529262066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519391849404201e-05, + "grad_norm": 4.176435947418213, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.877938985824585, + "num_tokens": 724156409.0, + "step": 18978 + }, + { + "epoch": 2.4143238773692914, + "ewc_loss": 0.00849927868694067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499278919771314e-05, + "grad_norm": 4.142507076263428, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8829118609428406, + "num_tokens": 724199879.0, + "step": 18979 + }, + { + "epoch": 2.414451087647882, + "ewc_loss": 0.00847853347659111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.478533709421754e-05, + "grad_norm": 4.144087314605713, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8940925598144531, + "num_tokens": 724238065.0, + "step": 18980 + }, + { + "epoch": 2.4145782979264725, + "ewc_loss": 0.008495308458805084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495308429701254e-05, + "grad_norm": 4.250709533691406, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8767856359481812, + "num_tokens": 724272336.0, + "step": 18981 + }, + { + "epoch": 2.414705508205063, + "ewc_loss": 0.00854399986565113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54399986565113e-05, + "grad_norm": 4.246435165405273, + "learning_rate": 1e-06, + "loss": 0.3649, + 
"mean_token_accuracy": 0.874177098274231, + "num_tokens": 724305200.0, + "step": 18982 + }, + { + "epoch": 2.4148327184836536, + "ewc_loss": 0.008514714427292347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514714136254042e-05, + "grad_norm": 4.209272384643555, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8877949714660645, + "num_tokens": 724341295.0, + "step": 18983 + }, + { + "epoch": 2.414959928762244, + "ewc_loss": 0.008501734584569931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5017345554661e-05, + "grad_norm": 4.158716201782227, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8868384957313538, + "num_tokens": 724384336.0, + "step": 18984 + }, + { + "epoch": 2.4150871390408346, + "ewc_loss": 0.00849409680813551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.494096982758492e-05, + "grad_norm": 4.23292350769043, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.881036639213562, + "num_tokens": 724417279.0, + "step": 18985 + }, + { + "epoch": 2.415214349319425, + "ewc_loss": 0.008532961830496788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532961510354653e-05, + "grad_norm": 4.178952217102051, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8808040618896484, + "num_tokens": 724456509.0, + "step": 18986 + }, + { + "epoch": 2.4153415595980157, + "ewc_loss": 0.008469471707940102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469471504213288e-05, + "grad_norm": 4.134390830993652, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8917023539543152, + "num_tokens": 724498812.0, + "step": 18987 + }, + { + "epoch": 2.415468769876606, + "ewc_loss": 0.008471488021314144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471487672068179e-05, + "grad_norm": 4.202711582183838, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8718114495277405, + "num_tokens": 724538174.0, + "step": 18988 + }, + { + "epoch": 
2.4155959801551967, + "ewc_loss": 0.00849834457039833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498344686813653e-05, + "grad_norm": 4.173620700836182, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8884057402610779, + "num_tokens": 724577171.0, + "step": 18989 + }, + { + "epoch": 2.415723190433787, + "ewc_loss": 0.008439195342361927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43919551698491e-05, + "grad_norm": 4.187703609466553, + "learning_rate": 1e-06, + "loss": 0.2781, + "mean_token_accuracy": 0.9032301306724548, + "num_tokens": 724610626.0, + "step": 18990 + }, + { + "epoch": 2.4158504007123778, + "ewc_loss": 0.008465683087706566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.465682913083583e-05, + "grad_norm": 4.194764614105225, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8945908546447754, + "num_tokens": 724647613.0, + "step": 18991 + }, + { + "epoch": 2.415977610990968, + "ewc_loss": 0.008456602692604065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.456602517981082e-05, + "grad_norm": 4.224451065063477, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8849592208862305, + "num_tokens": 724684898.0, + "step": 18992 + }, + { + "epoch": 2.4161048212695584, + "ewc_loss": 0.008474843576550484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47484334371984e-05, + "grad_norm": 4.168274879455566, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8805697560310364, + "num_tokens": 724725566.0, + "step": 18993 + }, + { + "epoch": 2.416232031548149, + "ewc_loss": 0.008405571803450584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.405571861658245e-05, + "grad_norm": 4.158798694610596, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8739067912101746, + "num_tokens": 724763701.0, + "step": 18994 + }, + { + "epoch": 2.4163592418267394, + "ewc_loss": 0.008452314883470535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.452314796159044e-05, + "grad_norm": 4.237847805023193, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8827452659606934, + "num_tokens": 724798108.0, + "step": 18995 + }, + { + "epoch": 2.41648645210533, + "ewc_loss": 0.008466542698442936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46654293127358e-05, + "grad_norm": 4.147669315338135, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8625173568725586, + "num_tokens": 724842430.0, + "step": 18996 + }, + { + "epoch": 2.4166136623839205, + "ewc_loss": 0.008386691100895405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.386691479245201e-05, + "grad_norm": 4.1835174560546875, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8780198097229004, + "num_tokens": 724884585.0, + "step": 18997 + }, + { + "epoch": 2.416740872662511, + "ewc_loss": 0.008443126454949379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443126716883853e-05, + "grad_norm": 4.1911468505859375, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8927441835403442, + "num_tokens": 724919338.0, + "step": 18998 + }, + { + "epoch": 2.4168680829411016, + "ewc_loss": 0.008427456952631474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.427457214565948e-05, + "grad_norm": 4.158108234405518, + "learning_rate": 1e-06, + "loss": 0.2802, + "mean_token_accuracy": 0.9032930135726929, + "num_tokens": 724956007.0, + "step": 18999 + }, + { + "epoch": 2.416995293219692, + "ewc_loss": 0.008396598510444164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396598423132673e-05, + "grad_norm": 4.140059471130371, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.899640679359436, + "num_tokens": 724996994.0, + "step": 19000 + }, + { + "epoch": 2.4171225034982826, + "ewc_loss": 0.0083958450704813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395844633923844e-05, + "grad_norm": 4.2353668212890625, + "learning_rate": 1e-06, + "loss": 0.3636, + 
"mean_token_accuracy": 0.8769635558128357, + "num_tokens": 725035607.0, + "step": 19001 + }, + { + "epoch": 2.417249713776873, + "ewc_loss": 0.008438513614237309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438513759756461e-05, + "grad_norm": 4.190253257751465, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8862136006355286, + "num_tokens": 725075370.0, + "step": 19002 + }, + { + "epoch": 2.4173769240554637, + "ewc_loss": 0.008375558070838451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.375557808903977e-05, + "grad_norm": 4.139847755432129, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8806519508361816, + "num_tokens": 725119536.0, + "step": 19003 + }, + { + "epoch": 2.417504134334054, + "ewc_loss": 0.00835855770856142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358557533938438e-05, + "grad_norm": 4.1738128662109375, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8821964859962463, + "num_tokens": 725158048.0, + "step": 19004 + }, + { + "epoch": 2.4176313446126447, + "ewc_loss": 0.008397357538342476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.397357305511832e-05, + "grad_norm": 4.155117988586426, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8912100791931152, + "num_tokens": 725196343.0, + "step": 19005 + }, + { + "epoch": 2.4177585548912353, + "ewc_loss": 0.00836002454161644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360024366993457e-05, + "grad_norm": 4.175149917602539, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8817479610443115, + "num_tokens": 725234613.0, + "step": 19006 + }, + { + "epoch": 2.417885765169826, + "ewc_loss": 0.00838002935051918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.380028884857893e-05, + "grad_norm": 4.171526908874512, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8815655708312988, + "num_tokens": 725272129.0, + "step": 19007 + }, + { + "epoch": 
2.4180129754484163, + "ewc_loss": 0.008360279724001884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360279753105715e-05, + "grad_norm": 4.264007568359375, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8727482557296753, + "num_tokens": 725305740.0, + "step": 19008 + }, + { + "epoch": 2.418140185727007, + "ewc_loss": 0.008417940698564053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.417940989602357e-05, + "grad_norm": 4.19185209274292, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8795820474624634, + "num_tokens": 725340579.0, + "step": 19009 + }, + { + "epoch": 2.4182673960055974, + "ewc_loss": 0.008365876041352749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.365876419702545e-05, + "grad_norm": 4.150033473968506, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8862485885620117, + "num_tokens": 725380492.0, + "step": 19010 + }, + { + "epoch": 2.418394606284188, + "ewc_loss": 0.008360321633517742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.360321953659877e-05, + "grad_norm": 4.179769515991211, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.883858323097229, + "num_tokens": 725418043.0, + "step": 19011 + }, + { + "epoch": 2.4185218165627784, + "ewc_loss": 0.008394842967391014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39484273456037e-05, + "grad_norm": 4.233780860900879, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8755930662155151, + "num_tokens": 725456260.0, + "step": 19012 + }, + { + "epoch": 2.418649026841369, + "ewc_loss": 0.008406541310250759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.406541746808216e-05, + "grad_norm": 4.18454122543335, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.878668487071991, + "num_tokens": 725496605.0, + "step": 19013 + }, + { + "epoch": 2.4187762371199595, + "ewc_loss": 0.008358754217624664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.35875398479402e-05, + "grad_norm": 4.206716537475586, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8843578696250916, + "num_tokens": 725532395.0, + "step": 19014 + }, + { + "epoch": 2.4189034473985496, + "ewc_loss": 0.008398489095270634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.398489444516599e-05, + "grad_norm": 4.168297290802002, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8772737979888916, + "num_tokens": 725575335.0, + "step": 19015 + }, + { + "epoch": 2.4190306576771405, + "ewc_loss": 0.008368119597434998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.368119597434998e-05, + "grad_norm": 4.2069268226623535, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8888214230537415, + "num_tokens": 725612345.0, + "step": 19016 + }, + { + "epoch": 2.4191578679557306, + "ewc_loss": 0.00839967280626297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399672515224665e-05, + "grad_norm": 4.237969875335693, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8853725790977478, + "num_tokens": 725645566.0, + "step": 19017 + }, + { + "epoch": 2.419285078234321, + "ewc_loss": 0.008420933037996292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.42093286337331e-05, + "grad_norm": 4.190043926239014, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8776152729988098, + "num_tokens": 725684218.0, + "step": 19018 + }, + { + "epoch": 2.4194122885129117, + "ewc_loss": 0.008364646695554256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.364646782865748e-05, + "grad_norm": 4.195387840270996, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8857591152191162, + "num_tokens": 725718478.0, + "step": 19019 + }, + { + "epoch": 2.419539498791502, + "ewc_loss": 0.008415144868195057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415144839091226e-05, + "grad_norm": 4.231459617614746, + "learning_rate": 1e-06, + "loss": 0.3666, + 
"mean_token_accuracy": 0.8766971826553345, + "num_tokens": 725756208.0, + "step": 19020 + }, + { + "epoch": 2.4196667090700927, + "ewc_loss": 0.008443346247076988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443346450803801e-05, + "grad_norm": 4.212302207946777, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8781546354293823, + "num_tokens": 725791884.0, + "step": 19021 + }, + { + "epoch": 2.4197939193486833, + "ewc_loss": 0.008408158086240292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.408157736994326e-05, + "grad_norm": 4.199541091918945, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8857824206352234, + "num_tokens": 725829138.0, + "step": 19022 + }, + { + "epoch": 2.419921129627274, + "ewc_loss": 0.00841087382286787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410873851971701e-05, + "grad_norm": 4.177213191986084, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.883326530456543, + "num_tokens": 725868301.0, + "step": 19023 + }, + { + "epoch": 2.4200483399058643, + "ewc_loss": 0.008410949259996414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410948794335127e-05, + "grad_norm": 4.232262134552002, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8731262683868408, + "num_tokens": 725906186.0, + "step": 19024 + }, + { + "epoch": 2.420175550184455, + "ewc_loss": 0.008449388667941093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44938840600662e-05, + "grad_norm": 4.1980881690979, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8713656067848206, + "num_tokens": 725947006.0, + "step": 19025 + }, + { + "epoch": 2.4203027604630454, + "ewc_loss": 0.008410927839577198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410927694058046e-05, + "grad_norm": 4.156618118286133, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8862190246582031, + "num_tokens": 725987363.0, + "step": 19026 + }, + { + "epoch": 
2.420429970741636, + "ewc_loss": 0.00841456837952137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.414568583248183e-05, + "grad_norm": 4.177946090698242, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.873992919921875, + "num_tokens": 726026176.0, + "step": 19027 + }, + { + "epoch": 2.4205571810202264, + "ewc_loss": 0.00845076609402895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450765744782984e-05, + "grad_norm": 4.192135334014893, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.880246639251709, + "num_tokens": 726068124.0, + "step": 19028 + }, + { + "epoch": 2.420684391298817, + "ewc_loss": 0.008444280363619328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.444279956165701e-05, + "grad_norm": 4.197382926940918, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8737292289733887, + "num_tokens": 726110270.0, + "step": 19029 + }, + { + "epoch": 2.4208116015774075, + "ewc_loss": 0.008442315272986889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442315447609872e-05, + "grad_norm": 4.245103359222412, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8976587057113647, + "num_tokens": 726143908.0, + "step": 19030 + }, + { + "epoch": 2.420938811855998, + "ewc_loss": 0.008446847088634968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446846914011985e-05, + "grad_norm": 4.193324089050293, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.880728006362915, + "num_tokens": 726179082.0, + "step": 19031 + }, + { + "epoch": 2.4210660221345885, + "ewc_loss": 0.008414647541940212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.414647891186178e-05, + "grad_norm": 4.212803840637207, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8779473900794983, + "num_tokens": 726216387.0, + "step": 19032 + }, + { + "epoch": 2.421193232413179, + "ewc_loss": 0.008457507938146591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.457508374704048e-05, 
+ "grad_norm": 4.1473565101623535, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8903124332427979, + "num_tokens": 726258831.0, + "step": 19033 + }, + { + "epoch": 2.4213204426917696, + "ewc_loss": 0.008404809050261974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.404809341300279e-05, + "grad_norm": 4.167033672332764, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8837518692016602, + "num_tokens": 726301929.0, + "step": 19034 + }, + { + "epoch": 2.42144765297036, + "ewc_loss": 0.008441288024187088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.441288082394749e-05, + "grad_norm": 4.198843955993652, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8855884671211243, + "num_tokens": 726339938.0, + "step": 19035 + }, + { + "epoch": 2.4215748632489507, + "ewc_loss": 0.008448564447462559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448564040008932e-05, + "grad_norm": 4.215417385101318, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8827666640281677, + "num_tokens": 726377197.0, + "step": 19036 + }, + { + "epoch": 2.421702073527541, + "ewc_loss": 0.008420067839324474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.420067752012983e-05, + "grad_norm": 4.176300525665283, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.886620283126831, + "num_tokens": 726417591.0, + "step": 19037 + }, + { + "epoch": 2.4218292838061317, + "ewc_loss": 0.008393646217882633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.393646567128599e-05, + "grad_norm": 4.2265777587890625, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8723082542419434, + "num_tokens": 726456094.0, + "step": 19038 + }, + { + "epoch": 2.4219564940847222, + "ewc_loss": 0.008435080759227276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.435080962954089e-05, + "grad_norm": 4.172002792358398, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 
0.8917375206947327, + "num_tokens": 726491158.0, + "step": 19039 + }, + { + "epoch": 2.4220837043633123, + "ewc_loss": 0.00834923516958952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.349234849447384e-05, + "grad_norm": 4.188932418823242, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8700209856033325, + "num_tokens": 726531334.0, + "step": 19040 + }, + { + "epoch": 2.4222109146419033, + "ewc_loss": 0.008394184522330761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.394184260396287e-05, + "grad_norm": 4.1705427169799805, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8735229969024658, + "num_tokens": 726571999.0, + "step": 19041 + }, + { + "epoch": 2.4223381249204934, + "ewc_loss": 0.00837718602269888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377186168218032e-05, + "grad_norm": 4.301119804382324, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8736987709999084, + "num_tokens": 726604945.0, + "step": 19042 + }, + { + "epoch": 2.422465335199084, + "ewc_loss": 0.008444257080554962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.444257400697097e-05, + "grad_norm": 4.186820030212402, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8730011582374573, + "num_tokens": 726645023.0, + "step": 19043 + }, + { + "epoch": 2.4225925454776744, + "ewc_loss": 0.008339060470461845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.339060877915472e-05, + "grad_norm": 4.182913303375244, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8812379240989685, + "num_tokens": 726684297.0, + "step": 19044 + }, + { + "epoch": 2.422719755756265, + "ewc_loss": 0.008388732559978962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388732385355979e-05, + "grad_norm": 4.159173965454102, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8715031147003174, + "num_tokens": 726726950.0, + "step": 19045 + }, + { + "epoch": 2.4228469660348555, + "ewc_loss": 
0.008384497836232185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.384497778024524e-05, + "grad_norm": 4.263393878936768, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8632148504257202, + "num_tokens": 726761944.0, + "step": 19046 + }, + { + "epoch": 2.422974176313446, + "ewc_loss": 0.008450718596577644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450718451058492e-05, + "grad_norm": 4.163569927215576, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.893988847732544, + "num_tokens": 726799041.0, + "step": 19047 + }, + { + "epoch": 2.4231013865920366, + "ewc_loss": 0.008396429009735584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396428893320262e-05, + "grad_norm": 4.174867630004883, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8772776126861572, + "num_tokens": 726838160.0, + "step": 19048 + }, + { + "epoch": 2.423228596870627, + "ewc_loss": 0.00842007715255022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.420077210757881e-05, + "grad_norm": 4.160003185272217, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8920135498046875, + "num_tokens": 726873150.0, + "step": 19049 + }, + { + "epoch": 2.4233558071492176, + "ewc_loss": 0.00842195376753807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421953680226579e-05, + "grad_norm": 4.2293195724487305, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8852900862693787, + "num_tokens": 726905253.0, + "step": 19050 + }, + { + "epoch": 2.423483017427808, + "ewc_loss": 0.008480705320835114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480705582769588e-05, + "grad_norm": 4.191643714904785, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8910490870475769, + "num_tokens": 726942842.0, + "step": 19051 + }, + { + "epoch": 2.4236102277063987, + "ewc_loss": 0.008415564894676208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415564661845565e-05, + "grad_norm": 
4.084918975830078, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8852487802505493, + "num_tokens": 726987974.0, + "step": 19052 + }, + { + "epoch": 2.423737437984989, + "ewc_loss": 0.008385440334677696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.385440014535561e-05, + "grad_norm": 4.170917510986328, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.9008209109306335, + "num_tokens": 727021614.0, + "step": 19053 + }, + { + "epoch": 2.4238646482635797, + "ewc_loss": 0.00846176128834486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.461761171929538e-05, + "grad_norm": 4.199520587921143, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8817813396453857, + "num_tokens": 727061773.0, + "step": 19054 + }, + { + "epoch": 2.4239918585421703, + "ewc_loss": 0.008436673320829868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.436672942480072e-05, + "grad_norm": 4.22825813293457, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8773218989372253, + "num_tokens": 727095877.0, + "step": 19055 + }, + { + "epoch": 2.424119068820761, + "ewc_loss": 0.008445466868579388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.445466664852574e-05, + "grad_norm": 4.218245029449463, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8874797821044922, + "num_tokens": 727128676.0, + "step": 19056 + }, + { + "epoch": 2.4242462790993513, + "ewc_loss": 0.0084421681240201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442168473266065e-05, + "grad_norm": 4.1751484870910645, + "learning_rate": 1e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8909866809844971, + "num_tokens": 727164528.0, + "step": 19057 + }, + { + "epoch": 2.424373489377942, + "ewc_loss": 0.008409667760133743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40966822579503e-05, + "grad_norm": 4.188303470611572, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8870664238929749, + "num_tokens": 
727202403.0, + "step": 19058 + }, + { + "epoch": 2.4245006996565324, + "ewc_loss": 0.008448204956948757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448204607702792e-05, + "grad_norm": 4.164374828338623, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8845356702804565, + "num_tokens": 727242231.0, + "step": 19059 + }, + { + "epoch": 2.424627909935123, + "ewc_loss": 0.00840831734240055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.408317808061838e-05, + "grad_norm": 4.231293201446533, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8702608346939087, + "num_tokens": 727276371.0, + "step": 19060 + }, + { + "epoch": 2.4247551202137134, + "ewc_loss": 0.008453859016299248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.453859481960535e-05, + "grad_norm": 4.179887771606445, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.884790301322937, + "num_tokens": 727320101.0, + "step": 19061 + }, + { + "epoch": 2.424882330492304, + "ewc_loss": 0.008405775763094425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.405775588471442e-05, + "grad_norm": 4.148876667022705, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8753561973571777, + "num_tokens": 727363687.0, + "step": 19062 + }, + { + "epoch": 2.4250095407708945, + "ewc_loss": 0.008399476297199726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.399476064369082e-05, + "grad_norm": 4.224780559539795, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8886741399765015, + "num_tokens": 727399772.0, + "step": 19063 + }, + { + "epoch": 2.425136751049485, + "ewc_loss": 0.00844925083220005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.449250890407711e-05, + "grad_norm": 4.144003868103027, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8897932767868042, + "num_tokens": 727442944.0, + "step": 19064 + }, + { + "epoch": 2.425263961328075, + "ewc_loss": 0.008367449045181274, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 8.36744875414297e-05, + "grad_norm": 4.182503700256348, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8896580934524536, + "num_tokens": 727483283.0, + "step": 19065 + }, + { + "epoch": 2.425391171606666, + "ewc_loss": 0.008412787690758705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.412788156419992e-05, + "grad_norm": 4.1657395362854, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8800923824310303, + "num_tokens": 727526316.0, + "step": 19066 + }, + { + "epoch": 2.425518381885256, + "ewc_loss": 0.008389311842620373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38931227917783e-05, + "grad_norm": 4.194065093994141, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.886972188949585, + "num_tokens": 727563218.0, + "step": 19067 + }, + { + "epoch": 2.4256455921638467, + "ewc_loss": 0.008411653339862823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41165310703218e-05, + "grad_norm": 4.214364051818848, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8954816460609436, + "num_tokens": 727594877.0, + "step": 19068 + }, + { + "epoch": 2.425772802442437, + "ewc_loss": 0.00842039380222559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.420393714914098e-05, + "grad_norm": 4.2312211990356445, + "learning_rate": 1e-06, + "loss": 0.297, + "mean_token_accuracy": 0.895302414894104, + "num_tokens": 727627561.0, + "step": 19069 + }, + { + "epoch": 2.4259000127210277, + "ewc_loss": 0.008389591239392757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389590948354453e-05, + "grad_norm": 4.175143718719482, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8720391392707825, + "num_tokens": 727664384.0, + "step": 19070 + }, + { + "epoch": 2.4260272229996183, + "ewc_loss": 0.008378221653401852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37822153698653e-05, + "grad_norm": 4.215896129608154, + "learning_rate": 1e-06, + "loss": 
0.3484, + "mean_token_accuracy": 0.8798333406448364, + "num_tokens": 727700866.0, + "step": 19071 + }, + { + "epoch": 2.426154433278209, + "ewc_loss": 0.008424444124102592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.424444240517914e-05, + "grad_norm": 4.1749396324157715, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8972844481468201, + "num_tokens": 727741539.0, + "step": 19072 + }, + { + "epoch": 2.4262816435567993, + "ewc_loss": 0.008360734209418297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36073377286084e-05, + "grad_norm": 4.205894470214844, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8822368383407593, + "num_tokens": 727777828.0, + "step": 19073 + }, + { + "epoch": 2.42640885383539, + "ewc_loss": 0.008417557924985886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41755754663609e-05, + "grad_norm": 4.144088268280029, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8908112049102783, + "num_tokens": 727818979.0, + "step": 19074 + }, + { + "epoch": 2.4265360641139804, + "ewc_loss": 0.008367662318050861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.367662667296827e-05, + "grad_norm": 4.234339237213135, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8854329586029053, + "num_tokens": 727856440.0, + "step": 19075 + }, + { + "epoch": 2.426663274392571, + "ewc_loss": 0.008426145650446415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.42614535940811e-05, + "grad_norm": 4.160115718841553, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8911005258560181, + "num_tokens": 727895036.0, + "step": 19076 + }, + { + "epoch": 2.4267904846711614, + "ewc_loss": 0.008356671780347824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356671605724841e-05, + "grad_norm": 4.217659950256348, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8743249177932739, + "num_tokens": 727932292.0, + "step": 19077 + }, + { + "epoch": 
2.426917694949752, + "ewc_loss": 0.008417295292019844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.417295612161979e-05, + "grad_norm": 4.239612579345703, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8827469348907471, + "num_tokens": 727967005.0, + "step": 19078 + }, + { + "epoch": 2.4270449052283425, + "ewc_loss": 0.008398768492043018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.398768841288984e-05, + "grad_norm": 4.195843696594238, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8742637634277344, + "num_tokens": 728004262.0, + "step": 19079 + }, + { + "epoch": 2.427172115506933, + "ewc_loss": 0.008373907767236233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37390762171708e-05, + "grad_norm": 4.164751052856445, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8915039300918579, + "num_tokens": 728041451.0, + "step": 19080 + }, + { + "epoch": 2.4272993257855235, + "ewc_loss": 0.008383553475141525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.383553358726203e-05, + "grad_norm": 4.195742130279541, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8811606168746948, + "num_tokens": 728079217.0, + "step": 19081 + }, + { + "epoch": 2.427426536064114, + "ewc_loss": 0.008405051194131374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40505090309307e-05, + "grad_norm": 4.158830642700195, + "learning_rate": 1e-06, + "loss": 0.2816, + "mean_token_accuracy": 0.8972941637039185, + "num_tokens": 728116221.0, + "step": 19082 + }, + { + "epoch": 2.4275537463427046, + "ewc_loss": 0.008380933664739132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.380934013985097e-05, + "grad_norm": 4.193201541900635, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8758317232131958, + "num_tokens": 728154861.0, + "step": 19083 + }, + { + "epoch": 2.427680956621295, + "ewc_loss": 0.008406450040638447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.406450069742277e-05, + "grad_norm": 4.218189239501953, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8725883364677429, + "num_tokens": 728188276.0, + "step": 19084 + }, + { + "epoch": 2.4278081668998857, + "ewc_loss": 0.008418855257332325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418854849878699e-05, + "grad_norm": 4.177755832672119, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8933616876602173, + "num_tokens": 728229313.0, + "step": 19085 + }, + { + "epoch": 2.427935377178476, + "ewc_loss": 0.00837782397866249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377824269700795e-05, + "grad_norm": 4.1861348152160645, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8710564970970154, + "num_tokens": 728268706.0, + "step": 19086 + }, + { + "epoch": 2.4280625874570667, + "ewc_loss": 0.008416433818638325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41643413878046e-05, + "grad_norm": 4.241998672485352, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8965569734573364, + "num_tokens": 728297490.0, + "step": 19087 + }, + { + "epoch": 2.428189797735657, + "ewc_loss": 0.008435320109128952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.435320341959596e-05, + "grad_norm": 4.170282363891602, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8742176294326782, + "num_tokens": 728336041.0, + "step": 19088 + }, + { + "epoch": 2.4283170080142478, + "ewc_loss": 0.008394160307943821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39416024973616e-05, + "grad_norm": 4.191888332366943, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8907518982887268, + "num_tokens": 728372701.0, + "step": 19089 + }, + { + "epoch": 2.428444218292838, + "ewc_loss": 0.008424930274486542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.424930274486542e-05, + "grad_norm": 4.229252815246582, + "learning_rate": 1e-06, + "loss": 0.3229, + 
"mean_token_accuracy": 0.8852332234382629, + "num_tokens": 728408138.0, + "step": 19090 + }, + { + "epoch": 2.4285714285714284, + "ewc_loss": 0.008430506102740765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43050584080629e-05, + "grad_norm": 4.124880313873291, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8902246356010437, + "num_tokens": 728452649.0, + "step": 19091 + }, + { + "epoch": 2.428698638850019, + "ewc_loss": 0.008358731865882874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.358732156921178e-05, + "grad_norm": 4.204689979553223, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8724185824394226, + "num_tokens": 728490551.0, + "step": 19092 + }, + { + "epoch": 2.4288258491286094, + "ewc_loss": 0.008443593047559261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443593105766922e-05, + "grad_norm": 4.186816215515137, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8902872800827026, + "num_tokens": 728529558.0, + "step": 19093 + }, + { + "epoch": 2.4289530594072, + "ewc_loss": 0.008411376737058163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.411376620642841e-05, + "grad_norm": 4.183368682861328, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8620094060897827, + "num_tokens": 728573495.0, + "step": 19094 + }, + { + "epoch": 2.4290802696857905, + "ewc_loss": 0.008387097157537937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387096750084311e-05, + "grad_norm": 4.233349323272705, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8842116594314575, + "num_tokens": 728609166.0, + "step": 19095 + }, + { + "epoch": 2.429207479964381, + "ewc_loss": 0.008428477682173252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428478031419218e-05, + "grad_norm": 4.144160747528076, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8875787258148193, + "num_tokens": 728653070.0, + "step": 19096 + }, + { + "epoch": 
2.4293346902429716, + "ewc_loss": 0.008371985517442226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.371985313715413e-05, + "grad_norm": 4.225642681121826, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8849369287490845, + "num_tokens": 728689142.0, + "step": 19097 + }, + { + "epoch": 2.429461900521562, + "ewc_loss": 0.008437488228082657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437488577328622e-05, + "grad_norm": 4.191883563995361, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8875267505645752, + "num_tokens": 728725615.0, + "step": 19098 + }, + { + "epoch": 2.4295891108001526, + "ewc_loss": 0.008397454395890236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.397454803343862e-05, + "grad_norm": 4.239897727966309, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8749046325683594, + "num_tokens": 728762366.0, + "step": 19099 + }, + { + "epoch": 2.429716321078743, + "ewc_loss": 0.008429468609392643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.429468289250508e-05, + "grad_norm": 4.177072525024414, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8713215589523315, + "num_tokens": 728801951.0, + "step": 19100 + }, + { + "epoch": 2.4298435313573337, + "ewc_loss": 0.008398199453949928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.398199861403555e-05, + "grad_norm": 4.227426528930664, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.890810489654541, + "num_tokens": 728836403.0, + "step": 19101 + }, + { + "epoch": 2.429970741635924, + "ewc_loss": 0.008421256206929684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421255915891379e-05, + "grad_norm": 4.143404006958008, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8889908790588379, + "num_tokens": 728878485.0, + "step": 19102 + }, + { + "epoch": 2.4300979519145147, + "ewc_loss": 0.008371762000024319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.371761941816658e-05, + "grad_norm": 4.1690287590026855, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8908274173736572, + "num_tokens": 728915935.0, + "step": 19103 + }, + { + "epoch": 2.4302251621931052, + "ewc_loss": 0.008422466926276684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.422466635238379e-05, + "grad_norm": 4.239516258239746, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8681739568710327, + "num_tokens": 728956847.0, + "step": 19104 + }, + { + "epoch": 2.4303523724716958, + "ewc_loss": 0.008459936827421188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.459937089355662e-05, + "grad_norm": 4.15915060043335, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8879894018173218, + "num_tokens": 728994865.0, + "step": 19105 + }, + { + "epoch": 2.4304795827502863, + "ewc_loss": 0.008383617736399174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.383617387153208e-05, + "grad_norm": 4.2270588874816895, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8777604103088379, + "num_tokens": 729033030.0, + "step": 19106 + }, + { + "epoch": 2.430606793028877, + "ewc_loss": 0.008459273725748062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.459273522021249e-05, + "grad_norm": 4.18613338470459, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8936123847961426, + "num_tokens": 729070430.0, + "step": 19107 + }, + { + "epoch": 2.4307340033074674, + "ewc_loss": 0.008401048369705677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401048398809507e-05, + "grad_norm": 4.218231201171875, + "learning_rate": 1e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.893161416053772, + "num_tokens": 729102127.0, + "step": 19108 + }, + { + "epoch": 2.430861213586058, + "ewc_loss": 0.008428924717009068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428924775216728e-05, + "grad_norm": 4.214831829071045, + "learning_rate": 1e-06, + "loss": 0.3272, + 
"mean_token_accuracy": 0.8852300047874451, + "num_tokens": 729138128.0, + "step": 19109 + }, + { + "epoch": 2.4309884238646484, + "ewc_loss": 0.008439667522907257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43966772663407e-05, + "grad_norm": 4.149878978729248, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8803564310073853, + "num_tokens": 729179756.0, + "step": 19110 + }, + { + "epoch": 2.431115634143239, + "ewc_loss": 0.008393580093979836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39358035591431e-05, + "grad_norm": 4.164192199707031, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8719438910484314, + "num_tokens": 729221356.0, + "step": 19111 + }, + { + "epoch": 2.4312428444218295, + "ewc_loss": 0.008438588120043278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438587974524125e-05, + "grad_norm": 4.182563304901123, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8878669738769531, + "num_tokens": 729260456.0, + "step": 19112 + }, + { + "epoch": 2.4313700547004196, + "ewc_loss": 0.008425842970609665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.425843407167122e-05, + "grad_norm": 4.163098335266113, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.872222900390625, + "num_tokens": 729299654.0, + "step": 19113 + }, + { + "epoch": 2.4314972649790105, + "ewc_loss": 0.008409244008362293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409244037466124e-05, + "grad_norm": 4.158606052398682, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8850333094596863, + "num_tokens": 729341296.0, + "step": 19114 + }, + { + "epoch": 2.4316244752576006, + "ewc_loss": 0.008428923785686493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428924047620967e-05, + "grad_norm": 4.205592155456543, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8874295949935913, + "num_tokens": 729379632.0, + "step": 19115 + }, + { + "epoch": 
2.431751685536191, + "ewc_loss": 0.008449625223875046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.449625602224842e-05, + "grad_norm": 4.179963111877441, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8851001858711243, + "num_tokens": 729423202.0, + "step": 19116 + }, + { + "epoch": 2.4318788958147817, + "ewc_loss": 0.008424182422459126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.424182306043804e-05, + "grad_norm": 4.166554927825928, + "learning_rate": 1e-06, + "loss": 0.2793, + "mean_token_accuracy": 0.9013409614562988, + "num_tokens": 729457108.0, + "step": 19117 + }, + { + "epoch": 2.432006106093372, + "ewc_loss": 0.00842246599495411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.422465907642618e-05, + "grad_norm": 4.242823600769043, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.878734290599823, + "num_tokens": 729494065.0, + "step": 19118 + }, + { + "epoch": 2.4321333163719627, + "ewc_loss": 0.008460400626063347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460400567855686e-05, + "grad_norm": 4.156783103942871, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8877248764038086, + "num_tokens": 729536621.0, + "step": 19119 + }, + { + "epoch": 2.4322605266505533, + "ewc_loss": 0.008398093283176422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.398093632422388e-05, + "grad_norm": 4.192718029022217, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8833973407745361, + "num_tokens": 729574423.0, + "step": 19120 + }, + { + "epoch": 2.432387736929144, + "ewc_loss": 0.008452732115983963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.452731708530337e-05, + "grad_norm": 4.172975540161133, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8785302639007568, + "num_tokens": 729614673.0, + "step": 19121 + }, + { + "epoch": 2.4325149472077343, + "ewc_loss": 0.008403528481721878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.403528772760183e-05, + "grad_norm": 4.164280891418457, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8855865001678467, + "num_tokens": 729653049.0, + "step": 19122 + }, + { + "epoch": 2.432642157486325, + "ewc_loss": 0.00842932891100645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.429328590864316e-05, + "grad_norm": 4.208998680114746, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.883688747882843, + "num_tokens": 729689667.0, + "step": 19123 + }, + { + "epoch": 2.4327693677649154, + "ewc_loss": 0.008467755280435085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467755105812103e-05, + "grad_norm": 4.124518394470215, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8863891363143921, + "num_tokens": 729734747.0, + "step": 19124 + }, + { + "epoch": 2.432896578043506, + "ewc_loss": 0.008387960493564606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387960406253114e-05, + "grad_norm": 4.214419841766357, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8783726692199707, + "num_tokens": 729770135.0, + "step": 19125 + }, + { + "epoch": 2.4330237883220964, + "ewc_loss": 0.008476757444441319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476757648168132e-05, + "grad_norm": 4.1532158851623535, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8803955912590027, + "num_tokens": 729809203.0, + "step": 19126 + }, + { + "epoch": 2.433150998600687, + "ewc_loss": 0.008415589109063148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415589400101453e-05, + "grad_norm": 4.171867847442627, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8896215558052063, + "num_tokens": 729850164.0, + "step": 19127 + }, + { + "epoch": 2.4332782088792775, + "ewc_loss": 0.00843016803264618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.430168236372992e-05, + "grad_norm": 4.237448215484619, + "learning_rate": 1e-06, + "loss": 0.3619, + 
"mean_token_accuracy": 0.8739030957221985, + "num_tokens": 729884337.0, + "step": 19128 + }, + { + "epoch": 2.433405419157868, + "ewc_loss": 0.00847632810473442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476327639073133e-05, + "grad_norm": 4.182709693908691, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.8959050178527832, + "num_tokens": 729924098.0, + "step": 19129 + }, + { + "epoch": 2.4335326294364585, + "ewc_loss": 0.008412998169660568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.412997703999281e-05, + "grad_norm": 4.1964850425720215, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.901189923286438, + "num_tokens": 729960671.0, + "step": 19130 + }, + { + "epoch": 2.433659839715049, + "ewc_loss": 0.008437518030405045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437517681159079e-05, + "grad_norm": 4.197511196136475, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8840714693069458, + "num_tokens": 730000780.0, + "step": 19131 + }, + { + "epoch": 2.4337870499936396, + "ewc_loss": 0.008429832756519318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.429832814726979e-05, + "grad_norm": 4.166987419128418, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8796638250350952, + "num_tokens": 730045198.0, + "step": 19132 + }, + { + "epoch": 2.43391426027223, + "ewc_loss": 0.008416040800511837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.416040509473532e-05, + "grad_norm": 4.18270206451416, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8763332962989807, + "num_tokens": 730085010.0, + "step": 19133 + }, + { + "epoch": 2.4340414705508207, + "ewc_loss": 0.008418719284236431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418719517067075e-05, + "grad_norm": 4.192261695861816, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8775696754455566, + "num_tokens": 730121512.0, + "step": 19134 + }, + { + "epoch": 
2.434168680829411, + "ewc_loss": 0.008440488018095493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.440487727057189e-05, + "grad_norm": 4.174227237701416, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8812510967254639, + "num_tokens": 730164859.0, + "step": 19135 + }, + { + "epoch": 2.4342958911080017, + "ewc_loss": 0.008396203629672527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396204066229984e-05, + "grad_norm": 4.18950080871582, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8829774856567383, + "num_tokens": 730202273.0, + "step": 19136 + }, + { + "epoch": 2.4344231013865922, + "ewc_loss": 0.008396323770284653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396324119530618e-05, + "grad_norm": 4.153489112854004, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8770009279251099, + "num_tokens": 730248208.0, + "step": 19137 + }, + { + "epoch": 2.4345503116651823, + "ewc_loss": 0.008390931412577629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390931179746985e-05, + "grad_norm": 4.154209136962891, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8927263021469116, + "num_tokens": 730291084.0, + "step": 19138 + }, + { + "epoch": 2.4346775219437733, + "ewc_loss": 0.00837340485304594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.37340485304594e-05, + "grad_norm": 4.1583404541015625, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8868407011032104, + "num_tokens": 730334504.0, + "step": 19139 + }, + { + "epoch": 2.4348047322223634, + "ewc_loss": 0.0083612697198987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.361270010937005e-05, + "grad_norm": 4.160736083984375, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8864188194274902, + "num_tokens": 730373356.0, + "step": 19140 + }, + { + "epoch": 2.434931942500954, + "ewc_loss": 0.00836262945085764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362629159819335e-05, 
+ "grad_norm": 4.227960109710693, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8723705410957336, + "num_tokens": 730416229.0, + "step": 19141 + }, + { + "epoch": 2.4350591527795444, + "ewc_loss": 0.008369139395654202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369138959096745e-05, + "grad_norm": 4.14912748336792, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8933622241020203, + "num_tokens": 730458922.0, + "step": 19142 + }, + { + "epoch": 2.435186363058135, + "ewc_loss": 0.008296024054288864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.296023588627577e-05, + "grad_norm": 4.262997150421143, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8819090127944946, + "num_tokens": 730494583.0, + "step": 19143 + }, + { + "epoch": 2.4353135733367255, + "ewc_loss": 0.008385254070162773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.385253750020638e-05, + "grad_norm": 4.2186784744262695, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8864974975585938, + "num_tokens": 730529526.0, + "step": 19144 + }, + { + "epoch": 2.435440783615316, + "ewc_loss": 0.008320547640323639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.320547203766182e-05, + "grad_norm": 4.278993606567383, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8871574401855469, + "num_tokens": 730558584.0, + "step": 19145 + }, + { + "epoch": 2.4355679938939065, + "ewc_loss": 0.008390676230192184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390676521230489e-05, + "grad_norm": 4.177468299865723, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8821293115615845, + "num_tokens": 730598677.0, + "step": 19146 + }, + { + "epoch": 2.435695204172497, + "ewc_loss": 0.008302661590278149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.302661444758996e-05, + "grad_norm": 4.198345184326172, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 
0.8929975628852844, + "num_tokens": 730636985.0, + "step": 19147 + }, + { + "epoch": 2.4358224144510876, + "ewc_loss": 0.008363253436982632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.363253436982632e-05, + "grad_norm": 4.228026866912842, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8669071197509766, + "num_tokens": 730675409.0, + "step": 19148 + }, + { + "epoch": 2.435949624729678, + "ewc_loss": 0.00838063471019268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.380634244531393e-05, + "grad_norm": 4.224830150604248, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8721604347229004, + "num_tokens": 730714739.0, + "step": 19149 + }, + { + "epoch": 2.4360768350082687, + "ewc_loss": 0.008352314122021198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352314034709707e-05, + "grad_norm": 4.154229164123535, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8819416165351868, + "num_tokens": 730757059.0, + "step": 19150 + }, + { + "epoch": 2.436204045286859, + "ewc_loss": 0.008337209932506084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.337209874298424e-05, + "grad_norm": 4.22305965423584, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8828648924827576, + "num_tokens": 730793469.0, + "step": 19151 + }, + { + "epoch": 2.4363312555654497, + "ewc_loss": 0.008417446166276932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.417446224484593e-05, + "grad_norm": 4.1934709548950195, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8874857425689697, + "num_tokens": 730829336.0, + "step": 19152 + }, + { + "epoch": 2.4364584658440402, + "ewc_loss": 0.008361462503671646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36146209621802e-05, + "grad_norm": 4.196481704711914, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8821324110031128, + "num_tokens": 730869304.0, + "step": 19153 + }, + { + "epoch": 2.4365856761226308, + "ewc_loss": 
0.008395237848162651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39523781905882e-05, + "grad_norm": 4.2173333168029785, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8701730370521545, + "num_tokens": 730905315.0, + "step": 19154 + }, + { + "epoch": 2.4367128864012213, + "ewc_loss": 0.008420953527092934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.420953236054629e-05, + "grad_norm": 4.183304786682129, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8740091919898987, + "num_tokens": 730947346.0, + "step": 19155 + }, + { + "epoch": 2.436840096679812, + "ewc_loss": 0.008369235321879387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369235729333013e-05, + "grad_norm": 4.227944850921631, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8892598152160645, + "num_tokens": 730976907.0, + "step": 19156 + }, + { + "epoch": 2.4369673069584024, + "ewc_loss": 0.008426818996667862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.426819113083184e-05, + "grad_norm": 4.159247398376465, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8799496293067932, + "num_tokens": 731015940.0, + "step": 19157 + }, + { + "epoch": 2.437094517236993, + "ewc_loss": 0.008391796611249447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391796291107312e-05, + "grad_norm": 4.141696453094482, + "learning_rate": 1e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.8970385789871216, + "num_tokens": 731051345.0, + "step": 19158 + }, + { + "epoch": 2.4372217275155834, + "ewc_loss": 0.008424206636846066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.424207044299692e-05, + "grad_norm": 4.244718074798584, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8546391129493713, + "num_tokens": 731089642.0, + "step": 19159 + }, + { + "epoch": 2.437348937794174, + "ewc_loss": 0.008480455726385117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480456017423421e-05, + "grad_norm": 
4.1253767013549805, + "learning_rate": 1e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.8983771204948425, + "num_tokens": 731133196.0, + "step": 19160 + }, + { + "epoch": 2.4374761480727645, + "ewc_loss": 0.008384233340620995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38423366076313e-05, + "grad_norm": 4.208136558532715, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8713750839233398, + "num_tokens": 731175510.0, + "step": 19161 + }, + { + "epoch": 2.437603358351355, + "ewc_loss": 0.008465675637125969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.465675637125969e-05, + "grad_norm": 4.120666980743408, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8903090953826904, + "num_tokens": 731217526.0, + "step": 19162 + }, + { + "epoch": 2.437730568629945, + "ewc_loss": 0.008388550952076912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388551214011386e-05, + "grad_norm": 4.192691802978516, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8709625601768494, + "num_tokens": 731259525.0, + "step": 19163 + }, + { + "epoch": 2.437857778908536, + "ewc_loss": 0.008458387106657028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.458387310383841e-05, + "grad_norm": 4.167510509490967, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8864413499832153, + "num_tokens": 731301480.0, + "step": 19164 + }, + { + "epoch": 2.437984989187126, + "ewc_loss": 0.008406220935285091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40622087707743e-05, + "grad_norm": 4.112244129180908, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8916153311729431, + "num_tokens": 731347385.0, + "step": 19165 + }, + { + "epoch": 2.4381121994657167, + "ewc_loss": 0.008398598060011864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39859785628505e-05, + "grad_norm": 4.220222473144531, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8853120803833008, + 
"num_tokens": 731383666.0, + "step": 19166 + }, + { + "epoch": 2.438239409744307, + "ewc_loss": 0.00847085565328598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470856118947268e-05, + "grad_norm": 4.260403633117676, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8833332061767578, + "num_tokens": 731420297.0, + "step": 19167 + }, + { + "epoch": 2.4383666200228977, + "ewc_loss": 0.008426657877862453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.42665831441991e-05, + "grad_norm": 4.21671724319458, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.880719780921936, + "num_tokens": 731455083.0, + "step": 19168 + }, + { + "epoch": 2.4384938303014883, + "ewc_loss": 0.008383220061659813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.383220119867474e-05, + "grad_norm": 4.266989231109619, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8894596099853516, + "num_tokens": 731482555.0, + "step": 19169 + }, + { + "epoch": 2.438621040580079, + "ewc_loss": 0.00844837911427021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448379230685532e-05, + "grad_norm": 4.174625396728516, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8941593170166016, + "num_tokens": 731519459.0, + "step": 19170 + }, + { + "epoch": 2.4387482508586693, + "ewc_loss": 0.00838785246014595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387852722080424e-05, + "grad_norm": 4.186623573303223, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8877086043357849, + "num_tokens": 731554577.0, + "step": 19171 + }, + { + "epoch": 2.43887546113726, + "ewc_loss": 0.008434096351265907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43409652588889e-05, + "grad_norm": 4.16967248916626, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.875366747379303, + "num_tokens": 731594993.0, + "step": 19172 + }, + { + "epoch": 2.4390026714158504, + "ewc_loss": 0.008431823924183846, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43182424432598e-05, + "grad_norm": 4.265843868255615, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8863646984100342, + "num_tokens": 731629182.0, + "step": 19173 + }, + { + "epoch": 2.439129881694441, + "ewc_loss": 0.008488023653626442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.488023740937933e-05, + "grad_norm": 4.16094446182251, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8824092149734497, + "num_tokens": 731667661.0, + "step": 19174 + }, + { + "epoch": 2.4392570919730314, + "ewc_loss": 0.008402865380048752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.402865205425769e-05, + "grad_norm": 4.247968673706055, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8828946948051453, + "num_tokens": 731704911.0, + "step": 19175 + }, + { + "epoch": 2.439384302251622, + "ewc_loss": 0.008487596176564693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48759591463022e-05, + "grad_norm": 4.2400312423706055, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8692956566810608, + "num_tokens": 731744990.0, + "step": 19176 + }, + { + "epoch": 2.4395115125302125, + "ewc_loss": 0.008453396148979664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.453396003460512e-05, + "grad_norm": 4.175643444061279, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8839522004127502, + "num_tokens": 731783335.0, + "step": 19177 + }, + { + "epoch": 2.439638722808803, + "ewc_loss": 0.008405580185353756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.405580592807382e-05, + "grad_norm": 4.2255706787109375, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.884841799736023, + "num_tokens": 731819206.0, + "step": 19178 + }, + { + "epoch": 2.4397659330873935, + "ewc_loss": 0.00847397930920124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473979687551036e-05, + "grad_norm": 4.224059104919434, + "learning_rate": 
1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8831164836883545, + "num_tokens": 731855291.0, + "step": 19179 + }, + { + "epoch": 2.439893143365984, + "ewc_loss": 0.008444057777523994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.444058039458469e-05, + "grad_norm": 4.128383636474609, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8871991634368896, + "num_tokens": 731899392.0, + "step": 19180 + }, + { + "epoch": 2.4400203536445746, + "ewc_loss": 0.008395585231482983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395585609832779e-05, + "grad_norm": 4.182738304138184, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8826554417610168, + "num_tokens": 731937502.0, + "step": 19181 + }, + { + "epoch": 2.440147563923165, + "ewc_loss": 0.008470107801258564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47010815050453e-05, + "grad_norm": 4.195394992828369, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8861300349235535, + "num_tokens": 731974946.0, + "step": 19182 + }, + { + "epoch": 2.4402747742017556, + "ewc_loss": 0.008457507938146591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.457508374704048e-05, + "grad_norm": 4.19230318069458, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8720515370368958, + "num_tokens": 732018732.0, + "step": 19183 + }, + { + "epoch": 2.440401984480346, + "ewc_loss": 0.00845345389097929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.453454211121425e-05, + "grad_norm": 4.214799404144287, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8830628991127014, + "num_tokens": 732052036.0, + "step": 19184 + }, + { + "epoch": 2.4405291947589367, + "ewc_loss": 0.008455973118543625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455973147647455e-05, + "grad_norm": 4.193897247314453, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8790373802185059, + "num_tokens": 732090944.0, + "step": 19185 + }, + 
{ + "epoch": 2.440656405037527, + "ewc_loss": 0.008449768647551537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44976821099408e-05, + "grad_norm": 4.172832489013672, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8857049942016602, + "num_tokens": 732131406.0, + "step": 19186 + }, + { + "epoch": 2.4407836153161178, + "ewc_loss": 0.008427770808339119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.427770808339119e-05, + "grad_norm": 4.155155658721924, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8817563652992249, + "num_tokens": 732171701.0, + "step": 19187 + }, + { + "epoch": 2.440910825594708, + "ewc_loss": 0.008433227427303791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433227776549757e-05, + "grad_norm": 4.199834823608398, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8928320407867432, + "num_tokens": 732209658.0, + "step": 19188 + }, + { + "epoch": 2.4410380358732984, + "ewc_loss": 0.008448540233075619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448540029348806e-05, + "grad_norm": 4.170492172241211, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8714267015457153, + "num_tokens": 732251395.0, + "step": 19189 + }, + { + "epoch": 2.441165246151889, + "ewc_loss": 0.008420630358159542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.42063018353656e-05, + "grad_norm": 4.229714393615723, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8727208375930786, + "num_tokens": 732289306.0, + "step": 19190 + }, + { + "epoch": 2.4412924564304794, + "ewc_loss": 0.008447340689599514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447340951533988e-05, + "grad_norm": 4.135709285736084, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8703237771987915, + "num_tokens": 732337916.0, + "step": 19191 + }, + { + "epoch": 2.44141966670907, + "ewc_loss": 0.008377880789339542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.377881022170186e-05, + "grad_norm": 4.188758373260498, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8693042397499084, + "num_tokens": 732380358.0, + "step": 19192 + }, + { + "epoch": 2.4415468769876605, + "ewc_loss": 0.008447236381471157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447236177744344e-05, + "grad_norm": 4.182470321655273, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8933829665184021, + "num_tokens": 732416534.0, + "step": 19193 + }, + { + "epoch": 2.441674087266251, + "ewc_loss": 0.008425033651292324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.425033593084663e-05, + "grad_norm": 4.169997215270996, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8631494045257568, + "num_tokens": 732458927.0, + "step": 19194 + }, + { + "epoch": 2.4418012975448415, + "ewc_loss": 0.00842283945530653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.422839164268225e-05, + "grad_norm": 4.206525802612305, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8735445141792297, + "num_tokens": 732501261.0, + "step": 19195 + }, + { + "epoch": 2.441928507823432, + "ewc_loss": 0.008434580639004707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.434580377070233e-05, + "grad_norm": 4.141859531402588, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8890125155448914, + "num_tokens": 732541162.0, + "step": 19196 + }, + { + "epoch": 2.4420557181020226, + "ewc_loss": 0.008389082737267017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389082358917221e-05, + "grad_norm": 4.238466262817383, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8809902667999268, + "num_tokens": 732578237.0, + "step": 19197 + }, + { + "epoch": 2.442182928380613, + "ewc_loss": 0.008457877673208714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.457877993350849e-05, + "grad_norm": 4.190972805023193, + "learning_rate": 1e-06, + "loss": 0.3626, + 
"mean_token_accuracy": 0.8738155961036682, + "num_tokens": 732618624.0, + "step": 19198 + }, + { + "epoch": 2.4423101386592037, + "ewc_loss": 0.00841040164232254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41040164232254e-05, + "grad_norm": 4.164747714996338, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8800095915794373, + "num_tokens": 732661747.0, + "step": 19199 + }, + { + "epoch": 2.442437348937794, + "ewc_loss": 0.00841572880744934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415728370891884e-05, + "grad_norm": 4.191347599029541, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.863637387752533, + "num_tokens": 732703511.0, + "step": 19200 + }, + { + "epoch": 2.4425645592163847, + "ewc_loss": 0.008437399752438068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437399810645729e-05, + "grad_norm": 4.2195515632629395, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8632689714431763, + "num_tokens": 732746749.0, + "step": 19201 + }, + { + "epoch": 2.4426917694949752, + "ewc_loss": 0.008447480387985706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44748064992018e-05, + "grad_norm": 4.1857709884643555, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8844208717346191, + "num_tokens": 732783184.0, + "step": 19202 + }, + { + "epoch": 2.4428189797735658, + "ewc_loss": 0.008414296433329582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.414296462433413e-05, + "grad_norm": 4.245090961456299, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8771440982818604, + "num_tokens": 732818664.0, + "step": 19203 + }, + { + "epoch": 2.4429461900521563, + "ewc_loss": 0.008471318520605564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471318142255768e-05, + "grad_norm": 4.213992118835449, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.873299241065979, + "num_tokens": 732854286.0, + "step": 19204 + }, + { + "epoch": 
2.443073400330747, + "ewc_loss": 0.008414048701524734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41404907987453e-05, + "grad_norm": 4.158158302307129, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8890790343284607, + "num_tokens": 732893032.0, + "step": 19205 + }, + { + "epoch": 2.4432006106093374, + "ewc_loss": 0.008432364091277122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432364120380953e-05, + "grad_norm": 4.213261604309082, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8836103677749634, + "num_tokens": 732929942.0, + "step": 19206 + }, + { + "epoch": 2.443327820887928, + "ewc_loss": 0.00847078487277031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470784814562649e-05, + "grad_norm": 4.1781907081604, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8782631754875183, + "num_tokens": 732968218.0, + "step": 19207 + }, + { + "epoch": 2.4434550311665184, + "ewc_loss": 0.008432981558144093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432981849182397e-05, + "grad_norm": 4.203775882720947, + "learning_rate": 1e-06, + "loss": 0.2691, + "mean_token_accuracy": 0.9042657613754272, + "num_tokens": 733001464.0, + "step": 19208 + }, + { + "epoch": 2.443582241445109, + "ewc_loss": 0.008469835855066776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469836029689759e-05, + "grad_norm": 4.289605617523193, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.874781608581543, + "num_tokens": 733036311.0, + "step": 19209 + }, + { + "epoch": 2.4437094517236995, + "ewc_loss": 0.008524445816874504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524445729563013e-05, + "grad_norm": 4.2283430099487305, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8870115280151367, + "num_tokens": 733070487.0, + "step": 19210 + }, + { + "epoch": 2.4438366620022896, + "ewc_loss": 0.00847149919718504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.471499313600361e-05, + "grad_norm": 4.150821685791016, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8802672028541565, + "num_tokens": 733110357.0, + "step": 19211 + }, + { + "epoch": 2.4439638722808805, + "ewc_loss": 0.008455980569124222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45598042360507e-05, + "grad_norm": 4.203577041625977, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8755168914794922, + "num_tokens": 733149070.0, + "step": 19212 + }, + { + "epoch": 2.4440910825594706, + "ewc_loss": 0.008529406040906906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529406477464363e-05, + "grad_norm": 4.20929479598999, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8968163728713989, + "num_tokens": 733180848.0, + "step": 19213 + }, + { + "epoch": 2.444218292838061, + "ewc_loss": 0.008491091430187225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491091284668073e-05, + "grad_norm": 4.172018051147461, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8825472593307495, + "num_tokens": 733219204.0, + "step": 19214 + }, + { + "epoch": 2.4443455031166517, + "ewc_loss": 0.008483750745654106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483750571031123e-05, + "grad_norm": 4.210810661315918, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8726763725280762, + "num_tokens": 733255237.0, + "step": 19215 + }, + { + "epoch": 2.444472713395242, + "ewc_loss": 0.008530723862349987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53072342579253e-05, + "grad_norm": 4.164897441864014, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8925926089286804, + "num_tokens": 733293243.0, + "step": 19216 + }, + { + "epoch": 2.4445999236738327, + "ewc_loss": 0.008485856465995312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485856233164668e-05, + "grad_norm": 4.200804710388184, + "learning_rate": 1e-06, + "loss": 0.3383, + 
"mean_token_accuracy": 0.8860440254211426, + "num_tokens": 733333530.0, + "step": 19217 + }, + { + "epoch": 2.4447271339524232, + "ewc_loss": 0.008523466065526009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523466385668144e-05, + "grad_norm": 4.240787982940674, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8652746081352234, + "num_tokens": 733373790.0, + "step": 19218 + }, + { + "epoch": 2.4448543442310138, + "ewc_loss": 0.008534202352166176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.534202061127871e-05, + "grad_norm": 4.162508964538574, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.886043906211853, + "num_tokens": 733416037.0, + "step": 19219 + }, + { + "epoch": 2.4449815545096043, + "ewc_loss": 0.008465235121548176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46523471409455e-05, + "grad_norm": 4.201797962188721, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8787925243377686, + "num_tokens": 733453014.0, + "step": 19220 + }, + { + "epoch": 2.445108764788195, + "ewc_loss": 0.008523036725819111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523037104168907e-05, + "grad_norm": 4.242496490478516, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8681037425994873, + "num_tokens": 733491197.0, + "step": 19221 + }, + { + "epoch": 2.4452359750667854, + "ewc_loss": 0.008512843400239944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512843487551436e-05, + "grad_norm": 4.148037433624268, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8930820226669312, + "num_tokens": 733531142.0, + "step": 19222 + }, + { + "epoch": 2.445363185345376, + "ewc_loss": 0.008454632945358753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454632916254923e-05, + "grad_norm": 4.166378498077393, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8768811821937561, + "num_tokens": 733573916.0, + "step": 19223 + }, + { + "epoch": 
2.4454903956239664, + "ewc_loss": 0.008479597046971321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479596726829186e-05, + "grad_norm": 4.22256326675415, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8770390748977661, + "num_tokens": 733613204.0, + "step": 19224 + }, + { + "epoch": 2.445617605902557, + "ewc_loss": 0.008517652750015259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517652895534411e-05, + "grad_norm": 4.205112457275391, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8780515193939209, + "num_tokens": 733653618.0, + "step": 19225 + }, + { + "epoch": 2.4457448161811475, + "ewc_loss": 0.008488323539495468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.488323510391638e-05, + "grad_norm": 4.185340881347656, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.896580696105957, + "num_tokens": 733687795.0, + "step": 19226 + }, + { + "epoch": 2.445872026459738, + "ewc_loss": 0.008464519865810871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.464520215056837e-05, + "grad_norm": 4.26891565322876, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8811314105987549, + "num_tokens": 733723574.0, + "step": 19227 + }, + { + "epoch": 2.4459992367383285, + "ewc_loss": 0.008526034653186798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52603479870595e-05, + "grad_norm": 4.181933403015137, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8876919150352478, + "num_tokens": 733762051.0, + "step": 19228 + }, + { + "epoch": 2.446126447016919, + "ewc_loss": 0.008451901376247406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451901521766558e-05, + "grad_norm": 4.1556315422058105, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8983262777328491, + "num_tokens": 733800640.0, + "step": 19229 + }, + { + "epoch": 2.4462536572955096, + "ewc_loss": 0.008449803106486797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.449803135590628e-05, + "grad_norm": 4.179060935974121, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8734908699989319, + "num_tokens": 733841321.0, + "step": 19230 + }, + { + "epoch": 2.4463808675741, + "ewc_loss": 0.008474370464682579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474370406474918e-05, + "grad_norm": 4.2445454597473145, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8851431012153625, + "num_tokens": 733874104.0, + "step": 19231 + }, + { + "epoch": 2.4465080778526906, + "ewc_loss": 0.00848299078643322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482990961056203e-05, + "grad_norm": 4.210297107696533, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8866158723831177, + "num_tokens": 733913127.0, + "step": 19232 + }, + { + "epoch": 2.446635288131281, + "ewc_loss": 0.008457607589662075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.457607327727601e-05, + "grad_norm": 4.187097549438477, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8989186882972717, + "num_tokens": 733951900.0, + "step": 19233 + }, + { + "epoch": 2.4467624984098717, + "ewc_loss": 0.008428207598626614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.42820736579597e-05, + "grad_norm": 4.138871192932129, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8833203911781311, + "num_tokens": 733992866.0, + "step": 19234 + }, + { + "epoch": 2.4468897086884622, + "ewc_loss": 0.00841385405510664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41385408421047e-05, + "grad_norm": 4.233778953552246, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8769410848617554, + "num_tokens": 734029895.0, + "step": 19235 + }, + { + "epoch": 2.4470169189670523, + "ewc_loss": 0.00847758911550045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477589290123433e-05, + "grad_norm": 4.172179698944092, + "learning_rate": 1e-06, + "loss": 0.2951, + 
"mean_token_accuracy": 0.8994876146316528, + "num_tokens": 734066954.0, + "step": 19236 + }, + { + "epoch": 2.4471441292456433, + "ewc_loss": 0.008409213274717331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409213478444144e-05, + "grad_norm": 4.208739280700684, + "learning_rate": 1e-06, + "loss": 0.2784, + "mean_token_accuracy": 0.9020050764083862, + "num_tokens": 734102412.0, + "step": 19237 + }, + { + "epoch": 2.4472713395242334, + "ewc_loss": 0.008451305329799652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451305620837957e-05, + "grad_norm": 4.161216735839844, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8985753059387207, + "num_tokens": 734141400.0, + "step": 19238 + }, + { + "epoch": 2.447398549802824, + "ewc_loss": 0.008389908820390701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.389908907702193e-05, + "grad_norm": 4.2885050773620605, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8656183481216431, + "num_tokens": 734176628.0, + "step": 19239 + }, + { + "epoch": 2.4475257600814144, + "ewc_loss": 0.008468983694911003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.468984015053138e-05, + "grad_norm": 4.158303260803223, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8888204097747803, + "num_tokens": 734219756.0, + "step": 19240 + }, + { + "epoch": 2.447652970360005, + "ewc_loss": 0.008355309255421162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.355308818863705e-05, + "grad_norm": 4.300818920135498, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8830505013465881, + "num_tokens": 734250509.0, + "step": 19241 + }, + { + "epoch": 2.4477801806385955, + "ewc_loss": 0.008466855622828007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46685579745099e-05, + "grad_norm": 4.144675254821777, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8907963633537292, + "num_tokens": 734287887.0, + "step": 19242 + }, + { + "epoch": 
2.447907390917186, + "ewc_loss": 0.00835749227553606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357492333743721e-05, + "grad_norm": 4.1775126457214355, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8861392736434937, + "num_tokens": 734330446.0, + "step": 19243 + }, + { + "epoch": 2.4480346011957765, + "ewc_loss": 0.008415178395807743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415178308496252e-05, + "grad_norm": 4.184653282165527, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8873409032821655, + "num_tokens": 734368427.0, + "step": 19244 + }, + { + "epoch": 2.448161811474367, + "ewc_loss": 0.008418511599302292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.418511424679309e-05, + "grad_norm": 4.156445503234863, + "learning_rate": 1e-06, + "loss": 0.2805, + "mean_token_accuracy": 0.9010659456253052, + "num_tokens": 734405103.0, + "step": 19245 + }, + { + "epoch": 2.4482890217529576, + "ewc_loss": 0.008401289582252502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401289960602298e-05, + "grad_norm": 4.189080238342285, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8800467252731323, + "num_tokens": 734448069.0, + "step": 19246 + }, + { + "epoch": 2.448416232031548, + "ewc_loss": 0.008416601456701756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.416601485805586e-05, + "grad_norm": 4.445733547210693, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8704323172569275, + "num_tokens": 734486016.0, + "step": 19247 + }, + { + "epoch": 2.4485434423101387, + "ewc_loss": 0.008533813990652561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533814252587035e-05, + "grad_norm": 4.170564651489258, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8724808096885681, + "num_tokens": 734526943.0, + "step": 19248 + }, + { + "epoch": 2.448670652588729, + "ewc_loss": 0.008280424401164055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.280424663098529e-05, + "grad_norm": 4.158872127532959, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8951102495193481, + "num_tokens": 734566137.0, + "step": 19249 + }, + { + "epoch": 2.4487978628673197, + "ewc_loss": 0.008392308838665485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39230851852335e-05, + "grad_norm": 4.214211940765381, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8821311593055725, + "num_tokens": 734602366.0, + "step": 19250 + }, + { + "epoch": 2.4489250731459102, + "ewc_loss": 0.008403846062719822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.403846004512161e-05, + "grad_norm": 4.152381420135498, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8890776634216309, + "num_tokens": 734644251.0, + "step": 19251 + }, + { + "epoch": 2.4490522834245008, + "ewc_loss": 0.008362138643860817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362138760276139e-05, + "grad_norm": 4.209867000579834, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8773583173751831, + "num_tokens": 734682938.0, + "step": 19252 + }, + { + "epoch": 2.4491794937030913, + "ewc_loss": 0.008407572284340858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407572022406384e-05, + "grad_norm": 4.185075759887695, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8849917650222778, + "num_tokens": 734722975.0, + "step": 19253 + }, + { + "epoch": 2.449306703981682, + "ewc_loss": 0.008354974910616875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354974852409214e-05, + "grad_norm": 4.155064105987549, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8851921558380127, + "num_tokens": 734761361.0, + "step": 19254 + }, + { + "epoch": 2.4494339142602723, + "ewc_loss": 0.008373870514333248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373870514333248e-05, + "grad_norm": 4.214326858520508, + "learning_rate": 1e-06, + "loss": 0.3509, + 
"mean_token_accuracy": 0.8813464641571045, + "num_tokens": 734800597.0, + "step": 19255 + }, + { + "epoch": 2.449561124538863, + "ewc_loss": 0.008409658446907997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409658767050132e-05, + "grad_norm": 4.239116191864014, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8712602257728577, + "num_tokens": 734840575.0, + "step": 19256 + }, + { + "epoch": 2.4496883348174534, + "ewc_loss": 0.008388301357626915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.388300921069458e-05, + "grad_norm": 4.231286525726318, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8761303424835205, + "num_tokens": 734876943.0, + "step": 19257 + }, + { + "epoch": 2.449815545096044, + "ewc_loss": 0.008382805623114109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382805390283465e-05, + "grad_norm": 4.1787428855896, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8971470594406128, + "num_tokens": 734916436.0, + "step": 19258 + }, + { + "epoch": 2.4499427553746345, + "ewc_loss": 0.008351573720574379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.351573342224583e-05, + "grad_norm": 4.178997039794922, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8812601566314697, + "num_tokens": 734956336.0, + "step": 19259 + }, + { + "epoch": 2.450069965653225, + "ewc_loss": 0.008369912393391132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.369912393391132e-05, + "grad_norm": 4.1597580909729, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8813992738723755, + "num_tokens": 734997132.0, + "step": 19260 + }, + { + "epoch": 2.450197175931815, + "ewc_loss": 0.008354189805686474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354189776582643e-05, + "grad_norm": 4.212296009063721, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8733228445053101, + "num_tokens": 735036192.0, + "step": 19261 + }, + { + "epoch": 
2.450324386210406, + "ewc_loss": 0.008373119868338108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373119635507464e-05, + "grad_norm": 4.122278213500977, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8949441909790039, + "num_tokens": 735079368.0, + "step": 19262 + }, + { + "epoch": 2.450451596488996, + "ewc_loss": 0.008319098502397537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.319098560605198e-05, + "grad_norm": 4.234723091125488, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8882472515106201, + "num_tokens": 735109714.0, + "step": 19263 + }, + { + "epoch": 2.4505788067675867, + "ewc_loss": 0.008432338014245033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432337926933542e-05, + "grad_norm": 4.214459419250488, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8800041079521179, + "num_tokens": 735152712.0, + "step": 19264 + }, + { + "epoch": 2.450706017046177, + "ewc_loss": 0.008352452889084816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352453005500138e-05, + "grad_norm": 4.156619071960449, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8781222701072693, + "num_tokens": 735193313.0, + "step": 19265 + }, + { + "epoch": 2.4508332273247677, + "ewc_loss": 0.00832949485629797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.329495176440105e-05, + "grad_norm": 4.13284158706665, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8957843780517578, + "num_tokens": 735235413.0, + "step": 19266 + }, + { + "epoch": 2.4509604376033582, + "ewc_loss": 0.008359614759683609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.359614730579779e-05, + "grad_norm": 4.248056888580322, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8817907571792603, + "num_tokens": 735270809.0, + "step": 19267 + }, + { + "epoch": 2.4510876478819488, + "ewc_loss": 0.008417208679020405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.41720902826637e-05, + "grad_norm": 4.2358174324035645, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8760392069816589, + "num_tokens": 735309790.0, + "step": 19268 + }, + { + "epoch": 2.4512148581605393, + "ewc_loss": 0.008370980620384216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.370980503968894e-05, + "grad_norm": 4.223773956298828, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.884144127368927, + "num_tokens": 735341084.0, + "step": 19269 + }, + { + "epoch": 2.45134206843913, + "ewc_loss": 0.008384423330426216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38442356325686e-05, + "grad_norm": 4.226633071899414, + "learning_rate": 1e-06, + "loss": 0.2893, + "mean_token_accuracy": 0.8982788324356079, + "num_tokens": 735376964.0, + "step": 19270 + }, + { + "epoch": 2.4514692787177204, + "ewc_loss": 0.00839004386216402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390044240513816e-05, + "grad_norm": 4.175807476043701, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8909088373184204, + "num_tokens": 735413296.0, + "step": 19271 + }, + { + "epoch": 2.451596488996311, + "ewc_loss": 0.008366468362510204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.36646868265234e-05, + "grad_norm": 4.154272079467773, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8869622349739075, + "num_tokens": 735454992.0, + "step": 19272 + }, + { + "epoch": 2.4517236992749014, + "ewc_loss": 0.00837962981313467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.379630162380636e-05, + "grad_norm": 4.197936058044434, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8849008083343506, + "num_tokens": 735493315.0, + "step": 19273 + }, + { + "epoch": 2.451850909553492, + "ewc_loss": 0.008410535752773285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410535519942641e-05, + "grad_norm": 4.173600196838379, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 
0.8903275728225708, + "num_tokens": 735532666.0, + "step": 19274 + }, + { + "epoch": 2.4519781198320825, + "ewc_loss": 0.008376814424991608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.376814366783947e-05, + "grad_norm": 4.170903205871582, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8821241855621338, + "num_tokens": 735571987.0, + "step": 19275 + }, + { + "epoch": 2.452105330110673, + "ewc_loss": 0.008389188908040524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38918931549415e-05, + "grad_norm": 4.1878342628479, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8819616436958313, + "num_tokens": 735609409.0, + "step": 19276 + }, + { + "epoch": 2.4522325403892635, + "ewc_loss": 0.008398694917559624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.398694626521319e-05, + "grad_norm": 4.264251708984375, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8585425615310669, + "num_tokens": 735647299.0, + "step": 19277 + }, + { + "epoch": 2.452359750667854, + "ewc_loss": 0.008451560512185097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451560279354453e-05, + "grad_norm": 4.224165439605713, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8910689353942871, + "num_tokens": 735677757.0, + "step": 19278 + }, + { + "epoch": 2.4524869609464446, + "ewc_loss": 0.008394881151616573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.394881297135726e-05, + "grad_norm": 4.116598129272461, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8868655562400818, + "num_tokens": 735723924.0, + "step": 19279 + }, + { + "epoch": 2.452614171225035, + "ewc_loss": 0.0083544310182333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354430610779673e-05, + "grad_norm": 4.21240758895874, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8705653548240662, + "num_tokens": 735758105.0, + "step": 19280 + }, + { + "epoch": 2.4527413815036256, + "ewc_loss": 
0.008475386537611485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.475386130157858e-05, + "grad_norm": 4.198180675506592, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8805902004241943, + "num_tokens": 735797603.0, + "step": 19281 + }, + { + "epoch": 2.452868591782216, + "ewc_loss": 0.008432604372501373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432604226982221e-05, + "grad_norm": 4.203692436218262, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8584158420562744, + "num_tokens": 735838958.0, + "step": 19282 + }, + { + "epoch": 2.4529958020608067, + "ewc_loss": 0.008439537137746811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439537486992776e-05, + "grad_norm": 4.256002426147461, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8739415407180786, + "num_tokens": 735878894.0, + "step": 19283 + }, + { + "epoch": 2.453123012339397, + "ewc_loss": 0.00848660059273243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486600563628599e-05, + "grad_norm": 4.1746015548706055, + "learning_rate": 1e-06, + "loss": 0.2545, + "mean_token_accuracy": 0.9115989804267883, + "num_tokens": 735913222.0, + "step": 19284 + }, + { + "epoch": 2.4532502226179878, + "ewc_loss": 0.008408994413912296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.408994472119957e-05, + "grad_norm": 4.158902168273926, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8899678587913513, + "num_tokens": 735951884.0, + "step": 19285 + }, + { + "epoch": 2.453377432896578, + "ewc_loss": 0.008470509201288223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470509055769071e-05, + "grad_norm": 4.185640811920166, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.877967119216919, + "num_tokens": 735998955.0, + "step": 19286 + }, + { + "epoch": 2.4535046431751684, + "ewc_loss": 0.008452505804598331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.452505426248536e-05, + "grad_norm": 
4.154603958129883, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8831064701080322, + "num_tokens": 736039631.0, + "step": 19287 + }, + { + "epoch": 2.453631853453759, + "ewc_loss": 0.008431285619735718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.431285823462531e-05, + "grad_norm": 4.193953990936279, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8789719939231873, + "num_tokens": 736085100.0, + "step": 19288 + }, + { + "epoch": 2.4537590637323494, + "ewc_loss": 0.00846011284738779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460112439934164e-05, + "grad_norm": 4.2103800773620605, + "learning_rate": 1e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8968464732170105, + "num_tokens": 736119680.0, + "step": 19289 + }, + { + "epoch": 2.45388627401094, + "ewc_loss": 0.00843887124210596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438871009275317e-05, + "grad_norm": 4.164356231689453, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8942039012908936, + "num_tokens": 736159769.0, + "step": 19290 + }, + { + "epoch": 2.4540134842895305, + "ewc_loss": 0.008421928621828556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.42192821437493e-05, + "grad_norm": 4.1746931076049805, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8882830739021301, + "num_tokens": 736199566.0, + "step": 19291 + }, + { + "epoch": 2.454140694568121, + "ewc_loss": 0.008440951816737652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.440951933152974e-05, + "grad_norm": 4.186281204223633, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8877679109573364, + "num_tokens": 736239238.0, + "step": 19292 + }, + { + "epoch": 2.4542679048467115, + "ewc_loss": 0.008433510549366474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433510811300948e-05, + "grad_norm": 4.213951110839844, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8845807909965515, + 
"num_tokens": 736274576.0, + "step": 19293 + }, + { + "epoch": 2.454395115125302, + "ewc_loss": 0.00843979325145483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439792873105034e-05, + "grad_norm": 4.262174606323242, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8766325116157532, + "num_tokens": 736307509.0, + "step": 19294 + }, + { + "epoch": 2.4545223254038926, + "ewc_loss": 0.008459561504423618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45956164994277e-05, + "grad_norm": 4.181802272796631, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8785207271575928, + "num_tokens": 736348313.0, + "step": 19295 + }, + { + "epoch": 2.454649535682483, + "ewc_loss": 0.008384146727621555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38414634927176e-05, + "grad_norm": 4.241019248962402, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8798960447311401, + "num_tokens": 736380716.0, + "step": 19296 + }, + { + "epoch": 2.4547767459610736, + "ewc_loss": 0.008473435416817665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473435445921496e-05, + "grad_norm": 4.199970722198486, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8631581664085388, + "num_tokens": 736422669.0, + "step": 19297 + }, + { + "epoch": 2.454903956239664, + "ewc_loss": 0.008405647240579128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.405647531617433e-05, + "grad_norm": 4.230830192565918, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8911429047584534, + "num_tokens": 736452929.0, + "step": 19298 + }, + { + "epoch": 2.4550311665182547, + "ewc_loss": 0.008456683717668056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.456684008706361e-05, + "grad_norm": 4.228623390197754, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8830673694610596, + "num_tokens": 736486755.0, + "step": 19299 + }, + { + "epoch": 2.4551583767968452, + "ewc_loss": 0.00846262089908123, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462621190119535e-05, + "grad_norm": 4.192409038543701, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8914110660552979, + "num_tokens": 736521542.0, + "step": 19300 + }, + { + "epoch": 2.4552855870754358, + "ewc_loss": 0.008472036570310593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47203700686805e-05, + "grad_norm": 4.1719465255737305, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8840211629867554, + "num_tokens": 736557312.0, + "step": 19301 + }, + { + "epoch": 2.4554127973540263, + "ewc_loss": 0.008477095514535904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477095980197191e-05, + "grad_norm": 4.20676326751709, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8786889314651489, + "num_tokens": 736596469.0, + "step": 19302 + }, + { + "epoch": 2.455540007632617, + "ewc_loss": 0.008518826216459274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518826507497579e-05, + "grad_norm": 4.222444534301758, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8685950040817261, + "num_tokens": 736636078.0, + "step": 19303 + }, + { + "epoch": 2.4556672179112073, + "ewc_loss": 0.008510816842317581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510817133355886e-05, + "grad_norm": 4.173398971557617, + "learning_rate": 1e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.894705057144165, + "num_tokens": 736674117.0, + "step": 19304 + }, + { + "epoch": 2.455794428189798, + "ewc_loss": 0.008480390533804893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480390533804893e-05, + "grad_norm": 4.237236976623535, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8801267147064209, + "num_tokens": 736709116.0, + "step": 19305 + }, + { + "epoch": 2.4559216384683884, + "ewc_loss": 0.008540919050574303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540919225197285e-05, + "grad_norm": 4.206973075866699, + 
"learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8764148950576782, + "num_tokens": 736750383.0, + "step": 19306 + }, + { + "epoch": 2.456048848746979, + "ewc_loss": 0.008488481864333153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.488481398671865e-05, + "grad_norm": 4.191828727722168, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8728379011154175, + "num_tokens": 736790836.0, + "step": 19307 + }, + { + "epoch": 2.4561760590255695, + "ewc_loss": 0.008492442779242992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492443157592788e-05, + "grad_norm": 4.2368083000183105, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8822822570800781, + "num_tokens": 736826085.0, + "step": 19308 + }, + { + "epoch": 2.4563032693041595, + "ewc_loss": 0.00853007473051548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530074410373345e-05, + "grad_norm": 4.195334434509277, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8824421167373657, + "num_tokens": 736860599.0, + "step": 19309 + }, + { + "epoch": 2.4564304795827505, + "ewc_loss": 0.008469609543681145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469609747407958e-05, + "grad_norm": 4.178624629974365, + "learning_rate": 1e-06, + "loss": 0.281, + "mean_token_accuracy": 0.8986906409263611, + "num_tokens": 736893286.0, + "step": 19310 + }, + { + "epoch": 2.4565576898613406, + "ewc_loss": 0.008504797704517841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.504797733621672e-05, + "grad_norm": 4.208248138427734, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8923300504684448, + "num_tokens": 736931597.0, + "step": 19311 + }, + { + "epoch": 2.456684900139931, + "ewc_loss": 0.008509412407875061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509412873536348e-05, + "grad_norm": 4.185355186462402, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8910262584686279, + "num_tokens": 736972183.0, + 
"step": 19312 + }, + { + "epoch": 2.4568121104185217, + "ewc_loss": 0.008485578931868076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485579019179568e-05, + "grad_norm": 4.24840784072876, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.888708233833313, + "num_tokens": 737004184.0, + "step": 19313 + }, + { + "epoch": 2.456939320697112, + "ewc_loss": 0.008525735698640347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525735756848007e-05, + "grad_norm": 4.197353363037109, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8829866051673889, + "num_tokens": 737042644.0, + "step": 19314 + }, + { + "epoch": 2.4570665309757027, + "ewc_loss": 0.008490754291415215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490754407830536e-05, + "grad_norm": 4.280852317810059, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8744628429412842, + "num_tokens": 737074830.0, + "step": 19315 + }, + { + "epoch": 2.4571937412542932, + "ewc_loss": 0.008547131903469563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547132165404037e-05, + "grad_norm": 4.202232360839844, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8903337717056274, + "num_tokens": 737107756.0, + "step": 19316 + }, + { + "epoch": 2.4573209515328838, + "ewc_loss": 0.008462675847113132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462675759801641e-05, + "grad_norm": 4.227415084838867, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.891683042049408, + "num_tokens": 737143457.0, + "step": 19317 + }, + { + "epoch": 2.4574481618114743, + "ewc_loss": 0.00852569192647934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525692101102322e-05, + "grad_norm": 4.242175579071045, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8782816529273987, + "num_tokens": 737177005.0, + "step": 19318 + }, + { + "epoch": 2.457575372090065, + "ewc_loss": 0.008520688861608505, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.520688425051048e-05, + "grad_norm": 4.1796698570251465, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8757293224334717, + "num_tokens": 737218265.0, + "step": 19319 + }, + { + "epoch": 2.4577025823686554, + "ewc_loss": 0.00848214142024517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482141856802627e-05, + "grad_norm": 4.220498085021973, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8738229274749756, + "num_tokens": 737259495.0, + "step": 19320 + }, + { + "epoch": 2.457829792647246, + "ewc_loss": 0.00853536557406187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535365486750379e-05, + "grad_norm": 4.185615539550781, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.875319242477417, + "num_tokens": 737300073.0, + "step": 19321 + }, + { + "epoch": 2.4579570029258364, + "ewc_loss": 0.008518213406205177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518213144270703e-05, + "grad_norm": 4.193271160125732, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8839160203933716, + "num_tokens": 737334716.0, + "step": 19322 + }, + { + "epoch": 2.458084213204427, + "ewc_loss": 0.00853703636676073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537036774214357e-05, + "grad_norm": 4.148466110229492, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8863580226898193, + "num_tokens": 737377086.0, + "step": 19323 + }, + { + "epoch": 2.4582114234830175, + "ewc_loss": 0.008504983969032764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.504983998136595e-05, + "grad_norm": 4.200227737426758, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8725703358650208, + "num_tokens": 737418259.0, + "step": 19324 + }, + { + "epoch": 2.458338633761608, + "ewc_loss": 0.008552060462534428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552060171496123e-05, + "grad_norm": 4.231277942657471, + "learning_rate": 1e-06, + "loss": 0.3608, 
+ "mean_token_accuracy": 0.874590277671814, + "num_tokens": 737453150.0, + "step": 19325 + }, + { + "epoch": 2.4584658440401985, + "ewc_loss": 0.008544446900486946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544447337044403e-05, + "grad_norm": 4.1379008293151855, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8966377973556519, + "num_tokens": 737495552.0, + "step": 19326 + }, + { + "epoch": 2.458593054318789, + "ewc_loss": 0.00848650187253952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486501610605046e-05, + "grad_norm": 4.165281295776367, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.889461874961853, + "num_tokens": 737537518.0, + "step": 19327 + }, + { + "epoch": 2.4587202645973796, + "ewc_loss": 0.008522492833435535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522492862539366e-05, + "grad_norm": 4.1823320388793945, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.880535364151001, + "num_tokens": 737577472.0, + "step": 19328 + }, + { + "epoch": 2.45884747487597, + "ewc_loss": 0.008525054901838303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52505472721532e-05, + "grad_norm": 4.1993865966796875, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8738744258880615, + "num_tokens": 737617250.0, + "step": 19329 + }, + { + "epoch": 2.4589746851545606, + "ewc_loss": 0.008512191474437714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512191561749205e-05, + "grad_norm": 4.162460803985596, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8859354257583618, + "num_tokens": 737655544.0, + "step": 19330 + }, + { + "epoch": 2.459101895433151, + "ewc_loss": 0.008483152836561203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483152487315238e-05, + "grad_norm": 4.213831901550293, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.886277437210083, + "num_tokens": 737688644.0, + "step": 19331 + }, + { + "epoch": 
2.4592291057117417, + "ewc_loss": 0.0085306940600276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530694321962073e-05, + "grad_norm": 4.164946556091309, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.885223925113678, + "num_tokens": 737730264.0, + "step": 19332 + }, + { + "epoch": 2.459356315990332, + "ewc_loss": 0.008454716764390469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454716589767486e-05, + "grad_norm": 4.248441696166992, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8793140649795532, + "num_tokens": 737763578.0, + "step": 19333 + }, + { + "epoch": 2.4594835262689223, + "ewc_loss": 0.008520941250026226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520941628376022e-05, + "grad_norm": 4.14985466003418, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8845652937889099, + "num_tokens": 737804062.0, + "step": 19334 + }, + { + "epoch": 2.4596107365475133, + "ewc_loss": 0.008434095419943333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.434095798293129e-05, + "grad_norm": 4.123621463775635, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8952693343162537, + "num_tokens": 737845608.0, + "step": 19335 + }, + { + "epoch": 2.4597379468261034, + "ewc_loss": 0.00846004392951727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460044045932591e-05, + "grad_norm": 4.243884563446045, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8669448494911194, + "num_tokens": 737881366.0, + "step": 19336 + }, + { + "epoch": 2.459865157104694, + "ewc_loss": 0.00852262508124113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522625284967944e-05, + "grad_norm": 4.234099388122559, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8764413595199585, + "num_tokens": 737922595.0, + "step": 19337 + }, + { + "epoch": 2.4599923673832844, + "ewc_loss": 0.008463787846267223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.463787526125088e-05, + "grad_norm": 4.239622592926025, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8885570168495178, + "num_tokens": 737956368.0, + "step": 19338 + }, + { + "epoch": 2.460119577661875, + "ewc_loss": 0.008462231606245041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462231198791414e-05, + "grad_norm": 4.126619338989258, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8862574100494385, + "num_tokens": 738000167.0, + "step": 19339 + }, + { + "epoch": 2.4602467879404655, + "ewc_loss": 0.008405670523643494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.405670814681798e-05, + "grad_norm": 4.238432884216309, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8766292929649353, + "num_tokens": 738040850.0, + "step": 19340 + }, + { + "epoch": 2.460373998219056, + "ewc_loss": 0.008501802571117878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501802949467674e-05, + "grad_norm": 4.217376708984375, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8822354078292847, + "num_tokens": 738080370.0, + "step": 19341 + }, + { + "epoch": 2.4605012084976465, + "ewc_loss": 0.008433368988335133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433368930127472e-05, + "grad_norm": 4.2207512855529785, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8878321051597595, + "num_tokens": 738116751.0, + "step": 19342 + }, + { + "epoch": 2.460628418776237, + "ewc_loss": 0.008441952057182789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.441951649729162e-05, + "grad_norm": 4.182513236999512, + "learning_rate": 1e-06, + "loss": 0.2695, + "mean_token_accuracy": 0.8994429111480713, + "num_tokens": 738154597.0, + "step": 19343 + }, + { + "epoch": 2.4607556290548276, + "ewc_loss": 0.008428437635302544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.428437286056578e-05, + "grad_norm": 4.2488579750061035, + "learning_rate": 1e-06, + "loss": 0.3179, + 
"mean_token_accuracy": 0.8899637460708618, + "num_tokens": 738186484.0, + "step": 19344 + }, + { + "epoch": 2.460882839333418, + "ewc_loss": 0.008465979248285294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46597904455848e-05, + "grad_norm": 4.20952033996582, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8864681720733643, + "num_tokens": 738217722.0, + "step": 19345 + }, + { + "epoch": 2.4610100496120086, + "ewc_loss": 0.008440944366157055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.440943929599598e-05, + "grad_norm": 4.207583427429199, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8960258364677429, + "num_tokens": 738255304.0, + "step": 19346 + }, + { + "epoch": 2.461137259890599, + "ewc_loss": 0.008448573760688305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448573498753831e-05, + "grad_norm": 4.236921310424805, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8784304261207581, + "num_tokens": 738292162.0, + "step": 19347 + }, + { + "epoch": 2.4612644701691897, + "ewc_loss": 0.008467440493404865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46744078444317e-05, + "grad_norm": 4.15460205078125, + "learning_rate": 1e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8947474956512451, + "num_tokens": 738332851.0, + "step": 19348 + }, + { + "epoch": 2.4613916804477802, + "ewc_loss": 0.008408481255173683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.408481517108157e-05, + "grad_norm": 4.1479573249816895, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.892404317855835, + "num_tokens": 738375120.0, + "step": 19349 + }, + { + "epoch": 2.4615188907263708, + "ewc_loss": 0.008447293192148209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447293657809496e-05, + "grad_norm": 4.212018013000488, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8744817972183228, + "num_tokens": 738413857.0, + "step": 19350 + }, + { + "epoch": 
2.4616461010049613, + "ewc_loss": 0.008459577336907387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45957692945376e-05, + "grad_norm": 4.204738616943359, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8758451342582703, + "num_tokens": 738450097.0, + "step": 19351 + }, + { + "epoch": 2.461773311283552, + "ewc_loss": 0.00843370333313942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433703624177724e-05, + "grad_norm": 4.24621057510376, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8652820587158203, + "num_tokens": 738483784.0, + "step": 19352 + }, + { + "epoch": 2.4619005215621423, + "ewc_loss": 0.00849186908453703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491869084537029e-05, + "grad_norm": 4.185464859008789, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.865928053855896, + "num_tokens": 738526820.0, + "step": 19353 + }, + { + "epoch": 2.462027731840733, + "ewc_loss": 0.008436471223831177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43647139845416e-05, + "grad_norm": 4.165841102600098, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8703502416610718, + "num_tokens": 738569049.0, + "step": 19354 + }, + { + "epoch": 2.4621549421193234, + "ewc_loss": 0.008446754887700081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446755236946046e-05, + "grad_norm": 4.248550891876221, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8797538876533508, + "num_tokens": 738608320.0, + "step": 19355 + }, + { + "epoch": 2.462282152397914, + "ewc_loss": 0.008502570912241936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502571290591732e-05, + "grad_norm": 4.179222583770752, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8970569372177124, + "num_tokens": 738645566.0, + "step": 19356 + }, + { + "epoch": 2.4624093626765045, + "ewc_loss": 0.008430293761193752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.430294110439718e-05, + "grad_norm": 4.205109119415283, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8604552149772644, + "num_tokens": 738686184.0, + "step": 19357 + }, + { + "epoch": 2.462536572955095, + "ewc_loss": 0.00845659151673317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.456591604044661e-05, + "grad_norm": 4.170269966125488, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8725001215934753, + "num_tokens": 738728820.0, + "step": 19358 + }, + { + "epoch": 2.462663783233685, + "ewc_loss": 0.008439531549811363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439531666226685e-05, + "grad_norm": 4.288659572601318, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8562161922454834, + "num_tokens": 738760180.0, + "step": 19359 + }, + { + "epoch": 2.462790993512276, + "ewc_loss": 0.008513471111655235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51347140269354e-05, + "grad_norm": 4.200090408325195, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8903104066848755, + "num_tokens": 738795501.0, + "step": 19360 + }, + { + "epoch": 2.462918203790866, + "ewc_loss": 0.008426694199442863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.42669396661222e-05, + "grad_norm": 4.218788146972656, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8751760721206665, + "num_tokens": 738831027.0, + "step": 19361 + }, + { + "epoch": 2.4630454140694567, + "ewc_loss": 0.008485345169901848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485345460940152e-05, + "grad_norm": 4.159726619720459, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8661637902259827, + "num_tokens": 738870383.0, + "step": 19362 + }, + { + "epoch": 2.463172624348047, + "ewc_loss": 0.008459202013909817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45920221763663e-05, + "grad_norm": 4.2189788818359375, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 
0.8881263732910156, + "num_tokens": 738908231.0, + "step": 19363 + }, + { + "epoch": 2.4632998346266377, + "ewc_loss": 0.008501579985022545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501579577568918e-05, + "grad_norm": 4.148299694061279, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8748369216918945, + "num_tokens": 738950927.0, + "step": 19364 + }, + { + "epoch": 2.4634270449052282, + "ewc_loss": 0.00845751166343689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.457511285087094e-05, + "grad_norm": 4.217354774475098, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8936640620231628, + "num_tokens": 738989857.0, + "step": 19365 + }, + { + "epoch": 2.4635542551838188, + "ewc_loss": 0.008511900901794434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511901251040399e-05, + "grad_norm": 4.241650581359863, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8953405618667603, + "num_tokens": 739021281.0, + "step": 19366 + }, + { + "epoch": 2.4636814654624093, + "ewc_loss": 0.008482581935822964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482582052238286e-05, + "grad_norm": 4.166236877441406, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8898841142654419, + "num_tokens": 739063475.0, + "step": 19367 + }, + { + "epoch": 2.463808675741, + "ewc_loss": 0.008446265012025833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44626483740285e-05, + "grad_norm": 4.169337272644043, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8785725235939026, + "num_tokens": 739107083.0, + "step": 19368 + }, + { + "epoch": 2.4639358860195903, + "ewc_loss": 0.00846935622394085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469355816487223e-05, + "grad_norm": 4.191518783569336, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8637598156929016, + "num_tokens": 739152002.0, + "step": 19369 + }, + { + "epoch": 2.464063096298181, + "ewc_loss": 
0.008481033146381378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481033000862226e-05, + "grad_norm": 4.196065902709961, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8805329203605652, + "num_tokens": 739191781.0, + "step": 19370 + }, + { + "epoch": 2.4641903065767714, + "ewc_loss": 0.008482598699629307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482598786940798e-05, + "grad_norm": 4.172695636749268, + "learning_rate": 1e-06, + "loss": 0.2817, + "mean_token_accuracy": 0.9013223648071289, + "num_tokens": 739230374.0, + "step": 19371 + }, + { + "epoch": 2.464317516855362, + "ewc_loss": 0.00845215655863285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.452156180283055e-05, + "grad_norm": 4.21627950668335, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8807909488677979, + "num_tokens": 739267310.0, + "step": 19372 + }, + { + "epoch": 2.4644447271339525, + "ewc_loss": 0.00849843118339777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498431270709261e-05, + "grad_norm": 4.228338718414307, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8854315280914307, + "num_tokens": 739305074.0, + "step": 19373 + }, + { + "epoch": 2.464571937412543, + "ewc_loss": 0.008473584428429604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473584603052586e-05, + "grad_norm": 4.232253551483154, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8758043050765991, + "num_tokens": 739343028.0, + "step": 19374 + }, + { + "epoch": 2.4646991476911335, + "ewc_loss": 0.008454874157905579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454874478047714e-05, + "grad_norm": 4.193840980529785, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8811554908752441, + "num_tokens": 739384364.0, + "step": 19375 + }, + { + "epoch": 2.464826357969724, + "ewc_loss": 0.008450323715806007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450324094155803e-05, + "grad_norm": 
4.1514892578125, + "learning_rate": 1e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.8979389667510986, + "num_tokens": 739423564.0, + "step": 19376 + }, + { + "epoch": 2.4649535682483146, + "ewc_loss": 0.008427629247307777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.427628927165642e-05, + "grad_norm": 4.189769268035889, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8924500942230225, + "num_tokens": 739457609.0, + "step": 19377 + }, + { + "epoch": 2.465080778526905, + "ewc_loss": 0.008459367789328098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.459367381874472e-05, + "grad_norm": 4.193627834320068, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8830541372299194, + "num_tokens": 739497204.0, + "step": 19378 + }, + { + "epoch": 2.4652079888054956, + "ewc_loss": 0.008440656587481499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.440656529273838e-05, + "grad_norm": 4.157049655914307, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8776353001594543, + "num_tokens": 739543688.0, + "step": 19379 + }, + { + "epoch": 2.465335199084086, + "ewc_loss": 0.008410497568547726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410497684963048e-05, + "grad_norm": 4.228997707366943, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8938277363777161, + "num_tokens": 739578221.0, + "step": 19380 + }, + { + "epoch": 2.4654624093626767, + "ewc_loss": 0.008468765765428543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.468765736324713e-05, + "grad_norm": 4.19589900970459, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8875178098678589, + "num_tokens": 739615892.0, + "step": 19381 + }, + { + "epoch": 2.4655896196412668, + "ewc_loss": 0.008405329659581184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.405329572269693e-05, + "grad_norm": 4.161525726318359, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8682644367218018, + 
"num_tokens": 739657697.0, + "step": 19382 + }, + { + "epoch": 2.4657168299198577, + "ewc_loss": 0.008425330743193626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.425330452155322e-05, + "grad_norm": 4.178174018859863, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8907520771026611, + "num_tokens": 739694173.0, + "step": 19383 + }, + { + "epoch": 2.465844040198448, + "ewc_loss": 0.008448194712400436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448194421362132e-05, + "grad_norm": 4.198326110839844, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8814205527305603, + "num_tokens": 739734111.0, + "step": 19384 + }, + { + "epoch": 2.4659712504770384, + "ewc_loss": 0.008443685248494148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443684782832861e-05, + "grad_norm": 4.2240376472473145, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8838378190994263, + "num_tokens": 739771409.0, + "step": 19385 + }, + { + "epoch": 2.466098460755629, + "ewc_loss": 0.008450317196547985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45031754579395e-05, + "grad_norm": 4.248305797576904, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8706777095794678, + "num_tokens": 739807185.0, + "step": 19386 + }, + { + "epoch": 2.4662256710342194, + "ewc_loss": 0.008451631292700768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45163085614331e-05, + "grad_norm": 4.215117931365967, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8939378261566162, + "num_tokens": 739837750.0, + "step": 19387 + }, + { + "epoch": 2.46635288131281, + "ewc_loss": 0.008450119756162167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450119639746845e-05, + "grad_norm": 4.192281246185303, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.882341742515564, + "num_tokens": 739879516.0, + "step": 19388 + }, + { + "epoch": 2.4664800915914005, + "ewc_loss": 0.008421937935054302, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421937673119828e-05, + "grad_norm": 4.166767597198486, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8889772891998291, + "num_tokens": 739918688.0, + "step": 19389 + }, + { + "epoch": 2.466607301869991, + "ewc_loss": 0.008438116870820522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438116492470726e-05, + "grad_norm": 4.197666168212891, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8855595588684082, + "num_tokens": 739958978.0, + "step": 19390 + }, + { + "epoch": 2.4667345121485815, + "ewc_loss": 0.008447017520666122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447017899015918e-05, + "grad_norm": 4.118133068084717, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.887177050113678, + "num_tokens": 740002428.0, + "step": 19391 + }, + { + "epoch": 2.466861722427172, + "ewc_loss": 0.008390924893319607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390924631385133e-05, + "grad_norm": 4.161290645599365, + "learning_rate": 1e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.9017488956451416, + "num_tokens": 740045856.0, + "step": 19392 + }, + { + "epoch": 2.4669889327057626, + "ewc_loss": 0.008443423546850681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443423575954512e-05, + "grad_norm": 4.208517074584961, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8813140988349915, + "num_tokens": 740084512.0, + "step": 19393 + }, + { + "epoch": 2.467116142984353, + "ewc_loss": 0.00844104029238224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.441039972240105e-05, + "grad_norm": 4.114750385284424, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.892325222492218, + "num_tokens": 740129624.0, + "step": 19394 + }, + { + "epoch": 2.4672433532629436, + "ewc_loss": 0.00836237333714962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.362373773707077e-05, + "grad_norm": 4.261268138885498, + "learning_rate": 
1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8675015568733215, + "num_tokens": 740168368.0, + "step": 19395 + }, + { + "epoch": 2.467370563541534, + "ewc_loss": 0.00849548913538456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495489601045847e-05, + "grad_norm": 4.221231460571289, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8764417171478271, + "num_tokens": 740206670.0, + "step": 19396 + }, + { + "epoch": 2.4674977738201247, + "ewc_loss": 0.008412959985435009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.412959869019687e-05, + "grad_norm": 4.217051982879639, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8627458810806274, + "num_tokens": 740246422.0, + "step": 19397 + }, + { + "epoch": 2.4676249840987152, + "ewc_loss": 0.008437029086053371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437029464403167e-05, + "grad_norm": 4.199391841888428, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8800874352455139, + "num_tokens": 740284450.0, + "step": 19398 + }, + { + "epoch": 2.4677521943773058, + "ewc_loss": 0.008410690352320671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410690497839823e-05, + "grad_norm": 4.215240478515625, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8615703582763672, + "num_tokens": 740319062.0, + "step": 19399 + }, + { + "epoch": 2.4678794046558963, + "ewc_loss": 0.008445009589195251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.445009734714404e-05, + "grad_norm": 4.218307018280029, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8858727216720581, + "num_tokens": 740353996.0, + "step": 19400 + }, + { + "epoch": 2.468006614934487, + "ewc_loss": 0.00843340065330267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433400216745213e-05, + "grad_norm": 4.184953212738037, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8816642761230469, + "num_tokens": 740394694.0, + "step": 19401 + }, 
+ { + "epoch": 2.4681338252130773, + "ewc_loss": 0.008436114527285099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.436114148935303e-05, + "grad_norm": 4.128726959228516, + "learning_rate": 1e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.9007871150970459, + "num_tokens": 740436567.0, + "step": 19402 + }, + { + "epoch": 2.468261035491668, + "ewc_loss": 0.008411267772316933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.411267481278628e-05, + "grad_norm": 4.205277919769287, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8766797184944153, + "num_tokens": 740476458.0, + "step": 19403 + }, + { + "epoch": 2.4683882457702584, + "ewc_loss": 0.00845397636294365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.453976624878123e-05, + "grad_norm": 4.143007278442383, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8913301229476929, + "num_tokens": 740514703.0, + "step": 19404 + }, + { + "epoch": 2.468515456048849, + "ewc_loss": 0.008393715135753155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.393714961130172e-05, + "grad_norm": 4.168328285217285, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8920093178749084, + "num_tokens": 740552328.0, + "step": 19405 + }, + { + "epoch": 2.4686426663274394, + "ewc_loss": 0.008458249270915985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.458249067189172e-05, + "grad_norm": 4.252287864685059, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8643060922622681, + "num_tokens": 740593141.0, + "step": 19406 + }, + { + "epoch": 2.4687698766060295, + "ewc_loss": 0.008467413485050201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467413135804236e-05, + "grad_norm": 4.135133266448975, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8924306631088257, + "num_tokens": 740634097.0, + "step": 19407 + }, + { + "epoch": 2.4688970868846205, + "ewc_loss": 0.008387536741793156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.387536945519969e-05, + "grad_norm": 4.19880485534668, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8865343928337097, + "num_tokens": 740672917.0, + "step": 19408 + }, + { + "epoch": 2.4690242971632106, + "ewc_loss": 0.008465074934065342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.465075370622799e-05, + "grad_norm": 4.222231864929199, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8784101605415344, + "num_tokens": 740709351.0, + "step": 19409 + }, + { + "epoch": 2.469151507441801, + "ewc_loss": 0.008459468372166157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.459468517685309e-05, + "grad_norm": 4.245265960693359, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8740972280502319, + "num_tokens": 740744049.0, + "step": 19410 + }, + { + "epoch": 2.4692787177203916, + "ewc_loss": 0.00845262035727501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45262038637884e-05, + "grad_norm": 4.158044338226318, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8854938745498657, + "num_tokens": 740784765.0, + "step": 19411 + }, + { + "epoch": 2.469405927998982, + "ewc_loss": 0.008401759900152683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.401759987464175e-05, + "grad_norm": 4.366004467010498, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8765486478805542, + "num_tokens": 740815859.0, + "step": 19412 + }, + { + "epoch": 2.4695331382775727, + "ewc_loss": 0.008547921665012836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547921606805176e-05, + "grad_norm": 4.188446998596191, + "learning_rate": 1e-06, + "loss": 0.2795, + "mean_token_accuracy": 0.9004724621772766, + "num_tokens": 740854300.0, + "step": 19413 + }, + { + "epoch": 2.4696603485561632, + "ewc_loss": 0.008366064168512821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.366064139408991e-05, + "grad_norm": 4.2114410400390625, + "learning_rate": 1e-06, + "loss": 0.3868, + 
"mean_token_accuracy": 0.8671557903289795, + "num_tokens": 740891836.0, + "step": 19414 + }, + { + "epoch": 2.4697875588347538, + "ewc_loss": 0.008483046665787697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48304625833407e-05, + "grad_norm": 4.204893112182617, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8893697261810303, + "num_tokens": 740927434.0, + "step": 19415 + }, + { + "epoch": 2.4699147691133443, + "ewc_loss": 0.008459224365651608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.459224773105234e-05, + "grad_norm": 4.117133617401123, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8845429420471191, + "num_tokens": 740973707.0, + "step": 19416 + }, + { + "epoch": 2.470041979391935, + "ewc_loss": 0.008403589017689228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.40358916320838e-05, + "grad_norm": 4.160372257232666, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8900133371353149, + "num_tokens": 741015953.0, + "step": 19417 + }, + { + "epoch": 2.4701691896705253, + "ewc_loss": 0.008476740680634975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476740913465619e-05, + "grad_norm": 4.2359819412231445, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8894554376602173, + "num_tokens": 741049634.0, + "step": 19418 + }, + { + "epoch": 2.470296399949116, + "ewc_loss": 0.008476410992443562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476410584989935e-05, + "grad_norm": 4.165232181549072, + "learning_rate": 1e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8920153379440308, + "num_tokens": 741084267.0, + "step": 19419 + }, + { + "epoch": 2.4704236102277064, + "ewc_loss": 0.008430706337094307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.430706657236442e-05, + "grad_norm": 4.1734619140625, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.889607310295105, + "num_tokens": 741122237.0, + "step": 19420 + }, + { + "epoch": 
2.470550820506297, + "ewc_loss": 0.008459539152681828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.459539094474167e-05, + "grad_norm": 4.2103352546691895, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8721295595169067, + "num_tokens": 741163593.0, + "step": 19421 + }, + { + "epoch": 2.4706780307848875, + "ewc_loss": 0.008457288146018982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.457287913188338e-05, + "grad_norm": 4.193456172943115, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8717319369316101, + "num_tokens": 741202328.0, + "step": 19422 + }, + { + "epoch": 2.470805241063478, + "ewc_loss": 0.008441945537924767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44194510136731e-05, + "grad_norm": 4.162043571472168, + "learning_rate": 1e-06, + "loss": 0.2785, + "mean_token_accuracy": 0.9021523594856262, + "num_tokens": 741240095.0, + "step": 19423 + }, + { + "epoch": 2.4709324513420685, + "ewc_loss": 0.008444386534392834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.444386185146868e-05, + "grad_norm": 4.240934371948242, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8846614360809326, + "num_tokens": 741278509.0, + "step": 19424 + }, + { + "epoch": 2.471059661620659, + "ewc_loss": 0.008475970476865768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.475970389554277e-05, + "grad_norm": 4.174203872680664, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.879104495048523, + "num_tokens": 741321446.0, + "step": 19425 + }, + { + "epoch": 2.4711868718992496, + "ewc_loss": 0.008412250317633152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.412250463152304e-05, + "grad_norm": 4.213153839111328, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8793053030967712, + "num_tokens": 741360469.0, + "step": 19426 + }, + { + "epoch": 2.47131408217784, + "ewc_loss": 0.008452788926661015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.452789188595489e-05, + "grad_norm": 4.234818935394287, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.873997688293457, + "num_tokens": 741393833.0, + "step": 19427 + }, + { + "epoch": 2.4714412924564306, + "ewc_loss": 0.008453508839011192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.453508780803531e-05, + "grad_norm": 4.253715515136719, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8884662389755249, + "num_tokens": 741426956.0, + "step": 19428 + }, + { + "epoch": 2.471568502735021, + "ewc_loss": 0.008443225175142288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443224942311645e-05, + "grad_norm": 4.175139904022217, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8826007843017578, + "num_tokens": 741464820.0, + "step": 19429 + }, + { + "epoch": 2.4716957130136117, + "ewc_loss": 0.008426014333963394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.426014392171055e-05, + "grad_norm": 4.258727073669434, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.888505220413208, + "num_tokens": 741499519.0, + "step": 19430 + }, + { + "epoch": 2.471822923292202, + "ewc_loss": 0.008486016653478146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486017031827942e-05, + "grad_norm": 4.195532321929932, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8897068500518799, + "num_tokens": 741533443.0, + "step": 19431 + }, + { + "epoch": 2.4719501335707923, + "ewc_loss": 0.00843450240790844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.434502524323761e-05, + "grad_norm": 4.175390243530273, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8802418112754822, + "num_tokens": 741576849.0, + "step": 19432 + }, + { + "epoch": 2.4720773438493833, + "ewc_loss": 0.008460522629320621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460522803943604e-05, + "grad_norm": 4.223975658416748, + "learning_rate": 1e-06, + "loss": 0.3276, + 
"mean_token_accuracy": 0.888654351234436, + "num_tokens": 741614653.0, + "step": 19433 + }, + { + "epoch": 2.4722045541279734, + "ewc_loss": 0.008477133698761463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477133815176785e-05, + "grad_norm": 4.255992412567139, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8769960403442383, + "num_tokens": 741649354.0, + "step": 19434 + }, + { + "epoch": 2.472331764406564, + "ewc_loss": 0.008493781089782715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.493781206198037e-05, + "grad_norm": 4.154390335083008, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8755395412445068, + "num_tokens": 741692257.0, + "step": 19435 + }, + { + "epoch": 2.4724589746851544, + "ewc_loss": 0.008424961939454079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.424961561104283e-05, + "grad_norm": 4.298067569732666, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8802785873413086, + "num_tokens": 741729739.0, + "step": 19436 + }, + { + "epoch": 2.472586184963745, + "ewc_loss": 0.008569207042455673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569207420805469e-05, + "grad_norm": 4.2823381423950195, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8739575147628784, + "num_tokens": 741763409.0, + "step": 19437 + }, + { + "epoch": 2.4727133952423355, + "ewc_loss": 0.008490484207868576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49048446980305e-05, + "grad_norm": 4.20673131942749, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8812879323959351, + "num_tokens": 741798746.0, + "step": 19438 + }, + { + "epoch": 2.472840605520926, + "ewc_loss": 0.008467726409435272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467726729577407e-05, + "grad_norm": 4.131605625152588, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8913619518280029, + "num_tokens": 741840488.0, + "step": 19439 + }, + { + "epoch": 
2.4729678157995165, + "ewc_loss": 0.008448044769465923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44804453663528e-05, + "grad_norm": 4.153121471405029, + "learning_rate": 1e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.8995070457458496, + "num_tokens": 741880839.0, + "step": 19440 + }, + { + "epoch": 2.473095026078107, + "ewc_loss": 0.008483137004077435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483137207804248e-05, + "grad_norm": 4.206665515899658, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8767979741096497, + "num_tokens": 741919317.0, + "step": 19441 + }, + { + "epoch": 2.4732222363566976, + "ewc_loss": 0.008502890355885029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502889977535233e-05, + "grad_norm": 4.217867374420166, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8648995161056519, + "num_tokens": 741955782.0, + "step": 19442 + }, + { + "epoch": 2.473349446635288, + "ewc_loss": 0.008493510074913502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.493509812979028e-05, + "grad_norm": 4.253598213195801, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8782340288162231, + "num_tokens": 741990410.0, + "step": 19443 + }, + { + "epoch": 2.4734766569138786, + "ewc_loss": 0.008542945608496666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542945579392835e-05, + "grad_norm": 4.2252197265625, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8783006072044373, + "num_tokens": 742031536.0, + "step": 19444 + }, + { + "epoch": 2.473603867192469, + "ewc_loss": 0.008490007370710373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49000716698356e-05, + "grad_norm": 4.241142272949219, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8725606203079224, + "num_tokens": 742067222.0, + "step": 19445 + }, + { + "epoch": 2.4737310774710597, + "ewc_loss": 0.008522674441337585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.522674033883959e-05, + "grad_norm": 4.097276210784912, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8936775326728821, + "num_tokens": 742114740.0, + "step": 19446 + }, + { + "epoch": 2.47385828774965, + "ewc_loss": 0.008429641835391521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.429641457041726e-05, + "grad_norm": 4.202160835266113, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8911266326904297, + "num_tokens": 742149344.0, + "step": 19447 + }, + { + "epoch": 2.4739854980282407, + "ewc_loss": 0.008545923978090286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545923628844321e-05, + "grad_norm": 4.200328350067139, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8848068714141846, + "num_tokens": 742191089.0, + "step": 19448 + }, + { + "epoch": 2.4741127083068313, + "ewc_loss": 0.008490395732223988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490395703120157e-05, + "grad_norm": 4.21619987487793, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8824205994606018, + "num_tokens": 742224944.0, + "step": 19449 + }, + { + "epoch": 2.474239918585422, + "ewc_loss": 0.008492250926792622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492251072311774e-05, + "grad_norm": 4.187587261199951, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8741487264633179, + "num_tokens": 742263674.0, + "step": 19450 + }, + { + "epoch": 2.4743671288640123, + "ewc_loss": 0.00848547462373972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485474972985685e-05, + "grad_norm": 4.242556571960449, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8766605854034424, + "num_tokens": 742300424.0, + "step": 19451 + }, + { + "epoch": 2.474494339142603, + "ewc_loss": 0.008519038558006287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519038965459913e-05, + "grad_norm": 4.1732563972473145, + "learning_rate": 1e-06, + "loss": 0.3132, + 
"mean_token_accuracy": 0.8931397199630737, + "num_tokens": 742333854.0, + "step": 19452 + }, + { + "epoch": 2.4746215494211934, + "ewc_loss": 0.008485945872962475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485945727443323e-05, + "grad_norm": 4.16029691696167, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8904644250869751, + "num_tokens": 742377126.0, + "step": 19453 + }, + { + "epoch": 2.474748759699784, + "ewc_loss": 0.008487881161272526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.487881132168695e-05, + "grad_norm": 4.150768756866455, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8816775679588318, + "num_tokens": 742421284.0, + "step": 19454 + }, + { + "epoch": 2.4748759699783744, + "ewc_loss": 0.008477195166051388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477194933220744e-05, + "grad_norm": 4.239919185638428, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8670664429664612, + "num_tokens": 742459124.0, + "step": 19455 + }, + { + "epoch": 2.475003180256965, + "ewc_loss": 0.008544541895389557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544541924493387e-05, + "grad_norm": 4.2442498207092285, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8742305636405945, + "num_tokens": 742493137.0, + "step": 19456 + }, + { + "epoch": 2.475130390535555, + "ewc_loss": 0.008504125289618969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.504125435138121e-05, + "grad_norm": 4.187470436096191, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8868908286094666, + "num_tokens": 742529805.0, + "step": 19457 + }, + { + "epoch": 2.475257600814146, + "ewc_loss": 0.008488276973366737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.488276944262907e-05, + "grad_norm": 4.2548909187316895, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.895515501499176, + "num_tokens": 742560859.0, + "step": 19458 + }, + { + "epoch": 
2.475384811092736, + "ewc_loss": 0.008536136709153652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536136738257483e-05, + "grad_norm": 4.172199249267578, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8951922655105591, + "num_tokens": 742601465.0, + "step": 19459 + }, + { + "epoch": 2.4755120213713266, + "ewc_loss": 0.008455896750092506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455896750092506e-05, + "grad_norm": 4.2337822914123535, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.876010537147522, + "num_tokens": 742639242.0, + "step": 19460 + }, + { + "epoch": 2.475639231649917, + "ewc_loss": 0.008507853373885155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.507853635819629e-05, + "grad_norm": 4.227730751037598, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8832446336746216, + "num_tokens": 742676574.0, + "step": 19461 + }, + { + "epoch": 2.4757664419285077, + "ewc_loss": 0.008500372059643269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500372496200725e-05, + "grad_norm": 4.263682842254639, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8729457259178162, + "num_tokens": 742713326.0, + "step": 19462 + }, + { + "epoch": 2.4758936522070982, + "ewc_loss": 0.008497383445501328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.497383532812819e-05, + "grad_norm": 4.185431480407715, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8602031469345093, + "num_tokens": 742756883.0, + "step": 19463 + }, + { + "epoch": 2.4760208624856888, + "ewc_loss": 0.0084538534283638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.453853661194444e-05, + "grad_norm": 4.225144386291504, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.879058837890625, + "num_tokens": 742794301.0, + "step": 19464 + }, + { + "epoch": 2.4761480727642793, + "ewc_loss": 0.008507071062922478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.507071470376104e-05, + "grad_norm": 4.195307731628418, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8841308951377869, + "num_tokens": 742832966.0, + "step": 19465 + }, + { + "epoch": 2.47627528304287, + "ewc_loss": 0.00847774837166071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477748633595183e-05, + "grad_norm": 4.202131748199463, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8795109987258911, + "num_tokens": 742872926.0, + "step": 19466 + }, + { + "epoch": 2.4764024933214603, + "ewc_loss": 0.008483322337269783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483322017127648e-05, + "grad_norm": 4.198009967803955, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.88920658826828, + "num_tokens": 742910838.0, + "step": 19467 + }, + { + "epoch": 2.476529703600051, + "ewc_loss": 0.008506983518600464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.506983431288972e-05, + "grad_norm": 4.162079811096191, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8900103569030762, + "num_tokens": 742950098.0, + "step": 19468 + }, + { + "epoch": 2.4766569138786414, + "ewc_loss": 0.008482837118208408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482837438350543e-05, + "grad_norm": 4.208884239196777, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8931898474693298, + "num_tokens": 742983872.0, + "step": 19469 + }, + { + "epoch": 2.476784124157232, + "ewc_loss": 0.008523537777364254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523537690052763e-05, + "grad_norm": 4.1756978034973145, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8724706172943115, + "num_tokens": 743029816.0, + "step": 19470 + }, + { + "epoch": 2.4769113344358225, + "ewc_loss": 0.008479301817715168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479302050545812e-05, + "grad_norm": 4.1972126960754395, + "learning_rate": 1e-06, + "loss": 0.3579, + 
"mean_token_accuracy": 0.8753768801689148, + "num_tokens": 743068565.0, + "step": 19471 + }, + { + "epoch": 2.477038544714413, + "ewc_loss": 0.008501344360411167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50134456413798e-05, + "grad_norm": 4.176564693450928, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8841525316238403, + "num_tokens": 743112137.0, + "step": 19472 + }, + { + "epoch": 2.4771657549930035, + "ewc_loss": 0.008476563729345798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476563380099833e-05, + "grad_norm": 4.198230266571045, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8953856825828552, + "num_tokens": 743145067.0, + "step": 19473 + }, + { + "epoch": 2.477292965271594, + "ewc_loss": 0.008497949689626694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.497949602315202e-05, + "grad_norm": 4.229732990264893, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8877988457679749, + "num_tokens": 743177017.0, + "step": 19474 + }, + { + "epoch": 2.4774201755501846, + "ewc_loss": 0.008486548438668251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486548904329538e-05, + "grad_norm": 4.189021110534668, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8826488256454468, + "num_tokens": 743218121.0, + "step": 19475 + }, + { + "epoch": 2.477547385828775, + "ewc_loss": 0.008454971015453339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454971248283982e-05, + "grad_norm": 4.269283294677734, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8890556693077087, + "num_tokens": 743256635.0, + "step": 19476 + }, + { + "epoch": 2.4776745961073656, + "ewc_loss": 0.008510304614901543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510304905939847e-05, + "grad_norm": 4.181854724884033, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8817205429077148, + "num_tokens": 743293801.0, + "step": 19477 + }, + { + "epoch": 
2.477801806385956, + "ewc_loss": 0.00843694619834423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.436946518486366e-05, + "grad_norm": 4.163724899291992, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8958667516708374, + "num_tokens": 743331582.0, + "step": 19478 + }, + { + "epoch": 2.4779290166645467, + "ewc_loss": 0.008454861119389534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454861381324008e-05, + "grad_norm": 4.162043571472168, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8860592842102051, + "num_tokens": 743369901.0, + "step": 19479 + }, + { + "epoch": 2.4780562269431368, + "ewc_loss": 0.008473504334688187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47350456751883e-05, + "grad_norm": 4.229045867919922, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8774452209472656, + "num_tokens": 743405808.0, + "step": 19480 + }, + { + "epoch": 2.4781834372217277, + "ewc_loss": 0.008503179997205734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.503180288244039e-05, + "grad_norm": 4.141208171844482, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8848220109939575, + "num_tokens": 743447236.0, + "step": 19481 + }, + { + "epoch": 2.478310647500318, + "ewc_loss": 0.008443559519946575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443559636361897e-05, + "grad_norm": 4.17981481552124, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8898389339447021, + "num_tokens": 743486494.0, + "step": 19482 + }, + { + "epoch": 2.4784378577789083, + "ewc_loss": 0.008481008931994438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481008990202099e-05, + "grad_norm": 4.220075607299805, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8784323334693909, + "num_tokens": 743523941.0, + "step": 19483 + }, + { + "epoch": 2.478565068057499, + "ewc_loss": 0.008481931872665882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.481932309223339e-05, + "grad_norm": 4.242171287536621, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8872015476226807, + "num_tokens": 743554169.0, + "step": 19484 + }, + { + "epoch": 2.4786922783360894, + "ewc_loss": 0.008486179634928703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486179285682738e-05, + "grad_norm": 4.225357532501221, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8676840662956238, + "num_tokens": 743594656.0, + "step": 19485 + }, + { + "epoch": 2.47881948861468, + "ewc_loss": 0.008471577428281307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471577166346833e-05, + "grad_norm": 4.232851982116699, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.889126181602478, + "num_tokens": 743628638.0, + "step": 19486 + }, + { + "epoch": 2.4789466988932705, + "ewc_loss": 0.008482704870402813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482705015921965e-05, + "grad_norm": 4.252666473388672, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8769135475158691, + "num_tokens": 743665382.0, + "step": 19487 + }, + { + "epoch": 2.479073909171861, + "ewc_loss": 0.00849570520222187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495704969391227e-05, + "grad_norm": 4.172854423522949, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8961908221244812, + "num_tokens": 743701712.0, + "step": 19488 + }, + { + "epoch": 2.4792011194504515, + "ewc_loss": 0.008441573940217495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.441574027528986e-05, + "grad_norm": 4.191604137420654, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8848131895065308, + "num_tokens": 743740347.0, + "step": 19489 + }, + { + "epoch": 2.479328329729042, + "ewc_loss": 0.008476197719573975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47619739943184e-05, + "grad_norm": 4.241815567016602, + "learning_rate": 1e-06, + "loss": 0.323, + 
"mean_token_accuracy": 0.8845536112785339, + "num_tokens": 743775434.0, + "step": 19490 + }, + { + "epoch": 2.4794555400076326, + "ewc_loss": 0.008487489074468613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.487488958053291e-05, + "grad_norm": 4.1859564781188965, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8828406929969788, + "num_tokens": 743812475.0, + "step": 19491 + }, + { + "epoch": 2.479582750286223, + "ewc_loss": 0.008435908704996109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.435908966930583e-05, + "grad_norm": 4.186519145965576, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8743811249732971, + "num_tokens": 743851350.0, + "step": 19492 + }, + { + "epoch": 2.4797099605648136, + "ewc_loss": 0.008474903181195259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474903006572276e-05, + "grad_norm": 4.2420125007629395, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.881298303604126, + "num_tokens": 743885086.0, + "step": 19493 + }, + { + "epoch": 2.479837170843404, + "ewc_loss": 0.008517377078533173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517377136740834e-05, + "grad_norm": 4.213163375854492, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8853285312652588, + "num_tokens": 743931747.0, + "step": 19494 + }, + { + "epoch": 2.4799643811219947, + "ewc_loss": 0.00845323782414198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.453237387584522e-05, + "grad_norm": 4.172377109527588, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8938564658164978, + "num_tokens": 743971806.0, + "step": 19495 + }, + { + "epoch": 2.480091591400585, + "ewc_loss": 0.008475887589156628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.475887443637475e-05, + "grad_norm": 4.241186618804932, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8938031792640686, + "num_tokens": 744005304.0, + "step": 19496 + }, + { + "epoch": 
2.4802188016791757, + "ewc_loss": 0.008503814227879047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.503814024152234e-05, + "grad_norm": 4.1374382972717285, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8756171464920044, + "num_tokens": 744051159.0, + "step": 19497 + }, + { + "epoch": 2.4803460119577663, + "ewc_loss": 0.008412668481469154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41266883071512e-05, + "grad_norm": 4.213789463043213, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8667880296707153, + "num_tokens": 744089856.0, + "step": 19498 + }, + { + "epoch": 2.480473222236357, + "ewc_loss": 0.008509640581905842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509640611009672e-05, + "grad_norm": 4.155959606170654, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8656601905822754, + "num_tokens": 744136355.0, + "step": 19499 + }, + { + "epoch": 2.4806004325149473, + "ewc_loss": 0.00845212209969759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.452121983282268e-05, + "grad_norm": 4.192641735076904, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8724443316459656, + "num_tokens": 744177682.0, + "step": 19500 + }, + { + "epoch": 2.480727642793538, + "ewc_loss": 0.008479940705001354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479940879624337e-05, + "grad_norm": 4.1826090812683105, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8834987878799438, + "num_tokens": 744214209.0, + "step": 19501 + }, + { + "epoch": 2.4808548530721284, + "ewc_loss": 0.008477158844470978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477158553432673e-05, + "grad_norm": 4.190912246704102, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8670189380645752, + "num_tokens": 744257923.0, + "step": 19502 + }, + { + "epoch": 2.480982063350719, + "ewc_loss": 0.008462722413241863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.462722325930372e-05, + "grad_norm": 4.235010147094727, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8896867036819458, + "num_tokens": 744293370.0, + "step": 19503 + }, + { + "epoch": 2.4811092736293094, + "ewc_loss": 0.008473648689687252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473648631479591e-05, + "grad_norm": 4.194859027862549, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.888076663017273, + "num_tokens": 744328572.0, + "step": 19504 + }, + { + "epoch": 2.4812364839078995, + "ewc_loss": 0.00844564102590084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.445641287835315e-05, + "grad_norm": 4.227540493011475, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8679251670837402, + "num_tokens": 744364124.0, + "step": 19505 + }, + { + "epoch": 2.4813636941864905, + "ewc_loss": 0.008469365537166595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469365275232121e-05, + "grad_norm": 4.19625997543335, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8867793679237366, + "num_tokens": 744403679.0, + "step": 19506 + }, + { + "epoch": 2.4814909044650806, + "ewc_loss": 0.008438617922365665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438617805950344e-05, + "grad_norm": 4.228966236114502, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8829911947250366, + "num_tokens": 744440922.0, + "step": 19507 + }, + { + "epoch": 2.481618114743671, + "ewc_loss": 0.008473426103591919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473425987176597e-05, + "grad_norm": 4.196898460388184, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8793888688087463, + "num_tokens": 744479025.0, + "step": 19508 + }, + { + "epoch": 2.4817453250222616, + "ewc_loss": 0.008465119637548923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.465119753964245e-05, + "grad_norm": 4.257513046264648, + "learning_rate": 1e-06, + "loss": 0.323, + 
"mean_token_accuracy": 0.8850678205490112, + "num_tokens": 744514165.0, + "step": 19509 + }, + { + "epoch": 2.481872535300852, + "ewc_loss": 0.008479777723550797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479777898173779e-05, + "grad_norm": 4.238260269165039, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8792141675949097, + "num_tokens": 744547210.0, + "step": 19510 + }, + { + "epoch": 2.4819997455794427, + "ewc_loss": 0.008460704237222672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460704702883959e-05, + "grad_norm": 4.203880310058594, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8823216557502747, + "num_tokens": 744585132.0, + "step": 19511 + }, + { + "epoch": 2.4821269558580332, + "ewc_loss": 0.008458435535430908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.458435331704095e-05, + "grad_norm": 4.209933757781982, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8926759958267212, + "num_tokens": 744617921.0, + "step": 19512 + }, + { + "epoch": 2.4822541661366238, + "ewc_loss": 0.008508524857461452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.508524479111657e-05, + "grad_norm": 4.2886505126953125, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8760685324668884, + "num_tokens": 744653095.0, + "step": 19513 + }, + { + "epoch": 2.4823813764152143, + "ewc_loss": 0.008519024588167667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519024413544685e-05, + "grad_norm": 4.178521156311035, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8878564834594727, + "num_tokens": 744688258.0, + "step": 19514 + }, + { + "epoch": 2.482508586693805, + "ewc_loss": 0.00845553632825613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455536590190604e-05, + "grad_norm": 4.243256092071533, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8821167349815369, + "num_tokens": 744723500.0, + "step": 19515 + }, + { + "epoch": 
2.4826357969723953, + "ewc_loss": 0.008532597683370113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532597712473944e-05, + "grad_norm": 4.213985919952393, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8848110437393188, + "num_tokens": 744757133.0, + "step": 19516 + }, + { + "epoch": 2.482763007250986, + "ewc_loss": 0.008530200459063053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53020028444007e-05, + "grad_norm": 4.2130656242370605, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8958970308303833, + "num_tokens": 744793814.0, + "step": 19517 + }, + { + "epoch": 2.4828902175295764, + "ewc_loss": 0.008525380864739418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525380690116435e-05, + "grad_norm": 4.217534065246582, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8835173845291138, + "num_tokens": 744828680.0, + "step": 19518 + }, + { + "epoch": 2.483017427808167, + "ewc_loss": 0.008540535345673561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540535782231018e-05, + "grad_norm": 4.235767364501953, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.87775719165802, + "num_tokens": 744861642.0, + "step": 19519 + }, + { + "epoch": 2.4831446380867574, + "ewc_loss": 0.008583267219364643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583267481299117e-05, + "grad_norm": 4.226840496063232, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8942505121231079, + "num_tokens": 744900345.0, + "step": 19520 + }, + { + "epoch": 2.483271848365348, + "ewc_loss": 0.008560833521187305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560833521187305e-05, + "grad_norm": 4.186061382293701, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8886107206344604, + "num_tokens": 744938589.0, + "step": 19521 + }, + { + "epoch": 2.4833990586439385, + "ewc_loss": 0.008544588461518288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.544588490622118e-05, + "grad_norm": 4.259494304656982, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8738389015197754, + "num_tokens": 744975385.0, + "step": 19522 + }, + { + "epoch": 2.483526268922529, + "ewc_loss": 0.008605201728641987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.605201583122835e-05, + "grad_norm": 4.227555751800537, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8738205432891846, + "num_tokens": 745010860.0, + "step": 19523 + }, + { + "epoch": 2.4836534792011196, + "ewc_loss": 0.008547171950340271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547172183170915e-05, + "grad_norm": 4.218817710876465, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8760074377059937, + "num_tokens": 745047183.0, + "step": 19524 + }, + { + "epoch": 2.48378068947971, + "ewc_loss": 0.008559425361454487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559425623388961e-05, + "grad_norm": 4.231119155883789, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8808580636978149, + "num_tokens": 745081906.0, + "step": 19525 + }, + { + "epoch": 2.4839078997583006, + "ewc_loss": 0.008585255593061447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585255272919312e-05, + "grad_norm": 4.162449836730957, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8872511386871338, + "num_tokens": 745127285.0, + "step": 19526 + }, + { + "epoch": 2.484035110036891, + "ewc_loss": 0.008520795963704586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520796109223738e-05, + "grad_norm": 4.1524434089660645, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8970954418182373, + "num_tokens": 745171564.0, + "step": 19527 + }, + { + "epoch": 2.4841623203154817, + "ewc_loss": 0.008536610752344131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536610403098166e-05, + "grad_norm": 4.143106937408447, + "learning_rate": 1e-06, + "loss": 0.3151, + 
"mean_token_accuracy": 0.8909781575202942, + "num_tokens": 745214252.0, + "step": 19528 + }, + { + "epoch": 2.484289530594072, + "ewc_loss": 0.008514461107552052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514460932929069e-05, + "grad_norm": 4.2354960441589355, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.884804904460907, + "num_tokens": 745251983.0, + "step": 19529 + }, + { + "epoch": 2.4844167408726623, + "ewc_loss": 0.008574692532420158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574692765250802e-05, + "grad_norm": 4.254146575927734, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8832167387008667, + "num_tokens": 745287037.0, + "step": 19530 + }, + { + "epoch": 2.4845439511512533, + "ewc_loss": 0.008524171076714993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524170698365197e-05, + "grad_norm": 4.199019908905029, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.880282998085022, + "num_tokens": 745327854.0, + "step": 19531 + }, + { + "epoch": 2.4846711614298433, + "ewc_loss": 0.008480209857225418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480210090056062e-05, + "grad_norm": 4.196443557739258, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8736530542373657, + "num_tokens": 745370227.0, + "step": 19532 + }, + { + "epoch": 2.484798371708434, + "ewc_loss": 0.008511769585311413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511769556207582e-05, + "grad_norm": 4.1932573318481445, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8821811079978943, + "num_tokens": 745409416.0, + "step": 19533 + }, + { + "epoch": 2.4849255819870244, + "ewc_loss": 0.008498878218233585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498878014506772e-05, + "grad_norm": 4.276573657989502, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8660746812820435, + "num_tokens": 745449908.0, + "step": 19534 + }, + { + "epoch": 
2.485052792265615, + "ewc_loss": 0.008522567339241505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52256707730703e-05, + "grad_norm": 4.240625381469727, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8721829056739807, + "num_tokens": 745486384.0, + "step": 19535 + }, + { + "epoch": 2.4851800025442055, + "ewc_loss": 0.00846971943974495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469719614367932e-05, + "grad_norm": 4.2168803215026855, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8716021776199341, + "num_tokens": 745523058.0, + "step": 19536 + }, + { + "epoch": 2.485307212822796, + "ewc_loss": 0.008478195406496525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.478195377392694e-05, + "grad_norm": 4.238266468048096, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.86708664894104, + "num_tokens": 745559276.0, + "step": 19537 + }, + { + "epoch": 2.4854344231013865, + "ewc_loss": 0.008491521701216698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491521293763071e-05, + "grad_norm": 4.2479376792907715, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8810659646987915, + "num_tokens": 745592726.0, + "step": 19538 + }, + { + "epoch": 2.485561633379977, + "ewc_loss": 0.00851061474531889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510614861734211e-05, + "grad_norm": 4.231929779052734, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8840453624725342, + "num_tokens": 745630117.0, + "step": 19539 + }, + { + "epoch": 2.4856888436585676, + "ewc_loss": 0.008497378788888454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.497378439642489e-05, + "grad_norm": 4.2398552894592285, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8719110488891602, + "num_tokens": 745664968.0, + "step": 19540 + }, + { + "epoch": 2.485816053937158, + "ewc_loss": 0.008521344512701035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.521344716427848e-05, + "grad_norm": 4.199194431304932, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8841578364372253, + "num_tokens": 745701893.0, + "step": 19541 + }, + { + "epoch": 2.4859432642157486, + "ewc_loss": 0.008500242605805397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500242984155193e-05, + "grad_norm": 4.258159637451172, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8811697959899902, + "num_tokens": 745736884.0, + "step": 19542 + }, + { + "epoch": 2.486070474494339, + "ewc_loss": 0.008556109853088856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.556109969504178e-05, + "grad_norm": 4.147112846374512, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8948960304260254, + "num_tokens": 745782381.0, + "step": 19543 + }, + { + "epoch": 2.4861976847729297, + "ewc_loss": 0.008470197208225727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470196917187423e-05, + "grad_norm": 4.14284610748291, + "learning_rate": 1e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.8982203602790833, + "num_tokens": 745827630.0, + "step": 19544 + }, + { + "epoch": 2.48632489505152, + "ewc_loss": 0.008514500223100185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514500223100185e-05, + "grad_norm": 4.26733922958374, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8888224363327026, + "num_tokens": 745862223.0, + "step": 19545 + }, + { + "epoch": 2.4864521053301107, + "ewc_loss": 0.008572814986109734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572814840590581e-05, + "grad_norm": 4.164079189300537, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8982778787612915, + "num_tokens": 745903077.0, + "step": 19546 + }, + { + "epoch": 2.4865793156087013, + "ewc_loss": 0.008460119366645813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460119715891778e-05, + "grad_norm": 4.166538238525391, + "learning_rate": 1e-06, + "loss": 0.3312, + 
"mean_token_accuracy": 0.8829816579818726, + "num_tokens": 745945534.0, + "step": 19547 + }, + { + "epoch": 2.486706525887292, + "ewc_loss": 0.008496986702084541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.496986265527084e-05, + "grad_norm": 4.232089042663574, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8667756915092468, + "num_tokens": 745986308.0, + "step": 19548 + }, + { + "epoch": 2.4868337361658823, + "ewc_loss": 0.008516701869666576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516701927874237e-05, + "grad_norm": 4.22957706451416, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8735180497169495, + "num_tokens": 746022206.0, + "step": 19549 + }, + { + "epoch": 2.486960946444473, + "ewc_loss": 0.008490452542901039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490452455589548e-05, + "grad_norm": 4.160897254943848, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.890000581741333, + "num_tokens": 746059854.0, + "step": 19550 + }, + { + "epoch": 2.4870881567230634, + "ewc_loss": 0.008456196635961533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.456196519546211e-05, + "grad_norm": 4.133713245391846, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8906623721122742, + "num_tokens": 746103994.0, + "step": 19551 + }, + { + "epoch": 2.487215367001654, + "ewc_loss": 0.008467397652566433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467397856293246e-05, + "grad_norm": 4.265442848205566, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8740119934082031, + "num_tokens": 746138491.0, + "step": 19552 + }, + { + "epoch": 2.4873425772802444, + "ewc_loss": 0.00854416936635971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544169395463541e-05, + "grad_norm": 4.24156379699707, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.880165696144104, + "num_tokens": 746172598.0, + "step": 19553 + }, + { + "epoch": 
2.487469787558835, + "ewc_loss": 0.008475270122289658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.475270442431793e-05, + "grad_norm": 4.1790266036987305, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8882307410240173, + "num_tokens": 746210385.0, + "step": 19554 + }, + { + "epoch": 2.487596997837425, + "ewc_loss": 0.008463525213301182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.463525591650978e-05, + "grad_norm": 4.155875205993652, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8832463026046753, + "num_tokens": 746252563.0, + "step": 19555 + }, + { + "epoch": 2.487724208116016, + "ewc_loss": 0.00847641471773386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476414950564504e-05, + "grad_norm": 4.1600422859191895, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8875697255134583, + "num_tokens": 746292356.0, + "step": 19556 + }, + { + "epoch": 2.487851418394606, + "ewc_loss": 0.008474557660520077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474557398585603e-05, + "grad_norm": 4.280426502227783, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8736595511436462, + "num_tokens": 746325467.0, + "step": 19557 + }, + { + "epoch": 2.4879786286731966, + "ewc_loss": 0.008545887656509876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545887976652011e-05, + "grad_norm": 4.176809310913086, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.877688467502594, + "num_tokens": 746362285.0, + "step": 19558 + }, + { + "epoch": 2.488105838951787, + "ewc_loss": 0.008442778140306473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442778198514134e-05, + "grad_norm": 4.201700687408447, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.888165295124054, + "num_tokens": 746401540.0, + "step": 19559 + }, + { + "epoch": 2.4882330492303777, + "ewc_loss": 0.00848187506198883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.481874829158187e-05, + "grad_norm": 4.162962436676025, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8842799663543701, + "num_tokens": 746439282.0, + "step": 19560 + }, + { + "epoch": 2.488360259508968, + "ewc_loss": 0.008463900536298752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.463900303468108e-05, + "grad_norm": 4.1229166984558105, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8935933113098145, + "num_tokens": 746480993.0, + "step": 19561 + }, + { + "epoch": 2.4884874697875587, + "ewc_loss": 0.008451873436570168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451873873127624e-05, + "grad_norm": 4.20404052734375, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8801500797271729, + "num_tokens": 746519318.0, + "step": 19562 + }, + { + "epoch": 2.4886146800661493, + "ewc_loss": 0.008512374013662338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512374188285321e-05, + "grad_norm": 4.171945571899414, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8724761009216309, + "num_tokens": 746559761.0, + "step": 19563 + }, + { + "epoch": 2.48874189034474, + "ewc_loss": 0.008466595783829689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.466595318168402e-05, + "grad_norm": 4.272971153259277, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8901153802871704, + "num_tokens": 746591622.0, + "step": 19564 + }, + { + "epoch": 2.4888691006233303, + "ewc_loss": 0.008519242517650127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51924269227311e-05, + "grad_norm": 4.183526992797852, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8811138868331909, + "num_tokens": 746627709.0, + "step": 19565 + }, + { + "epoch": 2.488996310901921, + "ewc_loss": 0.008454308845102787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45430840854533e-05, + "grad_norm": 4.209167957305908, + "learning_rate": 1e-06, + "loss": 0.355, + 
"mean_token_accuracy": 0.8783401250839233, + "num_tokens": 746663445.0, + "step": 19566 + }, + { + "epoch": 2.4891235211805114, + "ewc_loss": 0.008484783582389355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.484783757012337e-05, + "grad_norm": 4.176048755645752, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8707681894302368, + "num_tokens": 746706869.0, + "step": 19567 + }, + { + "epoch": 2.489250731459102, + "ewc_loss": 0.00848227459937334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482274279231206e-05, + "grad_norm": 4.172354698181152, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8938801884651184, + "num_tokens": 746749502.0, + "step": 19568 + }, + { + "epoch": 2.4893779417376924, + "ewc_loss": 0.008478231728076935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.478231757180765e-05, + "grad_norm": 4.2102370262146, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8847764730453491, + "num_tokens": 746786510.0, + "step": 19569 + }, + { + "epoch": 2.489505152016283, + "ewc_loss": 0.00847714301198721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477143273921683e-05, + "grad_norm": 4.241476535797119, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8869823217391968, + "num_tokens": 746819430.0, + "step": 19570 + }, + { + "epoch": 2.4896323622948735, + "ewc_loss": 0.008489678613841534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489678293699399e-05, + "grad_norm": 4.221830368041992, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8860809803009033, + "num_tokens": 746856018.0, + "step": 19571 + }, + { + "epoch": 2.489759572573464, + "ewc_loss": 0.008470946922898293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470947068417445e-05, + "grad_norm": 4.194276809692383, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8780303001403809, + "num_tokens": 746897350.0, + "step": 19572 + }, + { + "epoch": 
2.4898867828520546, + "ewc_loss": 0.008465434424579144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46543480292894e-05, + "grad_norm": 4.250182628631592, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8767756223678589, + "num_tokens": 746937163.0, + "step": 19573 + }, + { + "epoch": 2.490013993130645, + "ewc_loss": 0.008499467745423317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499468094669282e-05, + "grad_norm": 4.137806415557861, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8913452625274658, + "num_tokens": 746976140.0, + "step": 19574 + }, + { + "epoch": 2.4901412034092356, + "ewc_loss": 0.008409388363361359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.409388101426885e-05, + "grad_norm": 4.197978973388672, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8836283087730408, + "num_tokens": 747012576.0, + "step": 19575 + }, + { + "epoch": 2.490268413687826, + "ewc_loss": 0.008488236926496029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.488236926496029e-05, + "grad_norm": 4.197040557861328, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8812217712402344, + "num_tokens": 747048179.0, + "step": 19576 + }, + { + "epoch": 2.4903956239664167, + "ewc_loss": 0.008463818579912186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46381881274283e-05, + "grad_norm": 4.226674556732178, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8646849989891052, + "num_tokens": 747085291.0, + "step": 19577 + }, + { + "epoch": 2.4905228342450068, + "ewc_loss": 0.008486359380185604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48635972943157e-05, + "grad_norm": 4.197870254516602, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.888495922088623, + "num_tokens": 747119085.0, + "step": 19578 + }, + { + "epoch": 2.4906500445235977, + "ewc_loss": 0.008472459390759468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.472459012409672e-05, + "grad_norm": 4.189159393310547, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8805629014968872, + "num_tokens": 747153463.0, + "step": 19579 + }, + { + "epoch": 2.490777254802188, + "ewc_loss": 0.008487238548696041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.487238665111363e-05, + "grad_norm": 4.236791133880615, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8592294454574585, + "num_tokens": 747194781.0, + "step": 19580 + }, + { + "epoch": 2.4909044650807783, + "ewc_loss": 0.00849416758865118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49416755954735e-05, + "grad_norm": 4.175510406494141, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8845710754394531, + "num_tokens": 747235328.0, + "step": 19581 + }, + { + "epoch": 2.491031675359369, + "ewc_loss": 0.00844030175358057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.440302190138027e-05, + "grad_norm": 4.162741184234619, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8813591003417969, + "num_tokens": 747278481.0, + "step": 19582 + }, + { + "epoch": 2.4911588856379594, + "ewc_loss": 0.008495127782225609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495127985952422e-05, + "grad_norm": 4.185450077056885, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.8957655429840088, + "num_tokens": 747313077.0, + "step": 19583 + }, + { + "epoch": 2.49128609591655, + "ewc_loss": 0.0084999930113554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499993418809026e-05, + "grad_norm": 4.175547122955322, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8893648386001587, + "num_tokens": 747352693.0, + "step": 19584 + }, + { + "epoch": 2.4914133061951405, + "ewc_loss": 0.008483093231916428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483093552058563e-05, + "grad_norm": 4.159635543823242, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 
0.8744202852249146, + "num_tokens": 747397717.0, + "step": 19585 + }, + { + "epoch": 2.491540516473731, + "ewc_loss": 0.008474770002067089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474769856547937e-05, + "grad_norm": 4.236679553985596, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8863819241523743, + "num_tokens": 747435457.0, + "step": 19586 + }, + { + "epoch": 2.4916677267523215, + "ewc_loss": 0.008500155992805958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500156400259584e-05, + "grad_norm": 4.145438194274902, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8858941793441772, + "num_tokens": 747479017.0, + "step": 19587 + }, + { + "epoch": 2.491794937030912, + "ewc_loss": 0.008442074060440063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442073885817081e-05, + "grad_norm": 4.216506004333496, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8825017213821411, + "num_tokens": 747520247.0, + "step": 19588 + }, + { + "epoch": 2.4919221473095026, + "ewc_loss": 0.008518370799720287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518371032550931e-05, + "grad_norm": 4.240323543548584, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8699314594268799, + "num_tokens": 747559983.0, + "step": 19589 + }, + { + "epoch": 2.492049357588093, + "ewc_loss": 0.008473622612655163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47362243803218e-05, + "grad_norm": 4.2158732414245605, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8790065050125122, + "num_tokens": 747595746.0, + "step": 19590 + }, + { + "epoch": 2.4921765678666836, + "ewc_loss": 0.008465924300253391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.465924474876374e-05, + "grad_norm": 4.149750709533691, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8814991116523743, + "num_tokens": 747639261.0, + "step": 19591 + }, + { + "epoch": 2.492303778145274, + "ewc_loss": 
0.008439426310360432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43942616484128e-05, + "grad_norm": 4.312089920043945, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8654553294181824, + "num_tokens": 747680665.0, + "step": 19592 + }, + { + "epoch": 2.4924309884238647, + "ewc_loss": 0.008549479767680168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549480116926134e-05, + "grad_norm": 4.2595343589782715, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.88572096824646, + "num_tokens": 747713320.0, + "step": 19593 + }, + { + "epoch": 2.492558198702455, + "ewc_loss": 0.008454188704490662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454188355244696e-05, + "grad_norm": 4.2474260330200195, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8573492765426636, + "num_tokens": 747752167.0, + "step": 19594 + }, + { + "epoch": 2.4926854089810457, + "ewc_loss": 0.008470410481095314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470410102745518e-05, + "grad_norm": 4.183608055114746, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8875395059585571, + "num_tokens": 747787274.0, + "step": 19595 + }, + { + "epoch": 2.4928126192596363, + "ewc_loss": 0.008439562283456326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439562225248665e-05, + "grad_norm": 4.175947666168213, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8776179552078247, + "num_tokens": 747826647.0, + "step": 19596 + }, + { + "epoch": 2.492939829538227, + "ewc_loss": 0.00847900751978159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479007374262437e-05, + "grad_norm": 4.202727794647217, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8789819478988647, + "num_tokens": 747866797.0, + "step": 19597 + }, + { + "epoch": 2.4930670398168173, + "ewc_loss": 0.00849419180303812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.494191570207477e-05, + "grad_norm": 
4.1864495277404785, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.882904589176178, + "num_tokens": 747907589.0, + "step": 19598 + }, + { + "epoch": 2.493194250095408, + "ewc_loss": 0.008458586409687996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45858667162247e-05, + "grad_norm": 4.239960193634033, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8758978247642517, + "num_tokens": 747939865.0, + "step": 19599 + }, + { + "epoch": 2.4933214603739984, + "ewc_loss": 0.008518939837813377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51894001243636e-05, + "grad_norm": 4.265539169311523, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8783864378929138, + "num_tokens": 747976238.0, + "step": 19600 + }, + { + "epoch": 2.493448670652589, + "ewc_loss": 0.008507669903337955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.507670281687751e-05, + "grad_norm": 4.15348482131958, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8812434673309326, + "num_tokens": 748012076.0, + "step": 19601 + }, + { + "epoch": 2.4935758809311794, + "ewc_loss": 0.008466732688248158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46673283376731e-05, + "grad_norm": 4.32487154006958, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8763432502746582, + "num_tokens": 748042518.0, + "step": 19602 + }, + { + "epoch": 2.4937030912097695, + "ewc_loss": 0.008596789091825485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.596789120929316e-05, + "grad_norm": 4.1922688484191895, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8891808986663818, + "num_tokens": 748077509.0, + "step": 19603 + }, + { + "epoch": 2.4938303014883605, + "ewc_loss": 0.008476261980831623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476262155454606e-05, + "grad_norm": 4.162402153015137, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8853325247764587, + "num_tokens": 
748118164.0, + "step": 19604 + }, + { + "epoch": 2.4939575117669506, + "ewc_loss": 0.008485764265060425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485763828502968e-05, + "grad_norm": 4.2053117752075195, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.875907301902771, + "num_tokens": 748153609.0, + "step": 19605 + }, + { + "epoch": 2.494084722045541, + "ewc_loss": 0.00854455679655075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544556476408616e-05, + "grad_norm": 4.19582462310791, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8790132403373718, + "num_tokens": 748189573.0, + "step": 19606 + }, + { + "epoch": 2.4942119323241316, + "ewc_loss": 0.008523778058588505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523777796654031e-05, + "grad_norm": 4.189453601837158, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8762208819389343, + "num_tokens": 748231121.0, + "step": 19607 + }, + { + "epoch": 2.494339142602722, + "ewc_loss": 0.008512021973729134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512022031936795e-05, + "grad_norm": 4.1620354652404785, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8925209045410156, + "num_tokens": 748268744.0, + "step": 19608 + }, + { + "epoch": 2.4944663528813127, + "ewc_loss": 0.008507094345986843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.507094025844708e-05, + "grad_norm": 4.222395896911621, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8864763975143433, + "num_tokens": 748307441.0, + "step": 19609 + }, + { + "epoch": 2.494593563159903, + "ewc_loss": 0.008535372093319893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535372035112232e-05, + "grad_norm": 4.185816287994385, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8816483020782471, + "num_tokens": 748343506.0, + "step": 19610 + }, + { + "epoch": 2.4947207734384937, + "ewc_loss": 0.008495119400322437, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495119254803285e-05, + "grad_norm": 4.16806697845459, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8829644918441772, + "num_tokens": 748382209.0, + "step": 19611 + }, + { + "epoch": 2.4948479837170843, + "ewc_loss": 0.008485936559736729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485936996294186e-05, + "grad_norm": 4.204870223999023, + "learning_rate": 1e-06, + "loss": 0.2789, + "mean_token_accuracy": 0.9006805419921875, + "num_tokens": 748413660.0, + "step": 19612 + }, + { + "epoch": 2.494975193995675, + "ewc_loss": 0.0085190050303936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519005496054888e-05, + "grad_norm": 4.1660284996032715, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8833863139152527, + "num_tokens": 748457037.0, + "step": 19613 + }, + { + "epoch": 2.4951024042742653, + "ewc_loss": 0.008491626009345055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491626067552716e-05, + "grad_norm": 4.1929402351379395, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8619785904884338, + "num_tokens": 748497227.0, + "step": 19614 + }, + { + "epoch": 2.495229614552856, + "ewc_loss": 0.008518001064658165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51800141390413e-05, + "grad_norm": 4.240627765655518, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8797562718391418, + "num_tokens": 748529901.0, + "step": 19615 + }, + { + "epoch": 2.4953568248314464, + "ewc_loss": 0.00852220505475998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522204734617844e-05, + "grad_norm": 4.200601577758789, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8712310791015625, + "num_tokens": 748568238.0, + "step": 19616 + }, + { + "epoch": 2.495484035110037, + "ewc_loss": 0.00850826408714056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.508263999829069e-05, + "grad_norm": 4.208016395568848, + "learning_rate": 
1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8719799518585205, + "num_tokens": 748608153.0, + "step": 19617 + }, + { + "epoch": 2.4956112453886274, + "ewc_loss": 0.008526542223989964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52654266054742e-05, + "grad_norm": 4.204753398895264, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8782082200050354, + "num_tokens": 748646977.0, + "step": 19618 + }, + { + "epoch": 2.495738455667218, + "ewc_loss": 0.008512457832694054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512457861797884e-05, + "grad_norm": 4.17413330078125, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8734267950057983, + "num_tokens": 748689586.0, + "step": 19619 + }, + { + "epoch": 2.4958656659458085, + "ewc_loss": 0.008518034592270851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518034883309156e-05, + "grad_norm": 4.187759876251221, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8833698034286499, + "num_tokens": 748730915.0, + "step": 19620 + }, + { + "epoch": 2.495992876224399, + "ewc_loss": 0.00851783249527216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517832611687481e-05, + "grad_norm": 4.236153602600098, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8774954080581665, + "num_tokens": 748766996.0, + "step": 19621 + }, + { + "epoch": 2.4961200865029896, + "ewc_loss": 0.008548012934625149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548012556275353e-05, + "grad_norm": 4.248967170715332, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8751853108406067, + "num_tokens": 748801225.0, + "step": 19622 + }, + { + "epoch": 2.49624729678158, + "ewc_loss": 0.008533176966011524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533176878700033e-05, + "grad_norm": 4.146695613861084, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.877068281173706, + "num_tokens": 748841355.0, + "step": 19623 + }, + { 
+ "epoch": 2.4963745070601706, + "ewc_loss": 0.008496175520122051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.496175723848864e-05, + "grad_norm": 4.194601535797119, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8798118829727173, + "num_tokens": 748879636.0, + "step": 19624 + }, + { + "epoch": 2.496501717338761, + "ewc_loss": 0.008553114719688892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553114457754418e-05, + "grad_norm": 4.205968856811523, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.879389762878418, + "num_tokens": 748917795.0, + "step": 19625 + }, + { + "epoch": 2.4966289276173517, + "ewc_loss": 0.008540991693735123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540991257177666e-05, + "grad_norm": 4.253650188446045, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8575900197029114, + "num_tokens": 748956512.0, + "step": 19626 + }, + { + "epoch": 2.496756137895942, + "ewc_loss": 0.008571139536798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.571139915147796e-05, + "grad_norm": 4.180288791656494, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8860173225402832, + "num_tokens": 748995577.0, + "step": 19627 + }, + { + "epoch": 2.4968833481745323, + "ewc_loss": 0.008515260182321072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515259833075106e-05, + "grad_norm": 4.248997688293457, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8780602216720581, + "num_tokens": 749027672.0, + "step": 19628 + }, + { + "epoch": 2.4970105584531233, + "ewc_loss": 0.008578520268201828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578520646551624e-05, + "grad_norm": 4.211292266845703, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8905195593833923, + "num_tokens": 749059101.0, + "step": 19629 + }, + { + "epoch": 2.4971377687317133, + "ewc_loss": 0.008543862029910088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.543861622456461e-05, + "grad_norm": 4.219912528991699, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8675433397293091, + "num_tokens": 749097034.0, + "step": 19630 + }, + { + "epoch": 2.497264979010304, + "ewc_loss": 0.008546779863536358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54678000905551e-05, + "grad_norm": 4.145910263061523, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8790974617004395, + "num_tokens": 749137536.0, + "step": 19631 + }, + { + "epoch": 2.4973921892888944, + "ewc_loss": 0.008535193279385567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535193046554923e-05, + "grad_norm": 4.234615325927734, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8821808695793152, + "num_tokens": 749171487.0, + "step": 19632 + }, + { + "epoch": 2.497519399567485, + "ewc_loss": 0.00858739111572504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.587391494074836e-05, + "grad_norm": 4.17591667175293, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8748358488082886, + "num_tokens": 749213234.0, + "step": 19633 + }, + { + "epoch": 2.4976466098460754, + "ewc_loss": 0.008551846258342266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551846258342266e-05, + "grad_norm": 4.246760368347168, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8698375821113586, + "num_tokens": 749256184.0, + "step": 19634 + }, + { + "epoch": 2.497773820124666, + "ewc_loss": 0.00858644861727953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586448529968038e-05, + "grad_norm": 4.19711446762085, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8796523809432983, + "num_tokens": 749292909.0, + "step": 19635 + }, + { + "epoch": 2.4979010304032565, + "ewc_loss": 0.008566063828766346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.56606347952038e-05, + "grad_norm": 4.262021064758301, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 
0.8683239221572876, + "num_tokens": 749327947.0, + "step": 19636 + }, + { + "epoch": 2.498028240681847, + "ewc_loss": 0.008592586033046246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592585800215602e-05, + "grad_norm": 4.208869457244873, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8905894160270691, + "num_tokens": 749364107.0, + "step": 19637 + }, + { + "epoch": 2.4981554509604376, + "ewc_loss": 0.008550840429961681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.550839993404225e-05, + "grad_norm": 4.2218194007873535, + "learning_rate": 1e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.895808756351471, + "num_tokens": 749398055.0, + "step": 19638 + }, + { + "epoch": 2.498282661239028, + "ewc_loss": 0.00855153240263462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551531936973333e-05, + "grad_norm": 4.2142415046691895, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8811442255973816, + "num_tokens": 749434626.0, + "step": 19639 + }, + { + "epoch": 2.4984098715176186, + "ewc_loss": 0.008562548086047173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562547736801207e-05, + "grad_norm": 4.2136969566345215, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8750469088554382, + "num_tokens": 749470616.0, + "step": 19640 + }, + { + "epoch": 2.498537081796209, + "ewc_loss": 0.008575929328799248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575929678045213e-05, + "grad_norm": 4.15425968170166, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8840273022651672, + "num_tokens": 749513072.0, + "step": 19641 + }, + { + "epoch": 2.4986642920747997, + "ewc_loss": 0.008548771031200886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548770711058751e-05, + "grad_norm": 4.211570739746094, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8935712575912476, + "num_tokens": 749546988.0, + "step": 19642 + }, + { + "epoch": 2.49879150235339, + "ewc_loss": 
0.008575201034545898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575201354688033e-05, + "grad_norm": 4.191084384918213, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8765385150909424, + "num_tokens": 749585851.0, + "step": 19643 + }, + { + "epoch": 2.4989187126319807, + "ewc_loss": 0.008541506715118885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54150639497675e-05, + "grad_norm": 4.23840856552124, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8804377317428589, + "num_tokens": 749619510.0, + "step": 19644 + }, + { + "epoch": 2.4990459229105713, + "ewc_loss": 0.008577706292271614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577706466894597e-05, + "grad_norm": 4.2131757736206055, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8873400688171387, + "num_tokens": 749653660.0, + "step": 19645 + }, + { + "epoch": 2.499173133189162, + "ewc_loss": 0.008551526814699173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551526843803003e-05, + "grad_norm": 4.22048807144165, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8823349475860596, + "num_tokens": 749691321.0, + "step": 19646 + }, + { + "epoch": 2.4993003434677523, + "ewc_loss": 0.008564147166907787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.564146992284805e-05, + "grad_norm": 4.21765661239624, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.883945107460022, + "num_tokens": 749729305.0, + "step": 19647 + }, + { + "epoch": 2.499427553746343, + "ewc_loss": 0.008563501760363579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.563501614844427e-05, + "grad_norm": 4.1826348304748535, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.886934757232666, + "num_tokens": 749766134.0, + "step": 19648 + }, + { + "epoch": 2.4995547640249334, + "ewc_loss": 0.008536575362086296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536575478501618e-05, + "grad_norm": 
4.211833477020264, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8787386417388916, + "num_tokens": 749807715.0, + "step": 19649 + }, + { + "epoch": 2.499681974303524, + "ewc_loss": 0.008540027774870396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540027920389548e-05, + "grad_norm": 4.239772796630859, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8819226622581482, + "num_tokens": 749838799.0, + "step": 19650 + }, + { + "epoch": 2.499809184582114, + "ewc_loss": 0.008589071221649647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589071512687951e-05, + "grad_norm": 4.186809539794922, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8918720483779907, + "num_tokens": 749874283.0, + "step": 19651 + }, + { + "epoch": 2.499936394860705, + "ewc_loss": 0.008522170595824718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522170537617058e-05, + "grad_norm": 4.210676670074463, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8780757188796997, + "num_tokens": 749915912.0, + "step": 19652 + }, + { + "epoch": 2.500063605139295, + "ewc_loss": 0.008563811890780926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.563811570638791e-05, + "grad_norm": 4.188251495361328, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.888790488243103, + "num_tokens": 749959133.0, + "step": 19653 + }, + { + "epoch": 2.500190815417886, + "ewc_loss": 0.008541087619960308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541088027413934e-05, + "grad_norm": 4.2223100662231445, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8793718814849854, + "num_tokens": 749997846.0, + "step": 19654 + }, + { + "epoch": 2.500318025696476, + "ewc_loss": 0.008562073111534119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562073344364762e-05, + "grad_norm": 4.173618793487549, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8851029872894287, + "num_tokens": 
750040973.0, + "step": 19655 + }, + { + "epoch": 2.5004452359750666, + "ewc_loss": 0.008494543842971325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.494543726556003e-05, + "grad_norm": 4.186523914337158, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8740086555480957, + "num_tokens": 750082171.0, + "step": 19656 + }, + { + "epoch": 2.500572446253657, + "ewc_loss": 0.008528093807399273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528093894710764e-05, + "grad_norm": 4.162369251251221, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8918006420135498, + "num_tokens": 750123083.0, + "step": 19657 + }, + { + "epoch": 2.5006996565322477, + "ewc_loss": 0.008493437431752682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.493437053402886e-05, + "grad_norm": 4.211296081542969, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8674174547195435, + "num_tokens": 750162235.0, + "step": 19658 + }, + { + "epoch": 2.500826866810838, + "ewc_loss": 0.008527430705726147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.527431054972112e-05, + "grad_norm": 4.221701145172119, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.892652153968811, + "num_tokens": 750196501.0, + "step": 19659 + }, + { + "epoch": 2.5009540770894287, + "ewc_loss": 0.008519504219293594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519504626747221e-05, + "grad_norm": 4.169728755950928, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8549392223358154, + "num_tokens": 750241178.0, + "step": 19660 + }, + { + "epoch": 2.5010812873680193, + "ewc_loss": 0.008482049219310284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482049452140927e-05, + "grad_norm": 4.1661601066589355, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8866142630577087, + "num_tokens": 750284104.0, + "step": 19661 + }, + { + "epoch": 2.50120849764661, + "ewc_loss": 0.008492719382047653, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492719643982127e-05, + "grad_norm": 4.2151875495910645, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8867770433425903, + "num_tokens": 750318525.0, + "step": 19662 + }, + { + "epoch": 2.5013357079252003, + "ewc_loss": 0.0085161617025733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516162051819265e-05, + "grad_norm": 4.184291362762451, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8901355266571045, + "num_tokens": 750358892.0, + "step": 19663 + }, + { + "epoch": 2.501462918203791, + "ewc_loss": 0.008477714844048023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477715164190158e-05, + "grad_norm": 4.214230537414551, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8686505556106567, + "num_tokens": 750399325.0, + "step": 19664 + }, + { + "epoch": 2.5015901284823814, + "ewc_loss": 0.008486878126859665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4868777776137e-05, + "grad_norm": 4.215211391448975, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8750032186508179, + "num_tokens": 750437493.0, + "step": 19665 + }, + { + "epoch": 2.501717338760972, + "ewc_loss": 0.008475618436932564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47561823320575e-05, + "grad_norm": 4.179288864135742, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8884034156799316, + "num_tokens": 750472381.0, + "step": 19666 + }, + { + "epoch": 2.5018445490395624, + "ewc_loss": 0.00845879502594471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.458794764010236e-05, + "grad_norm": 4.175947189331055, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8656438589096069, + "num_tokens": 750511786.0, + "step": 19667 + }, + { + "epoch": 2.501971759318153, + "ewc_loss": 0.008482477627694607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482478006044403e-05, + "grad_norm": 4.225353717803955, + "learning_rate": 
1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8880196213722229, + "num_tokens": 750550596.0, + "step": 19668 + }, + { + "epoch": 2.5020989695967435, + "ewc_loss": 0.008486256003379822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486256410833448e-05, + "grad_norm": 4.1305108070373535, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8865166306495667, + "num_tokens": 750594790.0, + "step": 19669 + }, + { + "epoch": 2.502226179875334, + "ewc_loss": 0.008414107374846935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.414107287535444e-05, + "grad_norm": 4.13429069519043, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8935704827308655, + "num_tokens": 750636738.0, + "step": 19670 + }, + { + "epoch": 2.5023533901539246, + "ewc_loss": 0.008449489250779152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.449489541817456e-05, + "grad_norm": 4.204495906829834, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8933053016662598, + "num_tokens": 750672338.0, + "step": 19671 + }, + { + "epoch": 2.502480600432515, + "ewc_loss": 0.008457878604531288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45787872094661e-05, + "grad_norm": 4.212890148162842, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8770719766616821, + "num_tokens": 750709149.0, + "step": 19672 + }, + { + "epoch": 2.5026078107111056, + "ewc_loss": 0.008455252274870872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45525210024789e-05, + "grad_norm": 4.218230724334717, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8940688371658325, + "num_tokens": 750741727.0, + "step": 19673 + }, + { + "epoch": 2.5027350209896957, + "ewc_loss": 0.008462394587695599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462394907837734e-05, + "grad_norm": 4.276100158691406, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8702468872070312, + "num_tokens": 750775903.0, + "step": 19674 + }, 
+ { + "epoch": 2.5028622312682867, + "ewc_loss": 0.008485880680382252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485880971420556e-05, + "grad_norm": 4.2069478034973145, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8771556615829468, + "num_tokens": 750810502.0, + "step": 19675 + }, + { + "epoch": 2.5029894415468767, + "ewc_loss": 0.008435207419097424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.435207564616576e-05, + "grad_norm": 4.170046329498291, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8897950649261475, + "num_tokens": 750847762.0, + "step": 19676 + }, + { + "epoch": 2.5031166518254677, + "ewc_loss": 0.008451619185507298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451619214611128e-05, + "grad_norm": 4.169029235839844, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8837763071060181, + "num_tokens": 750887201.0, + "step": 19677 + }, + { + "epoch": 2.503243862104058, + "ewc_loss": 0.008471508510410786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47150877234526e-05, + "grad_norm": 4.1892313957214355, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.881134033203125, + "num_tokens": 750927323.0, + "step": 19678 + }, + { + "epoch": 2.5033710723826488, + "ewc_loss": 0.008471537381410599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471537148579955e-05, + "grad_norm": 4.18823766708374, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8741487264633179, + "num_tokens": 750972501.0, + "step": 19679 + }, + { + "epoch": 2.503498282661239, + "ewc_loss": 0.008481538854539394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481538679916412e-05, + "grad_norm": 4.1777167320251465, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8905070424079895, + "num_tokens": 751010743.0, + "step": 19680 + }, + { + "epoch": 2.5036254929398294, + "ewc_loss": 0.008447585627436638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.447585423709825e-05, + "grad_norm": 4.201765537261963, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8777881264686584, + "num_tokens": 751050440.0, + "step": 19681 + }, + { + "epoch": 2.50375270321842, + "ewc_loss": 0.008490709587931633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49071002448909e-05, + "grad_norm": 4.204117774963379, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8852593898773193, + "num_tokens": 751088319.0, + "step": 19682 + }, + { + "epoch": 2.5038799134970104, + "ewc_loss": 0.008472856134176254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.472856279695407e-05, + "grad_norm": 4.184377670288086, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8824062943458557, + "num_tokens": 751123766.0, + "step": 19683 + }, + { + "epoch": 2.504007123775601, + "ewc_loss": 0.008470983244478703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470983448205516e-05, + "grad_norm": 4.167012691497803, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8990832567214966, + "num_tokens": 751163375.0, + "step": 19684 + }, + { + "epoch": 2.5041343340541915, + "ewc_loss": 0.008450272493064404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450272434856743e-05, + "grad_norm": 4.170334815979004, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8750051856040955, + "num_tokens": 751204526.0, + "step": 19685 + }, + { + "epoch": 2.504261544332782, + "ewc_loss": 0.008462251164019108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462251571472734e-05, + "grad_norm": 4.233757495880127, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8867731690406799, + "num_tokens": 751245133.0, + "step": 19686 + }, + { + "epoch": 2.5043887546113726, + "ewc_loss": 0.008495639078319073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495638758176938e-05, + "grad_norm": 4.244871139526367, + "learning_rate": 1e-06, + "loss": 0.3746, + 
"mean_token_accuracy": 0.8717188239097595, + "num_tokens": 751284578.0, + "step": 19687 + }, + { + "epoch": 2.504515964889963, + "ewc_loss": 0.008467993699014187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467993757221848e-05, + "grad_norm": 4.209657669067383, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8730036020278931, + "num_tokens": 751325794.0, + "step": 19688 + }, + { + "epoch": 2.5046431751685536, + "ewc_loss": 0.008447633124887943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447633445030078e-05, + "grad_norm": 4.162344932556152, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8889033794403076, + "num_tokens": 751363777.0, + "step": 19689 + }, + { + "epoch": 2.504770385447144, + "ewc_loss": 0.008449921384453773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.449921733699739e-05, + "grad_norm": 4.239415645599365, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8722416162490845, + "num_tokens": 751400499.0, + "step": 19690 + }, + { + "epoch": 2.5048975957257347, + "ewc_loss": 0.00848828162997961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.488281309837475e-05, + "grad_norm": 4.22326135635376, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.879470705986023, + "num_tokens": 751434895.0, + "step": 19691 + }, + { + "epoch": 2.505024806004325, + "ewc_loss": 0.008460280485451221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460280514555052e-05, + "grad_norm": 4.136889457702637, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8904358148574829, + "num_tokens": 751473984.0, + "step": 19692 + }, + { + "epoch": 2.5051520162829157, + "ewc_loss": 0.00844169408082962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44169408082962e-05, + "grad_norm": 4.191965103149414, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8837375640869141, + "num_tokens": 751515030.0, + "step": 19693 + }, + { + "epoch": 
2.5052792265615063, + "ewc_loss": 0.008487613871693611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.487614104524255e-05, + "grad_norm": 4.1995062828063965, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8807225227355957, + "num_tokens": 751550023.0, + "step": 19694 + }, + { + "epoch": 2.505406436840097, + "ewc_loss": 0.008498161099851131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498161332681775e-05, + "grad_norm": 4.211026191711426, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8835998177528381, + "num_tokens": 751584793.0, + "step": 19695 + }, + { + "epoch": 2.5055336471186873, + "ewc_loss": 0.00849051121622324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490511390846223e-05, + "grad_norm": 4.2267961502075195, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.865054726600647, + "num_tokens": 751619817.0, + "step": 19696 + }, + { + "epoch": 2.505660857397278, + "ewc_loss": 0.008497475646436214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.497475937474519e-05, + "grad_norm": 4.225253105163574, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8877471685409546, + "num_tokens": 751657097.0, + "step": 19697 + }, + { + "epoch": 2.5057880676758684, + "ewc_loss": 0.008496100082993507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.496100053889677e-05, + "grad_norm": 4.2871832847595215, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8728805780410767, + "num_tokens": 751687042.0, + "step": 19698 + }, + { + "epoch": 2.5059152779544585, + "ewc_loss": 0.00855227280408144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552272629458457e-05, + "grad_norm": 4.168478012084961, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8895142078399658, + "num_tokens": 751724281.0, + "step": 19699 + }, + { + "epoch": 2.5060424882330494, + "ewc_loss": 0.00848392117768526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.483921556035057e-05, + "grad_norm": 4.218081951141357, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8642948865890503, + "num_tokens": 751765144.0, + "step": 19700 + }, + { + "epoch": 2.5061696985116395, + "ewc_loss": 0.008552009239792824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552009239792824e-05, + "grad_norm": 4.186795234680176, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8887679576873779, + "num_tokens": 751799602.0, + "step": 19701 + }, + { + "epoch": 2.5062969087902305, + "ewc_loss": 0.00853786151856184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537861140212044e-05, + "grad_norm": 4.170899391174316, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8890197277069092, + "num_tokens": 751836972.0, + "step": 19702 + }, + { + "epoch": 2.5064241190688206, + "ewc_loss": 0.008530556224286556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530556078767404e-05, + "grad_norm": 4.2296833992004395, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8834893107414246, + "num_tokens": 751869627.0, + "step": 19703 + }, + { + "epoch": 2.5065513293474115, + "ewc_loss": 0.008563871495425701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.563871233491227e-05, + "grad_norm": 4.1843414306640625, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8736436367034912, + "num_tokens": 751911103.0, + "step": 19704 + }, + { + "epoch": 2.5066785396260016, + "ewc_loss": 0.008530179969966412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53017991175875e-05, + "grad_norm": 4.2262654304504395, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8762238621711731, + "num_tokens": 751948956.0, + "step": 19705 + }, + { + "epoch": 2.506805749904592, + "ewc_loss": 0.00858781486749649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.587814954807982e-05, + "grad_norm": 4.175012111663818, + "learning_rate": 1e-06, + "loss": 0.3366, + 
"mean_token_accuracy": 0.8856856822967529, + "num_tokens": 751992185.0, + "step": 19706 + }, + { + "epoch": 2.5069329601831827, + "ewc_loss": 0.008540304377675056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540304406778887e-05, + "grad_norm": 4.212096214294434, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8794512748718262, + "num_tokens": 752032661.0, + "step": 19707 + }, + { + "epoch": 2.507060170461773, + "ewc_loss": 0.008568302728235722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.568303019274026e-05, + "grad_norm": 4.173342227935791, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8860243558883667, + "num_tokens": 752073946.0, + "step": 19708 + }, + { + "epoch": 2.5071873807403637, + "ewc_loss": 0.008532808162271976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532807987648994e-05, + "grad_norm": 4.190949440002441, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8679900169372559, + "num_tokens": 752118766.0, + "step": 19709 + }, + { + "epoch": 2.5073145910189543, + "ewc_loss": 0.008557762950658798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55776306707412e-05, + "grad_norm": 4.189804553985596, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8841144442558289, + "num_tokens": 752159669.0, + "step": 19710 + }, + { + "epoch": 2.507441801297545, + "ewc_loss": 0.008528757840394974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528757462045178e-05, + "grad_norm": 4.216566562652588, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8845322132110596, + "num_tokens": 752198139.0, + "step": 19711 + }, + { + "epoch": 2.5075690115761353, + "ewc_loss": 0.00853767804801464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537678513675928e-05, + "grad_norm": 4.164875507354736, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8841519355773926, + "num_tokens": 752239686.0, + "step": 19712 + }, + { + "epoch": 
2.507696221854726, + "ewc_loss": 0.008489489555358887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48948911880143e-05, + "grad_norm": 4.278082847595215, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8667232990264893, + "num_tokens": 752275740.0, + "step": 19713 + }, + { + "epoch": 2.5078234321333164, + "ewc_loss": 0.008580188266932964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580188296036795e-05, + "grad_norm": 4.259792804718018, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8708033561706543, + "num_tokens": 752310554.0, + "step": 19714 + }, + { + "epoch": 2.507950642411907, + "ewc_loss": 0.008519326336681843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519326365785673e-05, + "grad_norm": 4.200225830078125, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.886100172996521, + "num_tokens": 752347761.0, + "step": 19715 + }, + { + "epoch": 2.5080778526904974, + "ewc_loss": 0.008497613482177258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.497613453073427e-05, + "grad_norm": 4.219250679016113, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8733676075935364, + "num_tokens": 752387128.0, + "step": 19716 + }, + { + "epoch": 2.508205062969088, + "ewc_loss": 0.008533653803169727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533654181519523e-05, + "grad_norm": 4.293221473693848, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8812144994735718, + "num_tokens": 752428722.0, + "step": 19717 + }, + { + "epoch": 2.5083322732476785, + "ewc_loss": 0.008559907786548138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559908019378781e-05, + "grad_norm": 4.185013294219971, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8809767961502075, + "num_tokens": 752468920.0, + "step": 19718 + }, + { + "epoch": 2.508459483526269, + "ewc_loss": 0.00845311302691698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.45311296870932e-05, + "grad_norm": 4.239335060119629, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8731238842010498, + "num_tokens": 752505226.0, + "step": 19719 + }, + { + "epoch": 2.5085866938048595, + "ewc_loss": 0.008548302575945854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548302139388397e-05, + "grad_norm": 4.2079925537109375, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8830984830856323, + "num_tokens": 752543693.0, + "step": 19720 + }, + { + "epoch": 2.50871390408345, + "ewc_loss": 0.008490615524351597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490615437040105e-05, + "grad_norm": 4.236498832702637, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8688767552375793, + "num_tokens": 752578758.0, + "step": 19721 + }, + { + "epoch": 2.5088411143620406, + "ewc_loss": 0.00850549153983593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.505491132382303e-05, + "grad_norm": 4.198873996734619, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8897064328193665, + "num_tokens": 752612115.0, + "step": 19722 + }, + { + "epoch": 2.508968324640631, + "ewc_loss": 0.008473710156977177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47370974952355e-05, + "grad_norm": 4.211353778839111, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8864914178848267, + "num_tokens": 752644601.0, + "step": 19723 + }, + { + "epoch": 2.509095534919221, + "ewc_loss": 0.008514377288520336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514377259416506e-05, + "grad_norm": 4.171121597290039, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8845146894454956, + "num_tokens": 752682333.0, + "step": 19724 + }, + { + "epoch": 2.509222745197812, + "ewc_loss": 0.008511442691087723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511442865710706e-05, + "grad_norm": 4.204796314239502, + "learning_rate": 1e-06, + "loss": 0.3636, + 
"mean_token_accuracy": 0.874608039855957, + "num_tokens": 752721802.0, + "step": 19725 + }, + { + "epoch": 2.5093499554764023, + "ewc_loss": 0.008535569533705711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535569213563576e-05, + "grad_norm": 4.202380180358887, + "learning_rate": 1e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8958126306533813, + "num_tokens": 752755673.0, + "step": 19726 + }, + { + "epoch": 2.5094771657549932, + "ewc_loss": 0.008520053699612617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520053961547092e-05, + "grad_norm": 4.157693862915039, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.889700174331665, + "num_tokens": 752795575.0, + "step": 19727 + }, + { + "epoch": 2.5096043760335833, + "ewc_loss": 0.008510753512382507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510753832524642e-05, + "grad_norm": 4.181748390197754, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8958982825279236, + "num_tokens": 752833172.0, + "step": 19728 + }, + { + "epoch": 2.5097315863121743, + "ewc_loss": 0.008520768024027348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520768460584804e-05, + "grad_norm": 4.213690280914307, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8672389388084412, + "num_tokens": 752872857.0, + "step": 19729 + }, + { + "epoch": 2.5098587965907644, + "ewc_loss": 0.008532045409083366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532045467291027e-05, + "grad_norm": 4.222912788391113, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8904263973236084, + "num_tokens": 752908553.0, + "step": 19730 + }, + { + "epoch": 2.509986006869355, + "ewc_loss": 0.008522030897438526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522030839230865e-05, + "grad_norm": 4.273782253265381, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8864515423774719, + "num_tokens": 752937762.0, + "step": 19731 + }, + { + "epoch": 
2.5101132171479454, + "ewc_loss": 0.008558073081076145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558073022868484e-05, + "grad_norm": 4.196771621704102, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8669928312301636, + "num_tokens": 752977184.0, + "step": 19732 + }, + { + "epoch": 2.510240427426536, + "ewc_loss": 0.008489593863487244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489593892591074e-05, + "grad_norm": 4.169305801391602, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8786177039146423, + "num_tokens": 753021071.0, + "step": 19733 + }, + { + "epoch": 2.5103676377051265, + "ewc_loss": 0.0085128890350461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512889326084405e-05, + "grad_norm": 4.281432628631592, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8572884798049927, + "num_tokens": 753056597.0, + "step": 19734 + }, + { + "epoch": 2.510494847983717, + "ewc_loss": 0.00859517976641655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595179679105058e-05, + "grad_norm": 4.244320392608643, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8850741386413574, + "num_tokens": 753096178.0, + "step": 19735 + }, + { + "epoch": 2.5106220582623076, + "ewc_loss": 0.008510706946253777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510707266395912e-05, + "grad_norm": 4.222415447235107, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8845691680908203, + "num_tokens": 753130388.0, + "step": 19736 + }, + { + "epoch": 2.510749268540898, + "ewc_loss": 0.008527847006917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.527847239747643e-05, + "grad_norm": 4.203585147857666, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8891960382461548, + "num_tokens": 753166585.0, + "step": 19737 + }, + { + "epoch": 2.5108764788194886, + "ewc_loss": 0.008516018278896809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516017987858504e-05, 
+ "grad_norm": 4.210127830505371, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8903229832649231, + "num_tokens": 753203403.0, + "step": 19738 + }, + { + "epoch": 2.511003689098079, + "ewc_loss": 0.008541726507246494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541726128896698e-05, + "grad_norm": 4.288736820220947, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8851500153541565, + "num_tokens": 753241632.0, + "step": 19739 + }, + { + "epoch": 2.5111308993766697, + "ewc_loss": 0.0085663553327322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566355245420709e-05, + "grad_norm": 4.1667351722717285, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8939003348350525, + "num_tokens": 753279292.0, + "step": 19740 + }, + { + "epoch": 2.51125810965526, + "ewc_loss": 0.008477224968373775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477224764646962e-05, + "grad_norm": 4.235231399536133, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8729982972145081, + "num_tokens": 753319121.0, + "step": 19741 + }, + { + "epoch": 2.5113853199338507, + "ewc_loss": 0.008552642539143562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552642975701019e-05, + "grad_norm": 4.283419609069824, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8653170466423035, + "num_tokens": 753353074.0, + "step": 19742 + }, + { + "epoch": 2.5115125302124413, + "ewc_loss": 0.00855469610542059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.554696250939742e-05, + "grad_norm": 4.176557540893555, + "learning_rate": 1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8983259201049805, + "num_tokens": 753388994.0, + "step": 19743 + }, + { + "epoch": 2.511639740491032, + "ewc_loss": 0.008473911322653294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473911293549463e-05, + "grad_norm": 4.134318828582764, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8871036767959595, 
+ "num_tokens": 753430271.0, + "step": 19744 + }, + { + "epoch": 2.5117669507696223, + "ewc_loss": 0.008502035401761532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502035052515566e-05, + "grad_norm": 4.17271614074707, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8741627931594849, + "num_tokens": 753473820.0, + "step": 19745 + }, + { + "epoch": 2.511894161048213, + "ewc_loss": 0.008523018099367619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52301818667911e-05, + "grad_norm": 4.182785987854004, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8786082863807678, + "num_tokens": 753516397.0, + "step": 19746 + }, + { + "epoch": 2.5120213713268034, + "ewc_loss": 0.008500946685671806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500946569256485e-05, + "grad_norm": 4.176915645599365, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8900989294052124, + "num_tokens": 753557589.0, + "step": 19747 + }, + { + "epoch": 2.512148581605394, + "ewc_loss": 0.008501063100993633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501062984578311e-05, + "grad_norm": 4.265176773071289, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8888262510299683, + "num_tokens": 753593764.0, + "step": 19748 + }, + { + "epoch": 2.512275791883984, + "ewc_loss": 0.008520947769284248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520948176737875e-05, + "grad_norm": 4.20574426651001, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8703988790512085, + "num_tokens": 753635967.0, + "step": 19749 + }, + { + "epoch": 2.512403002162575, + "ewc_loss": 0.008452000096440315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45199974719435e-05, + "grad_norm": 4.1881914138793945, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8888131380081177, + "num_tokens": 753674595.0, + "step": 19750 + }, + { + "epoch": 2.512530212441165, + "ewc_loss": 0.00843176618218422, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.431766036665067e-05, + "grad_norm": 4.211000919342041, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.866426408290863, + "num_tokens": 753714118.0, + "step": 19751 + }, + { + "epoch": 2.512657422719756, + "ewc_loss": 0.00846028421074152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460284152533859e-05, + "grad_norm": 4.201148509979248, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8673807382583618, + "num_tokens": 753758139.0, + "step": 19752 + }, + { + "epoch": 2.512784632998346, + "ewc_loss": 0.00846043135970831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460431126877666e-05, + "grad_norm": 4.205291271209717, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8711549043655396, + "num_tokens": 753800373.0, + "step": 19753 + }, + { + "epoch": 2.5129118432769366, + "ewc_loss": 0.008451782166957855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451782196061686e-05, + "grad_norm": 4.27553129196167, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8766651749610901, + "num_tokens": 753832366.0, + "step": 19754 + }, + { + "epoch": 2.513039053555527, + "ewc_loss": 0.008468644693493843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.468644955428317e-05, + "grad_norm": 4.216080188751221, + "learning_rate": 1e-06, + "loss": 0.2909, + "mean_token_accuracy": 0.8992754220962524, + "num_tokens": 753866176.0, + "step": 19755 + }, + { + "epoch": 2.5131662638341177, + "ewc_loss": 0.008416110649704933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.416110358666629e-05, + "grad_norm": 4.161298751831055, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.880810022354126, + "num_tokens": 753911470.0, + "step": 19756 + }, + { + "epoch": 2.513293474112708, + "ewc_loss": 0.008404392749071121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.404393156524748e-05, + "grad_norm": 4.220689296722412, + "learning_rate": 
1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.878340482711792, + "num_tokens": 753953027.0, + "step": 19757 + }, + { + "epoch": 2.5134206843912987, + "ewc_loss": 0.008471445180475712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471445471514016e-05, + "grad_norm": 4.265403747558594, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8741850256919861, + "num_tokens": 753987992.0, + "step": 19758 + }, + { + "epoch": 2.5135478946698893, + "ewc_loss": 0.008450536988675594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450537279713899e-05, + "grad_norm": 4.229973793029785, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.886128306388855, + "num_tokens": 754026671.0, + "step": 19759 + }, + { + "epoch": 2.51367510494848, + "ewc_loss": 0.00842491164803505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.424911356996745e-05, + "grad_norm": 4.217595100402832, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8824313879013062, + "num_tokens": 754063526.0, + "step": 19760 + }, + { + "epoch": 2.5138023152270703, + "ewc_loss": 0.00843029748648405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.430297748418525e-05, + "grad_norm": 4.214340686798096, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8667808771133423, + "num_tokens": 754103361.0, + "step": 19761 + }, + { + "epoch": 2.513929525505661, + "ewc_loss": 0.008442451246082783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442451508017257e-05, + "grad_norm": 4.184699058532715, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8835424184799194, + "num_tokens": 754141969.0, + "step": 19762 + }, + { + "epoch": 2.5140567357842514, + "ewc_loss": 0.008431206457316875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.431206515524536e-05, + "grad_norm": 4.148787498474121, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8963772654533386, + "num_tokens": 754182691.0, + "step": 19763 + }, + { 
+ "epoch": 2.514183946062842, + "ewc_loss": 0.008413087576627731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413087198277935e-05, + "grad_norm": 4.210119247436523, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.881771445274353, + "num_tokens": 754219956.0, + "step": 19764 + }, + { + "epoch": 2.5143111563414324, + "ewc_loss": 0.008468042127788067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.468042506137863e-05, + "grad_norm": 4.215838432312012, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8786963820457458, + "num_tokens": 754259467.0, + "step": 19765 + }, + { + "epoch": 2.514438366620023, + "ewc_loss": 0.008452072739601135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.452072506770492e-05, + "grad_norm": 4.187290191650391, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8841198682785034, + "num_tokens": 754300344.0, + "step": 19766 + }, + { + "epoch": 2.5145655768986135, + "ewc_loss": 0.008430146612226963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43014640850015e-05, + "grad_norm": 4.148256301879883, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8805166482925415, + "num_tokens": 754342606.0, + "step": 19767 + }, + { + "epoch": 2.514692787177204, + "ewc_loss": 0.008423064835369587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.423064718954265e-05, + "grad_norm": 4.257309436798096, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8786803483963013, + "num_tokens": 754379946.0, + "step": 19768 + }, + { + "epoch": 2.5148199974557945, + "ewc_loss": 0.00849235150963068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49235148052685e-05, + "grad_norm": 4.198558330535889, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8950842618942261, + "num_tokens": 754414353.0, + "step": 19769 + }, + { + "epoch": 2.514947207734385, + "ewc_loss": 0.008420299738645554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.420299855060875e-05, + "grad_norm": 4.227463245391846, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8712093830108643, + "num_tokens": 754455506.0, + "step": 19770 + }, + { + "epoch": 2.5150744180129756, + "ewc_loss": 0.008465580642223358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.465580322081223e-05, + "grad_norm": 4.225323677062988, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8658082485198975, + "num_tokens": 754498316.0, + "step": 19771 + }, + { + "epoch": 2.5152016282915657, + "ewc_loss": 0.008441529236733913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44152964418754e-05, + "grad_norm": 4.248930931091309, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8718637228012085, + "num_tokens": 754534943.0, + "step": 19772 + }, + { + "epoch": 2.5153288385701567, + "ewc_loss": 0.008473842404782772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473842171952128e-05, + "grad_norm": 4.2154059410095215, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8799833655357361, + "num_tokens": 754572775.0, + "step": 19773 + }, + { + "epoch": 2.5154560488487467, + "ewc_loss": 0.008465743623673916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.465743303531781e-05, + "grad_norm": 4.184309005737305, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8918867111206055, + "num_tokens": 754610473.0, + "step": 19774 + }, + { + "epoch": 2.5155832591273377, + "ewc_loss": 0.008462229743599892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462229743599892e-05, + "grad_norm": 4.214883804321289, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.88148033618927, + "num_tokens": 754649527.0, + "step": 19775 + }, + { + "epoch": 2.515710469405928, + "ewc_loss": 0.00849427655339241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.494276698911563e-05, + "grad_norm": 4.166028022766113, + "learning_rate": 1e-06, + "loss": 0.3355, + 
"mean_token_accuracy": 0.8834162354469299, + "num_tokens": 754691513.0, + "step": 19776 + }, + { + "epoch": 2.5158376796845188, + "ewc_loss": 0.008455460891127586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455460920231417e-05, + "grad_norm": 4.270807266235352, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8691271543502808, + "num_tokens": 754727932.0, + "step": 19777 + }, + { + "epoch": 2.515964889963109, + "ewc_loss": 0.008541746065020561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541745773982257e-05, + "grad_norm": 4.194809436798096, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8860405683517456, + "num_tokens": 754764571.0, + "step": 19778 + }, + { + "epoch": 2.5160921002416994, + "ewc_loss": 0.00846043974161148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460439858026803e-05, + "grad_norm": 4.174291610717773, + "learning_rate": 1e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.894706130027771, + "num_tokens": 754805846.0, + "step": 19779 + }, + { + "epoch": 2.51621931052029, + "ewc_loss": 0.008475441485643387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.475441427435726e-05, + "grad_norm": 4.240804195404053, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8880304098129272, + "num_tokens": 754844016.0, + "step": 19780 + }, + { + "epoch": 2.5163465207988804, + "ewc_loss": 0.00851984042674303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519840048393235e-05, + "grad_norm": 4.166478633880615, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.881421685218811, + "num_tokens": 754886493.0, + "step": 19781 + }, + { + "epoch": 2.516473731077471, + "ewc_loss": 0.008451142348349094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451141911791638e-05, + "grad_norm": 4.2144389152526855, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8850667476654053, + "num_tokens": 754928195.0, + "step": 19782 + }, + { + "epoch": 
2.5166009413560615, + "ewc_loss": 0.008504914119839668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.504914148943499e-05, + "grad_norm": 4.226041793823242, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8910238742828369, + "num_tokens": 754958399.0, + "step": 19783 + }, + { + "epoch": 2.516728151634652, + "ewc_loss": 0.008478034287691116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.478034578729421e-05, + "grad_norm": 4.182374477386475, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8862543106079102, + "num_tokens": 754997926.0, + "step": 19784 + }, + { + "epoch": 2.5168553619132426, + "ewc_loss": 0.008466162718832493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.466162398690358e-05, + "grad_norm": 4.176438331604004, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8860762119293213, + "num_tokens": 755040577.0, + "step": 19785 + }, + { + "epoch": 2.516982572191833, + "ewc_loss": 0.008476734161376953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476734365103766e-05, + "grad_norm": 4.288074970245361, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8788245320320129, + "num_tokens": 755070458.0, + "step": 19786 + }, + { + "epoch": 2.5171097824704236, + "ewc_loss": 0.008514368906617165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51436925586313e-05, + "grad_norm": 4.18904447555542, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8623548746109009, + "num_tokens": 755117205.0, + "step": 19787 + }, + { + "epoch": 2.517236992749014, + "ewc_loss": 0.008445851504802704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.445851563010365e-05, + "grad_norm": 4.181769847869873, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8829634785652161, + "num_tokens": 755162098.0, + "step": 19788 + }, + { + "epoch": 2.5173642030276047, + "ewc_loss": 0.008495253510773182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.495253860019147e-05, + "grad_norm": 4.227632999420166, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8806246519088745, + "num_tokens": 755199655.0, + "step": 19789 + }, + { + "epoch": 2.517491413306195, + "ewc_loss": 0.008508150465786457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.508150494890288e-05, + "grad_norm": 4.215991497039795, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8947731256484985, + "num_tokens": 755234165.0, + "step": 19790 + }, + { + "epoch": 2.5176186235847857, + "ewc_loss": 0.008472856134176254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.472856279695407e-05, + "grad_norm": 4.230310440063477, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8833208680152893, + "num_tokens": 755271933.0, + "step": 19791 + }, + { + "epoch": 2.5177458338633762, + "ewc_loss": 0.008478024043142796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.478023664793e-05, + "grad_norm": 4.189830303192139, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.890328049659729, + "num_tokens": 755306903.0, + "step": 19792 + }, + { + "epoch": 2.5178730441419668, + "ewc_loss": 0.00844787247478962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447872096439824e-05, + "grad_norm": 4.176242828369141, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8891221284866333, + "num_tokens": 755347613.0, + "step": 19793 + }, + { + "epoch": 2.5180002544205573, + "ewc_loss": 0.008450161665678024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450161840301007e-05, + "grad_norm": 4.224506378173828, + "learning_rate": 1e-06, + "loss": 0.2884, + "mean_token_accuracy": 0.8995782732963562, + "num_tokens": 755380541.0, + "step": 19794 + }, + { + "epoch": 2.518127464699148, + "ewc_loss": 0.008490175940096378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490175969200209e-05, + "grad_norm": 4.170598030090332, + "learning_rate": 1e-06, + "loss": 0.3391, + 
"mean_token_accuracy": 0.8799420595169067, + "num_tokens": 755422502.0, + "step": 19795 + }, + { + "epoch": 2.5182546749777384, + "ewc_loss": 0.008450619876384735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4506202256307e-05, + "grad_norm": 4.161083221435547, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8831383585929871, + "num_tokens": 755463975.0, + "step": 19796 + }, + { + "epoch": 2.5183818852563284, + "ewc_loss": 0.008443837985396385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443837577942759e-05, + "grad_norm": 4.183244705200195, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8827874660491943, + "num_tokens": 755504873.0, + "step": 19797 + }, + { + "epoch": 2.5185090955349194, + "ewc_loss": 0.008447895757853985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44789610709995e-05, + "grad_norm": 4.228393077850342, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8581381440162659, + "num_tokens": 755545646.0, + "step": 19798 + }, + { + "epoch": 2.5186363058135095, + "ewc_loss": 0.008464105427265167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.464105485472828e-05, + "grad_norm": 4.320585250854492, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8693464398384094, + "num_tokens": 755577228.0, + "step": 19799 + }, + { + "epoch": 2.5187635160921005, + "ewc_loss": 0.008507129736244678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.507129678037018e-05, + "grad_norm": 4.167497634887695, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8912009000778198, + "num_tokens": 755621437.0, + "step": 19800 + }, + { + "epoch": 2.5188907263706906, + "ewc_loss": 0.008400743827223778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.400744263781235e-05, + "grad_norm": 4.218392372131348, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8795171976089478, + "num_tokens": 755658266.0, + "step": 19801 + }, + { + "epoch": 
2.5190179366492815, + "ewc_loss": 0.008477275259792805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477275696350262e-05, + "grad_norm": 4.2464141845703125, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8704362511634827, + "num_tokens": 755696755.0, + "step": 19802 + }, + { + "epoch": 2.5191451469278716, + "ewc_loss": 0.00845721922814846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.457219519186765e-05, + "grad_norm": 4.178910255432129, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8941695094108582, + "num_tokens": 755732434.0, + "step": 19803 + }, + { + "epoch": 2.519272357206462, + "ewc_loss": 0.008420945145189762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.420945232501253e-05, + "grad_norm": 4.225009918212891, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8864060640335083, + "num_tokens": 755768488.0, + "step": 19804 + }, + { + "epoch": 2.5193995674850527, + "ewc_loss": 0.00846935249865055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469352178508416e-05, + "grad_norm": 4.2276997566223145, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8891676068305969, + "num_tokens": 755799637.0, + "step": 19805 + }, + { + "epoch": 2.519526777763643, + "ewc_loss": 0.008454946801066399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454947237623855e-05, + "grad_norm": 4.255190849304199, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8679366111755371, + "num_tokens": 755834229.0, + "step": 19806 + }, + { + "epoch": 2.5196539880422337, + "ewc_loss": 0.008461575955152512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.461575635010377e-05, + "grad_norm": 4.179823398590088, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8854464292526245, + "num_tokens": 755871776.0, + "step": 19807 + }, + { + "epoch": 2.5197811983208243, + "ewc_loss": 0.008447665721178055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.447665459243581e-05, + "grad_norm": 4.206235885620117, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8923674821853638, + "num_tokens": 755911125.0, + "step": 19808 + }, + { + "epoch": 2.519908408599415, + "ewc_loss": 0.008481130003929138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481129771098495e-05, + "grad_norm": 4.1595354080200195, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8918807506561279, + "num_tokens": 755952204.0, + "step": 19809 + }, + { + "epoch": 2.5200356188780053, + "ewc_loss": 0.008448299951851368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448299922747537e-05, + "grad_norm": 4.181180953979492, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8725435733795166, + "num_tokens": 755996702.0, + "step": 19810 + }, + { + "epoch": 2.520162829156596, + "ewc_loss": 0.008471193723380566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471193723380566e-05, + "grad_norm": 4.1828932762146, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8970492482185364, + "num_tokens": 756033285.0, + "step": 19811 + }, + { + "epoch": 2.5202900394351864, + "ewc_loss": 0.008460366167128086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460366370854899e-05, + "grad_norm": 4.216958522796631, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8829759359359741, + "num_tokens": 756068851.0, + "step": 19812 + }, + { + "epoch": 2.520417249713777, + "ewc_loss": 0.008483384735882282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48338459036313e-05, + "grad_norm": 4.195490837097168, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.873291015625, + "num_tokens": 756111676.0, + "step": 19813 + }, + { + "epoch": 2.5205444599923674, + "ewc_loss": 0.008464967831969261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.464967686450109e-05, + "grad_norm": 4.308803558349609, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 
0.8681278228759766, + "num_tokens": 756146575.0, + "step": 19814 + }, + { + "epoch": 2.520671670270958, + "ewc_loss": 0.008525444194674492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525443990947679e-05, + "grad_norm": 4.199067115783691, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8739727735519409, + "num_tokens": 756190006.0, + "step": 19815 + }, + { + "epoch": 2.5207988805495485, + "ewc_loss": 0.008438866585493088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438866643700749e-05, + "grad_norm": 4.186820030212402, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8920009732246399, + "num_tokens": 756232828.0, + "step": 19816 + }, + { + "epoch": 2.520926090828139, + "ewc_loss": 0.008468566462397575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.468566375086084e-05, + "grad_norm": 4.214481830596924, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8883545994758606, + "num_tokens": 756273077.0, + "step": 19817 + }, + { + "epoch": 2.5210533011067295, + "ewc_loss": 0.008495070040225983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495069778291509e-05, + "grad_norm": 4.2076416015625, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8671901226043701, + "num_tokens": 756313421.0, + "step": 19818 + }, + { + "epoch": 2.52118051138532, + "ewc_loss": 0.008452306501567364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.452306065009907e-05, + "grad_norm": 4.170647621154785, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.877799391746521, + "num_tokens": 756356461.0, + "step": 19819 + }, + { + "epoch": 2.5213077216639106, + "ewc_loss": 0.008464961312711239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.464961138088256e-05, + "grad_norm": 4.312108993530273, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8782755136489868, + "num_tokens": 756388309.0, + "step": 19820 + }, + { + "epoch": 2.521434931942501, + "ewc_loss": 
0.008538289926946163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53828969411552e-05, + "grad_norm": 4.196796417236328, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.891052782535553, + "num_tokens": 756426655.0, + "step": 19821 + }, + { + "epoch": 2.521562142221091, + "ewc_loss": 0.008442780002951622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442779653705657e-05, + "grad_norm": 4.1943254470825195, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8793151378631592, + "num_tokens": 756464961.0, + "step": 19822 + }, + { + "epoch": 2.521689352499682, + "ewc_loss": 0.008500215597450733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500215335516259e-05, + "grad_norm": 4.247128486633301, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.8964861631393433, + "num_tokens": 756497338.0, + "step": 19823 + }, + { + "epoch": 2.5218165627782723, + "ewc_loss": 0.008526572957634926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5265732195694e-05, + "grad_norm": 4.224260330200195, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8879474401473999, + "num_tokens": 756535324.0, + "step": 19824 + }, + { + "epoch": 2.5219437730568632, + "ewc_loss": 0.008500002324581146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500002149958163e-05, + "grad_norm": 4.187807083129883, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8908892869949341, + "num_tokens": 756574138.0, + "step": 19825 + }, + { + "epoch": 2.5220709833354533, + "ewc_loss": 0.008491571061313152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491570770274848e-05, + "grad_norm": 4.1980061531066895, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.891166627407074, + "num_tokens": 756608967.0, + "step": 19826 + }, + { + "epoch": 2.522198193614044, + "ewc_loss": 0.008518015034496784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518015238223597e-05, + "grad_norm": 
4.215817451477051, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8634973168373108, + "num_tokens": 756653882.0, + "step": 19827 + }, + { + "epoch": 2.5223254038926344, + "ewc_loss": 0.008522188290953636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522188727511093e-05, + "grad_norm": 4.168368816375732, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8758081197738647, + "num_tokens": 756695638.0, + "step": 19828 + }, + { + "epoch": 2.522452614171225, + "ewc_loss": 0.008498634211719036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498634269926697e-05, + "grad_norm": 4.213906764984131, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8765153884887695, + "num_tokens": 756734964.0, + "step": 19829 + }, + { + "epoch": 2.5225798244498154, + "ewc_loss": 0.00851766113191843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517660899087787e-05, + "grad_norm": 4.234301567077637, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8863855600357056, + "num_tokens": 756768454.0, + "step": 19830 + }, + { + "epoch": 2.522707034728406, + "ewc_loss": 0.008506319485604763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.506319863954559e-05, + "grad_norm": 4.250154495239258, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8732407093048096, + "num_tokens": 756801696.0, + "step": 19831 + }, + { + "epoch": 2.5228342450069965, + "ewc_loss": 0.008523408323526382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52340817800723e-05, + "grad_norm": 4.209316730499268, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8828684687614441, + "num_tokens": 756839874.0, + "step": 19832 + }, + { + "epoch": 2.522961455285587, + "ewc_loss": 0.008481872268021107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481872646370903e-05, + "grad_norm": 4.165619373321533, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8708826303482056, + 
"num_tokens": 756884179.0, + "step": 19833 + }, + { + "epoch": 2.5230886655641775, + "ewc_loss": 0.008478241041302681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.478241215925664e-05, + "grad_norm": 4.2142205238342285, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8827091455459595, + "num_tokens": 756922725.0, + "step": 19834 + }, + { + "epoch": 2.523215875842768, + "ewc_loss": 0.008505485951900482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.505486039211974e-05, + "grad_norm": 4.239068508148193, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8788883686065674, + "num_tokens": 756956080.0, + "step": 19835 + }, + { + "epoch": 2.5233430861213586, + "ewc_loss": 0.008502322249114513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502322452841327e-05, + "grad_norm": 4.220077037811279, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8739612102508545, + "num_tokens": 756995137.0, + "step": 19836 + }, + { + "epoch": 2.523470296399949, + "ewc_loss": 0.008490639738738537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490639447700232e-05, + "grad_norm": 4.1759352684021, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8897570371627808, + "num_tokens": 757035300.0, + "step": 19837 + }, + { + "epoch": 2.5235975066785397, + "ewc_loss": 0.008481140248477459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481139957439154e-05, + "grad_norm": 4.316093921661377, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8600085973739624, + "num_tokens": 757068789.0, + "step": 19838 + }, + { + "epoch": 2.52372471695713, + "ewc_loss": 0.008571343496441841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.571343641960993e-05, + "grad_norm": 4.257757186889648, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8752796649932861, + "num_tokens": 757102913.0, + "step": 19839 + }, + { + "epoch": 2.5238519272357207, + "ewc_loss": 0.008476203307509422, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476203220197931e-05, + "grad_norm": 4.195115089416504, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8848868012428284, + "num_tokens": 757138932.0, + "step": 19840 + }, + { + "epoch": 2.5239791375143112, + "ewc_loss": 0.008496347814798355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.496347436448559e-05, + "grad_norm": 4.184249401092529, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8831572532653809, + "num_tokens": 757179181.0, + "step": 19841 + }, + { + "epoch": 2.5241063477929018, + "ewc_loss": 0.008513903245329857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513902866980061e-05, + "grad_norm": 4.205268859863281, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8550324440002441, + "num_tokens": 757224209.0, + "step": 19842 + }, + { + "epoch": 2.5242335580714923, + "ewc_loss": 0.008524968288838863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524968143319711e-05, + "grad_norm": 4.193459987640381, + "learning_rate": 1e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.8947367072105408, + "num_tokens": 757262398.0, + "step": 19843 + }, + { + "epoch": 2.524360768350083, + "ewc_loss": 0.008513255976140499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51325603434816e-05, + "grad_norm": 4.198968410491943, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8723448514938354, + "num_tokens": 757301744.0, + "step": 19844 + }, + { + "epoch": 2.5244879786286734, + "ewc_loss": 0.008514424785971642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514424553140998e-05, + "grad_norm": 4.288227081298828, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8872731924057007, + "num_tokens": 757338273.0, + "step": 19845 + }, + { + "epoch": 2.524615188907264, + "ewc_loss": 0.008559026755392551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559026900911704e-05, + "grad_norm": 4.164200305938721, + 
"learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8742552995681763, + "num_tokens": 757383169.0, + "step": 19846 + }, + { + "epoch": 2.524742399185854, + "ewc_loss": 0.008449235931038857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.449235610896721e-05, + "grad_norm": 4.2140278816223145, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8706089854240417, + "num_tokens": 757420998.0, + "step": 19847 + }, + { + "epoch": 2.524869609464445, + "ewc_loss": 0.008545992895960808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545992750441656e-05, + "grad_norm": 4.191763401031494, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8848860263824463, + "num_tokens": 757463281.0, + "step": 19848 + }, + { + "epoch": 2.524996819743035, + "ewc_loss": 0.008500578813254833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500578405801207e-05, + "grad_norm": 4.309254169464111, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8767130374908447, + "num_tokens": 757495022.0, + "step": 19849 + }, + { + "epoch": 2.525124030021626, + "ewc_loss": 0.00859034899622202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590349170845002e-05, + "grad_norm": 4.188270568847656, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8868807554244995, + "num_tokens": 757536801.0, + "step": 19850 + }, + { + "epoch": 2.525251240300216, + "ewc_loss": 0.008481749333441257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481749682687223e-05, + "grad_norm": 4.192175388336182, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8955093622207642, + "num_tokens": 757573419.0, + "step": 19851 + }, + { + "epoch": 2.5253784505788066, + "ewc_loss": 0.008527352474629879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.527352474629879e-05, + "grad_norm": 4.232207298278809, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8772121667861938, + "num_tokens": 757610773.0, + 
"step": 19852 + }, + { + "epoch": 2.525505660857397, + "ewc_loss": 0.008544066920876503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544066804461181e-05, + "grad_norm": 4.16835355758667, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8913202285766602, + "num_tokens": 757648067.0, + "step": 19853 + }, + { + "epoch": 2.5256328711359877, + "ewc_loss": 0.008503648452460766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.503648859914392e-05, + "grad_norm": 4.213072776794434, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8885906934738159, + "num_tokens": 757681598.0, + "step": 19854 + }, + { + "epoch": 2.525760081414578, + "ewc_loss": 0.008547955192625523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547955076210201e-05, + "grad_norm": 4.185100078582764, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8654324412345886, + "num_tokens": 757724752.0, + "step": 19855 + }, + { + "epoch": 2.5258872916931687, + "ewc_loss": 0.008527551777660847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.527551835868508e-05, + "grad_norm": 4.197484016418457, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8929812908172607, + "num_tokens": 757763801.0, + "step": 19856 + }, + { + "epoch": 2.5260145019717593, + "ewc_loss": 0.008531501516699791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531501225661486e-05, + "grad_norm": 4.245366096496582, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8816022276878357, + "num_tokens": 757799235.0, + "step": 19857 + }, + { + "epoch": 2.52614171225035, + "ewc_loss": 0.00853912252932787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539122791262344e-05, + "grad_norm": 4.167332172393799, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.886262059211731, + "num_tokens": 757839695.0, + "step": 19858 + }, + { + "epoch": 2.5262689225289403, + "ewc_loss": 0.008476370945572853, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.476371294818819e-05, + "grad_norm": 4.28321647644043, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8690125942230225, + "num_tokens": 757877152.0, + "step": 19859 + }, + { + "epoch": 2.526396132807531, + "ewc_loss": 0.008576453663408756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576453546993434e-05, + "grad_norm": 4.179475784301758, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8800323009490967, + "num_tokens": 757916173.0, + "step": 19860 + }, + { + "epoch": 2.5265233430861214, + "ewc_loss": 0.008492215536534786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492215420119464e-05, + "grad_norm": 4.188527584075928, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8855421543121338, + "num_tokens": 757957271.0, + "step": 19861 + }, + { + "epoch": 2.526650553364712, + "ewc_loss": 0.0085150683298707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515068475389853e-05, + "grad_norm": 4.242795467376709, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8754204511642456, + "num_tokens": 757995553.0, + "step": 19862 + }, + { + "epoch": 2.5267777636433024, + "ewc_loss": 0.008553522638976574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553522638976574e-05, + "grad_norm": 4.22783899307251, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8723561763763428, + "num_tokens": 758035095.0, + "step": 19863 + }, + { + "epoch": 2.526904973921893, + "ewc_loss": 0.008516989648342133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516990055795759e-05, + "grad_norm": 4.2040839195251465, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8787734508514404, + "num_tokens": 758071185.0, + "step": 19864 + }, + { + "epoch": 2.5270321842004835, + "ewc_loss": 0.008496715687215328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.496715599903837e-05, + "grad_norm": 4.189960956573486, + "learning_rate": 1e-06, + "loss": 0.3236, 
+ "mean_token_accuracy": 0.8807910084724426, + "num_tokens": 758110251.0, + "step": 19865 + }, + { + "epoch": 2.527159394479074, + "ewc_loss": 0.008521177805960178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521178096998483e-05, + "grad_norm": 4.267507553100586, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8683636784553528, + "num_tokens": 758144385.0, + "step": 19866 + }, + { + "epoch": 2.5272866047576645, + "ewc_loss": 0.008538356050848961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538355905329809e-05, + "grad_norm": 4.127150535583496, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8918521404266357, + "num_tokens": 758185935.0, + "step": 19867 + }, + { + "epoch": 2.527413815036255, + "ewc_loss": 0.008438586257398129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438586519332603e-05, + "grad_norm": 4.174898624420166, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.884639322757721, + "num_tokens": 758226688.0, + "step": 19868 + }, + { + "epoch": 2.5275410253148456, + "ewc_loss": 0.008543143048882484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54314275784418e-05, + "grad_norm": 4.264220237731934, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.887621283531189, + "num_tokens": 758263866.0, + "step": 19869 + }, + { + "epoch": 2.5276682355934357, + "ewc_loss": 0.008543964475393295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.543964213458821e-05, + "grad_norm": 4.201539039611816, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.886481761932373, + "num_tokens": 758303607.0, + "step": 19870 + }, + { + "epoch": 2.5277954458720266, + "ewc_loss": 0.008454546332359314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454546332359314e-05, + "grad_norm": 4.218351364135742, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8841584920883179, + "num_tokens": 758340120.0, + "step": 19871 + }, + { + "epoch": 
2.5279226561506167, + "ewc_loss": 0.008493592962622643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.493592758895829e-05, + "grad_norm": 4.246975898742676, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8828330039978027, + "num_tokens": 758373792.0, + "step": 19872 + }, + { + "epoch": 2.5280498664292077, + "ewc_loss": 0.008502379059791565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502379205310717e-05, + "grad_norm": 4.1797685623168945, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8682602643966675, + "num_tokens": 758418157.0, + "step": 19873 + }, + { + "epoch": 2.528177076707798, + "ewc_loss": 0.008447900414466858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447900472674519e-05, + "grad_norm": 4.222793102264404, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8744088411331177, + "num_tokens": 758454042.0, + "step": 19874 + }, + { + "epoch": 2.5283042869863888, + "ewc_loss": 0.00849975273013115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499752584611997e-05, + "grad_norm": 4.192844390869141, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8834207057952881, + "num_tokens": 758495305.0, + "step": 19875 + }, + { + "epoch": 2.528431497264979, + "ewc_loss": 0.008469140157103539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469139720546082e-05, + "grad_norm": 4.225290775299072, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8879116177558899, + "num_tokens": 758532646.0, + "step": 19876 + }, + { + "epoch": 2.5285587075435694, + "ewc_loss": 0.008484352380037308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.484352292725816e-05, + "grad_norm": 4.216314792633057, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8782659769058228, + "num_tokens": 758568329.0, + "step": 19877 + }, + { + "epoch": 2.52868591782216, + "ewc_loss": 0.008495579473674297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.495579095324501e-05, + "grad_norm": 4.220673084259033, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8847566246986389, + "num_tokens": 758608246.0, + "step": 19878 + }, + { + "epoch": 2.5288131281007504, + "ewc_loss": 0.008465348742902279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.465348946629092e-05, + "grad_norm": 4.138866901397705, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8917071223258972, + "num_tokens": 758645356.0, + "step": 19879 + }, + { + "epoch": 2.528940338379341, + "ewc_loss": 0.008450460620224476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45046088215895e-05, + "grad_norm": 4.15530252456665, + "learning_rate": 1e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8958859443664551, + "num_tokens": 758685657.0, + "step": 19880 + }, + { + "epoch": 2.5290675486579315, + "ewc_loss": 0.00850350596010685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.503506251145154e-05, + "grad_norm": 4.2748565673828125, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8865914940834045, + "num_tokens": 758718843.0, + "step": 19881 + }, + { + "epoch": 2.529194758936522, + "ewc_loss": 0.008556501008570194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55650141602382e-05, + "grad_norm": 4.201583385467529, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8849330544471741, + "num_tokens": 758757946.0, + "step": 19882 + }, + { + "epoch": 2.5293219692151125, + "ewc_loss": 0.008478227071464062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.478227391606197e-05, + "grad_norm": 4.285212993621826, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8629151582717896, + "num_tokens": 758792832.0, + "step": 19883 + }, + { + "epoch": 2.529449179493703, + "ewc_loss": 0.008555838838219643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55583930388093e-05, + "grad_norm": 4.2765421867370605, + "learning_rate": 1e-06, + "loss": 0.3594, + 
"mean_token_accuracy": 0.8753128051757812, + "num_tokens": 758826062.0, + "step": 19884 + }, + { + "epoch": 2.5295763897722936, + "ewc_loss": 0.008542819879949093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54281970532611e-05, + "grad_norm": 4.206228256225586, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.890579342842102, + "num_tokens": 758860940.0, + "step": 19885 + }, + { + "epoch": 2.529703600050884, + "ewc_loss": 0.00849562231451273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495622751070186e-05, + "grad_norm": 4.1853156089782715, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8803359270095825, + "num_tokens": 758900740.0, + "step": 19886 + }, + { + "epoch": 2.5298308103294747, + "ewc_loss": 0.008536148816347122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536149107385427e-05, + "grad_norm": 4.173449516296387, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.879321813583374, + "num_tokens": 758942786.0, + "step": 19887 + }, + { + "epoch": 2.529958020608065, + "ewc_loss": 0.008519652299582958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519652328686789e-05, + "grad_norm": 4.171324253082275, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8895192742347717, + "num_tokens": 758983376.0, + "step": 19888 + }, + { + "epoch": 2.5300852308866557, + "ewc_loss": 0.008543525822460651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.543526200810447e-05, + "grad_norm": 4.184457778930664, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.871430516242981, + "num_tokens": 759024007.0, + "step": 19889 + }, + { + "epoch": 2.5302124411652462, + "ewc_loss": 0.008548267185688019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548267214791849e-05, + "grad_norm": 4.210343837738037, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8886824250221252, + "num_tokens": 759061611.0, + "step": 19890 + }, + { + "epoch": 
2.5303396514438368, + "ewc_loss": 0.008566074073314667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566074393456802e-05, + "grad_norm": 4.229887962341309, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8678956031799316, + "num_tokens": 759101942.0, + "step": 19891 + }, + { + "epoch": 2.5304668617224273, + "ewc_loss": 0.008547556586563587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547556353732944e-05, + "grad_norm": 4.1905317306518555, + "learning_rate": 1e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.8978214859962463, + "num_tokens": 759140281.0, + "step": 19892 + }, + { + "epoch": 2.530594072001018, + "ewc_loss": 0.00852438434958458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524384611519054e-05, + "grad_norm": 4.23309850692749, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8806256055831909, + "num_tokens": 759174319.0, + "step": 19893 + }, + { + "epoch": 2.5307212822796084, + "ewc_loss": 0.00854897778481245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548977348254994e-05, + "grad_norm": 4.2433671951293945, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8895937204360962, + "num_tokens": 759208981.0, + "step": 19894 + }, + { + "epoch": 2.5308484925581984, + "ewc_loss": 0.00851829070597887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518290269421414e-05, + "grad_norm": 4.287237167358398, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8834001421928406, + "num_tokens": 759240559.0, + "step": 19895 + }, + { + "epoch": 2.5309757028367894, + "ewc_loss": 0.00855791475623846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.557915134588256e-05, + "grad_norm": 4.180455207824707, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.892807126045227, + "num_tokens": 759280550.0, + "step": 19896 + }, + { + "epoch": 2.5311029131153795, + "ewc_loss": 0.008472761139273643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.472760964650661e-05, + "grad_norm": 4.192024230957031, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8767901659011841, + "num_tokens": 759321298.0, + "step": 19897 + }, + { + "epoch": 2.5312301233939705, + "ewc_loss": 0.008496514521539211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.496514783473685e-05, + "grad_norm": 4.214913368225098, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8933916687965393, + "num_tokens": 759354627.0, + "step": 19898 + }, + { + "epoch": 2.5313573336725606, + "ewc_loss": 0.008514776825904846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514776709489524e-05, + "grad_norm": 4.308807849884033, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.877544641494751, + "num_tokens": 759388129.0, + "step": 19899 + }, + { + "epoch": 2.5314845439511515, + "ewc_loss": 0.008570319972932339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570319914724678e-05, + "grad_norm": 4.216345310211182, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8892839550971985, + "num_tokens": 759423667.0, + "step": 19900 + }, + { + "epoch": 2.5316117542297416, + "ewc_loss": 0.008458428084850311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.458428055746481e-05, + "grad_norm": 4.1462721824646, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.873387336730957, + "num_tokens": 759466586.0, + "step": 19901 + }, + { + "epoch": 2.531738964508332, + "ewc_loss": 0.008477448485791683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477448136545718e-05, + "grad_norm": 4.234800815582275, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8760607242584229, + "num_tokens": 759504214.0, + "step": 19902 + }, + { + "epoch": 2.5318661747869227, + "ewc_loss": 0.008543740026652813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.543740113964304e-05, + "grad_norm": 4.195054531097412, + "learning_rate": 1e-06, + "loss": 0.3908, + 
"mean_token_accuracy": 0.8680177927017212, + "num_tokens": 759543865.0, + "step": 19903 + }, + { + "epoch": 2.531993385065513, + "ewc_loss": 0.008492527529597282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492527558701113e-05, + "grad_norm": 4.225827693939209, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8787510395050049, + "num_tokens": 759581917.0, + "step": 19904 + }, + { + "epoch": 2.5321205953441037, + "ewc_loss": 0.00852272566407919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52272569318302e-05, + "grad_norm": 4.2286272048950195, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8844759464263916, + "num_tokens": 759617641.0, + "step": 19905 + }, + { + "epoch": 2.5322478056226942, + "ewc_loss": 0.008506685495376587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.506685117026791e-05, + "grad_norm": 4.237316608428955, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8915311694145203, + "num_tokens": 759649097.0, + "step": 19906 + }, + { + "epoch": 2.5323750159012848, + "ewc_loss": 0.008521946147084236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521945710526779e-05, + "grad_norm": 4.1895952224731445, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.877547025680542, + "num_tokens": 759688802.0, + "step": 19907 + }, + { + "epoch": 2.5325022261798753, + "ewc_loss": 0.008502623997628689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502623677486554e-05, + "grad_norm": 4.201995372772217, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8922311067581177, + "num_tokens": 759724243.0, + "step": 19908 + }, + { + "epoch": 2.532629436458466, + "ewc_loss": 0.008527593687176704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52759403642267e-05, + "grad_norm": 4.171531677246094, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8746719360351562, + "num_tokens": 759764646.0, + "step": 19909 + }, + { + "epoch": 
2.5327566467370564, + "ewc_loss": 0.008512386120855808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512385829817504e-05, + "grad_norm": 4.210119724273682, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8836185336112976, + "num_tokens": 759806939.0, + "step": 19910 + }, + { + "epoch": 2.532883857015647, + "ewc_loss": 0.00853633414953947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536333916708827e-05, + "grad_norm": 4.161353588104248, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.88498854637146, + "num_tokens": 759848138.0, + "step": 19911 + }, + { + "epoch": 2.5330110672942374, + "ewc_loss": 0.008510097861289978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510097541147843e-05, + "grad_norm": 4.238526344299316, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8784659504890442, + "num_tokens": 759881877.0, + "step": 19912 + }, + { + "epoch": 2.533138277572828, + "ewc_loss": 0.008560551330447197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560551214031875e-05, + "grad_norm": 4.1602678298950195, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8918219208717346, + "num_tokens": 759922625.0, + "step": 19913 + }, + { + "epoch": 2.5332654878514185, + "ewc_loss": 0.00848904624581337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489046012982726e-05, + "grad_norm": 4.253155708312988, + "learning_rate": 1e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8934344053268433, + "num_tokens": 759952429.0, + "step": 19914 + }, + { + "epoch": 2.533392698130009, + "ewc_loss": 0.008586821146309376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586821058997884e-05, + "grad_norm": 4.291199684143066, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.883195161819458, + "num_tokens": 759987678.0, + "step": 19915 + }, + { + "epoch": 2.5335199084085995, + "ewc_loss": 0.008554687723517418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.554687519790605e-05, + "grad_norm": 4.361597061157227, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8689868450164795, + "num_tokens": 760023943.0, + "step": 19916 + }, + { + "epoch": 2.53364711868719, + "ewc_loss": 0.008569596335291862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569596684537828e-05, + "grad_norm": 4.143351078033447, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8847715854644775, + "num_tokens": 760068229.0, + "step": 19917 + }, + { + "epoch": 2.5337743289657806, + "ewc_loss": 0.008447140455245972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447140135103837e-05, + "grad_norm": 4.244239330291748, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8700141906738281, + "num_tokens": 760109440.0, + "step": 19918 + }, + { + "epoch": 2.533901539244371, + "ewc_loss": 0.008583121001720428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583121234551072e-05, + "grad_norm": 4.218123435974121, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.883323073387146, + "num_tokens": 760146616.0, + "step": 19919 + }, + { + "epoch": 2.534028749522961, + "ewc_loss": 0.008528471924364567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528472244506702e-05, + "grad_norm": 4.203121185302734, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8645734786987305, + "num_tokens": 760188708.0, + "step": 19920 + }, + { + "epoch": 2.534155959801552, + "ewc_loss": 0.008509990759193897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509990584570915e-05, + "grad_norm": 4.164369583129883, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8964420557022095, + "num_tokens": 760231355.0, + "step": 19921 + }, + { + "epoch": 2.5342831700801423, + "ewc_loss": 0.008502807468175888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502807031618431e-05, + "grad_norm": 4.269887447357178, + "learning_rate": 1e-06, + "loss": 0.3781, + 
"mean_token_accuracy": 0.8701448440551758, + "num_tokens": 760268316.0, + "step": 19922 + }, + { + "epoch": 2.5344103803587332, + "ewc_loss": 0.008561830036342144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.561829599784687e-05, + "grad_norm": 4.245416164398193, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8751099705696106, + "num_tokens": 760305420.0, + "step": 19923 + }, + { + "epoch": 2.5345375906373233, + "ewc_loss": 0.008504538796842098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.504538709530607e-05, + "grad_norm": 4.19312858581543, + "learning_rate": 1e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.8990371823310852, + "num_tokens": 760342195.0, + "step": 19924 + }, + { + "epoch": 2.534664800915914, + "ewc_loss": 0.008492006920278072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492006600135937e-05, + "grad_norm": 4.204103946685791, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8956133723258972, + "num_tokens": 760375394.0, + "step": 19925 + }, + { + "epoch": 2.5347920111945044, + "ewc_loss": 0.008520002476871014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520002302248031e-05, + "grad_norm": 4.257657051086426, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8711241483688354, + "num_tokens": 760411629.0, + "step": 19926 + }, + { + "epoch": 2.534919221473095, + "ewc_loss": 0.008543855510652065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.543855074094608e-05, + "grad_norm": 4.165493965148926, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8840577602386475, + "num_tokens": 760456212.0, + "step": 19927 + }, + { + "epoch": 2.5350464317516854, + "ewc_loss": 0.008478470146656036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.478469680994749e-05, + "grad_norm": 4.229633331298828, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8733068704605103, + "num_tokens": 760496385.0, + "step": 19928 + }, + { + "epoch": 
2.535173642030276, + "ewc_loss": 0.008550327271223068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.550327038392425e-05, + "grad_norm": 4.260765552520752, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8822226524353027, + "num_tokens": 760530426.0, + "step": 19929 + }, + { + "epoch": 2.5353008523088665, + "ewc_loss": 0.008542136289179325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542136492906138e-05, + "grad_norm": 4.216324806213379, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8924882411956787, + "num_tokens": 760568174.0, + "step": 19930 + }, + { + "epoch": 2.535428062587457, + "ewc_loss": 0.008500020019710064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500020339852199e-05, + "grad_norm": 4.1914215087890625, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8985704779624939, + "num_tokens": 760608097.0, + "step": 19931 + }, + { + "epoch": 2.5355552728660475, + "ewc_loss": 0.008514872752130032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514872752130032e-05, + "grad_norm": 4.254186153411865, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8836870193481445, + "num_tokens": 760643018.0, + "step": 19932 + }, + { + "epoch": 2.535682483144638, + "ewc_loss": 0.008545836433768272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54583631735295e-05, + "grad_norm": 4.242652893066406, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8894883394241333, + "num_tokens": 760676553.0, + "step": 19933 + }, + { + "epoch": 2.5358096934232286, + "ewc_loss": 0.00852135755121708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521357085555792e-05, + "grad_norm": 4.23276948928833, + "learning_rate": 1e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.89415043592453, + "num_tokens": 760708850.0, + "step": 19934 + }, + { + "epoch": 2.535936903701819, + "ewc_loss": 0.008523574098944664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.523574069840834e-05, + "grad_norm": 4.242743968963623, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8793941140174866, + "num_tokens": 760746088.0, + "step": 19935 + }, + { + "epoch": 2.5360641139804097, + "ewc_loss": 0.008543805219233036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.543805597582832e-05, + "grad_norm": 4.195352554321289, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8862674236297607, + "num_tokens": 760785716.0, + "step": 19936 + }, + { + "epoch": 2.536191324259, + "ewc_loss": 0.008513545617461205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513545617461205e-05, + "grad_norm": 4.239654541015625, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8907924294471741, + "num_tokens": 760821301.0, + "step": 19937 + }, + { + "epoch": 2.5363185345375907, + "ewc_loss": 0.008543375879526138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.543375588487834e-05, + "grad_norm": 4.218279838562012, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8729590177536011, + "num_tokens": 760861200.0, + "step": 19938 + }, + { + "epoch": 2.5364457448161812, + "ewc_loss": 0.008537918329238892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537917892681435e-05, + "grad_norm": 4.23314905166626, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8815529346466064, + "num_tokens": 760896398.0, + "step": 19939 + }, + { + "epoch": 2.5365729550947718, + "ewc_loss": 0.008557409048080444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.557409455534071e-05, + "grad_norm": 4.189864635467529, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8643274903297424, + "num_tokens": 760937300.0, + "step": 19940 + }, + { + "epoch": 2.5367001653733623, + "ewc_loss": 0.008537120185792446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53712044772692e-05, + "grad_norm": 4.243072986602783, + "learning_rate": 1e-06, + "loss": 0.355, + 
"mean_token_accuracy": 0.8785874247550964, + "num_tokens": 760973726.0, + "step": 19941 + }, + { + "epoch": 2.536827375651953, + "ewc_loss": 0.008597788400948048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597788109909743e-05, + "grad_norm": 4.234226226806641, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8799022436141968, + "num_tokens": 761009048.0, + "step": 19942 + }, + { + "epoch": 2.5369545859305433, + "ewc_loss": 0.00857007410377264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570073987357318e-05, + "grad_norm": 4.161557674407959, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8793724775314331, + "num_tokens": 761050836.0, + "step": 19943 + }, + { + "epoch": 2.537081796209134, + "ewc_loss": 0.008521717973053455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521717973053455e-05, + "grad_norm": 4.248378276824951, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8755742311477661, + "num_tokens": 761087517.0, + "step": 19944 + }, + { + "epoch": 2.537209006487724, + "ewc_loss": 0.008614306338131428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614306716481224e-05, + "grad_norm": 4.210783004760742, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8960139155387878, + "num_tokens": 761121401.0, + "step": 19945 + }, + { + "epoch": 2.537336216766315, + "ewc_loss": 0.008556141518056393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.556141256121919e-05, + "grad_norm": 4.282581329345703, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.871566891670227, + "num_tokens": 761154973.0, + "step": 19946 + }, + { + "epoch": 2.537463427044905, + "ewc_loss": 0.008603325113654137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603325113654137e-05, + "grad_norm": 4.196300029754639, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8889903426170349, + "num_tokens": 761194211.0, + "step": 19947 + }, + { + "epoch": 
2.537590637323496, + "ewc_loss": 0.008538853377103806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538853580830619e-05, + "grad_norm": 4.238135814666748, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8749209642410278, + "num_tokens": 761233486.0, + "step": 19948 + }, + { + "epoch": 2.537717847602086, + "ewc_loss": 0.008570415899157524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570415957365185e-05, + "grad_norm": 4.177546977996826, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8838692903518677, + "num_tokens": 761275846.0, + "step": 19949 + }, + { + "epoch": 2.5378450578806766, + "ewc_loss": 0.008539491333067417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539490954717621e-05, + "grad_norm": 4.260608673095703, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8718236684799194, + "num_tokens": 761311210.0, + "step": 19950 + }, + { + "epoch": 2.537972268159267, + "ewc_loss": 0.008595101535320282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595101098762825e-05, + "grad_norm": 4.222383499145508, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8805938959121704, + "num_tokens": 761349961.0, + "step": 19951 + }, + { + "epoch": 2.5380994784378577, + "ewc_loss": 0.008527951315045357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.527951285941526e-05, + "grad_norm": 4.252630710601807, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8890634179115295, + "num_tokens": 761386993.0, + "step": 19952 + }, + { + "epoch": 2.538226688716448, + "ewc_loss": 0.008561515249311924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.561515278415754e-05, + "grad_norm": 4.2054972648620605, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.873451828956604, + "num_tokens": 761426910.0, + "step": 19953 + }, + { + "epoch": 2.5383538989950387, + "ewc_loss": 0.00851896870881319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.518968388671055e-05, + "grad_norm": 4.176422119140625, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8827426433563232, + "num_tokens": 761466633.0, + "step": 19954 + }, + { + "epoch": 2.5384811092736292, + "ewc_loss": 0.008519061841070652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519061520928517e-05, + "grad_norm": 4.250647068023682, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8715616464614868, + "num_tokens": 761497670.0, + "step": 19955 + }, + { + "epoch": 2.5386083195522198, + "ewc_loss": 0.008578364737331867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57836494105868e-05, + "grad_norm": 4.178371906280518, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8789452314376831, + "num_tokens": 761534626.0, + "step": 19956 + }, + { + "epoch": 2.5387355298308103, + "ewc_loss": 0.008499444462358952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499444084009156e-05, + "grad_norm": 4.144957065582275, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8882454037666321, + "num_tokens": 761581125.0, + "step": 19957 + }, + { + "epoch": 2.538862740109401, + "ewc_loss": 0.008523598313331604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52359808050096e-05, + "grad_norm": 4.2287917137146, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8818366527557373, + "num_tokens": 761616533.0, + "step": 19958 + }, + { + "epoch": 2.5389899503879914, + "ewc_loss": 0.008563349954783916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.563350274926051e-05, + "grad_norm": 4.272608757019043, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8714962005615234, + "num_tokens": 761650615.0, + "step": 19959 + }, + { + "epoch": 2.539117160666582, + "ewc_loss": 0.00855766236782074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.557661931263283e-05, + "grad_norm": 4.183681488037109, + "learning_rate": 1e-06, + "loss": 0.3119, + 
"mean_token_accuracy": 0.8920412659645081, + "num_tokens": 761687020.0, + "step": 19960 + }, + { + "epoch": 2.5392443709451724, + "ewc_loss": 0.008483875542879105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483875717502087e-05, + "grad_norm": 4.209064960479736, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.896290123462677, + "num_tokens": 761721653.0, + "step": 19961 + }, + { + "epoch": 2.539371581223763, + "ewc_loss": 0.008539441041648388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539441478205845e-05, + "grad_norm": 4.222157001495361, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8885681629180908, + "num_tokens": 761758377.0, + "step": 19962 + }, + { + "epoch": 2.5394987915023535, + "ewc_loss": 0.00852980837225914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529808110324666e-05, + "grad_norm": 4.243655204772949, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8829435110092163, + "num_tokens": 761795263.0, + "step": 19963 + }, + { + "epoch": 2.539626001780944, + "ewc_loss": 0.008552604354918003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552604413125664e-05, + "grad_norm": 4.199275493621826, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8918642997741699, + "num_tokens": 761834758.0, + "step": 19964 + }, + { + "epoch": 2.5397532120595345, + "ewc_loss": 0.008503705263137817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.503705612383783e-05, + "grad_norm": 4.181724548339844, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8758691549301147, + "num_tokens": 761880482.0, + "step": 19965 + }, + { + "epoch": 2.539880422338125, + "ewc_loss": 0.008506561629474163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50656142574735e-05, + "grad_norm": 4.224277496337891, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8806743621826172, + "num_tokens": 761922494.0, + "step": 19966 + }, + { + "epoch": 
2.5400076326167156, + "ewc_loss": 0.008517754264175892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517754031345248e-05, + "grad_norm": 4.205204486846924, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8987569212913513, + "num_tokens": 761959462.0, + "step": 19967 + }, + { + "epoch": 2.5401348428953057, + "ewc_loss": 0.008486160077154636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48615964059718e-05, + "grad_norm": 4.370140552520752, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.874082624912262, + "num_tokens": 761990316.0, + "step": 19968 + }, + { + "epoch": 2.5402620531738966, + "ewc_loss": 0.00858491100370884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.584911120124161e-05, + "grad_norm": 4.2793660163879395, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8776378631591797, + "num_tokens": 762023671.0, + "step": 19969 + }, + { + "epoch": 2.5403892634524867, + "ewc_loss": 0.008469975553452969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469975728075951e-05, + "grad_norm": 4.25016450881958, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8852154016494751, + "num_tokens": 762056719.0, + "step": 19970 + }, + { + "epoch": 2.5405164737310777, + "ewc_loss": 0.008503904566168785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50390424602665e-05, + "grad_norm": 4.197266101837158, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8825689554214478, + "num_tokens": 762098802.0, + "step": 19971 + }, + { + "epoch": 2.540643684009668, + "ewc_loss": 0.008480648510158062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480648830300197e-05, + "grad_norm": 4.226285457611084, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8851693868637085, + "num_tokens": 762134470.0, + "step": 19972 + }, + { + "epoch": 2.5407708942882588, + "ewc_loss": 0.008524182252585888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.524182339897379e-05, + "grad_norm": 4.186801910400391, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8852540254592896, + "num_tokens": 762177163.0, + "step": 19973 + }, + { + "epoch": 2.540898104566849, + "ewc_loss": 0.008495953865349293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495953807141632e-05, + "grad_norm": 4.197469234466553, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8895017504692078, + "num_tokens": 762217323.0, + "step": 19974 + }, + { + "epoch": 2.5410253148454394, + "ewc_loss": 0.008531765080988407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531765342922881e-05, + "grad_norm": 4.2716169357299805, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8791141510009766, + "num_tokens": 762252144.0, + "step": 19975 + }, + { + "epoch": 2.54115252512403, + "ewc_loss": 0.008548594079911709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548593905288726e-05, + "grad_norm": 4.347239017486572, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.875933051109314, + "num_tokens": 762282108.0, + "step": 19976 + }, + { + "epoch": 2.5412797354026204, + "ewc_loss": 0.00857697706669569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576976688345894e-05, + "grad_norm": 4.277769565582275, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8848389983177185, + "num_tokens": 762318154.0, + "step": 19977 + }, + { + "epoch": 2.541406945681211, + "ewc_loss": 0.00849789660423994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.497896487824619e-05, + "grad_norm": 4.233466148376465, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8795698285102844, + "num_tokens": 762354704.0, + "step": 19978 + }, + { + "epoch": 2.5415341559598015, + "ewc_loss": 0.00850547943264246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.505479490850121e-05, + "grad_norm": 4.195328235626221, + "learning_rate": 1e-06, + "loss": 0.3303, + 
"mean_token_accuracy": 0.8820317387580872, + "num_tokens": 762389943.0, + "step": 19979 + }, + { + "epoch": 2.541661366238392, + "ewc_loss": 0.008521722629666328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521722338628024e-05, + "grad_norm": 4.290704727172852, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.885860800743103, + "num_tokens": 762421560.0, + "step": 19980 + }, + { + "epoch": 2.5417885765169825, + "ewc_loss": 0.008559235371649265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559235720895231e-05, + "grad_norm": 4.195671081542969, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8878940343856812, + "num_tokens": 762458509.0, + "step": 19981 + }, + { + "epoch": 2.541915786795573, + "ewc_loss": 0.008509383536875248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509383769705892e-05, + "grad_norm": 4.1691179275512695, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8816343545913696, + "num_tokens": 762502382.0, + "step": 19982 + }, + { + "epoch": 2.5420429970741636, + "ewc_loss": 0.008499948307871819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499948307871819e-05, + "grad_norm": 4.253282070159912, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8780723810195923, + "num_tokens": 762536611.0, + "step": 19983 + }, + { + "epoch": 2.542170207352754, + "ewc_loss": 0.008588535711169243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.588535274611786e-05, + "grad_norm": 4.226931571960449, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8822555541992188, + "num_tokens": 762577801.0, + "step": 19984 + }, + { + "epoch": 2.5422974176313446, + "ewc_loss": 0.008517306298017502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517305832356215e-05, + "grad_norm": 4.177332878112793, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8845309019088745, + "num_tokens": 762617413.0, + "step": 19985 + }, + { + "epoch": 
2.542424627909935, + "ewc_loss": 0.008520549163222313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520549454260617e-05, + "grad_norm": 4.222088813781738, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8909637928009033, + "num_tokens": 762654811.0, + "step": 19986 + }, + { + "epoch": 2.5425518381885257, + "ewc_loss": 0.008555092848837376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555092790629715e-05, + "grad_norm": 4.182732105255127, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8868516087532043, + "num_tokens": 762692902.0, + "step": 19987 + }, + { + "epoch": 2.5426790484671162, + "ewc_loss": 0.008510522544384003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510522457072511e-05, + "grad_norm": 4.221382141113281, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8827490210533142, + "num_tokens": 762731984.0, + "step": 19988 + }, + { + "epoch": 2.5428062587457068, + "ewc_loss": 0.008540423586964607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54042373248376e-05, + "grad_norm": 4.273592948913574, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8908946514129639, + "num_tokens": 762767104.0, + "step": 19989 + }, + { + "epoch": 2.5429334690242973, + "ewc_loss": 0.008540309965610504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540310227544978e-05, + "grad_norm": 4.227522373199463, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.871074914932251, + "num_tokens": 762804982.0, + "step": 19990 + }, + { + "epoch": 2.543060679302888, + "ewc_loss": 0.008490679785609245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490680193062872e-05, + "grad_norm": 4.259029865264893, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8710418939590454, + "num_tokens": 762842956.0, + "step": 19991 + }, + { + "epoch": 2.5431878895814783, + "ewc_loss": 0.008510250598192215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.510251063853502e-05, + "grad_norm": 4.266402244567871, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8885620832443237, + "num_tokens": 762875445.0, + "step": 19992 + }, + { + "epoch": 2.5433150998600684, + "ewc_loss": 0.008513505570590496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513505599694327e-05, + "grad_norm": 4.263108253479004, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8944169282913208, + "num_tokens": 762906061.0, + "step": 19993 + }, + { + "epoch": 2.5434423101386594, + "ewc_loss": 0.008497954346239567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.497953967889771e-05, + "grad_norm": 4.196102142333984, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8894617557525635, + "num_tokens": 762945550.0, + "step": 19994 + }, + { + "epoch": 2.5435695204172495, + "ewc_loss": 0.00848681852221489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486818114761263e-05, + "grad_norm": 4.210533142089844, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8819069862365723, + "num_tokens": 762982586.0, + "step": 19995 + }, + { + "epoch": 2.5436967306958405, + "ewc_loss": 0.008535007946193218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535008237231523e-05, + "grad_norm": 4.2765793800354, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8827944993972778, + "num_tokens": 763013527.0, + "step": 19996 + }, + { + "epoch": 2.5438239409744305, + "ewc_loss": 0.008542198687791824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542198338545859e-05, + "grad_norm": 4.174760818481445, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8785911798477173, + "num_tokens": 763054363.0, + "step": 19997 + }, + { + "epoch": 2.5439511512530215, + "ewc_loss": 0.008472425863146782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.472425543004647e-05, + "grad_norm": 4.216904163360596, + "learning_rate": 1e-06, + "loss": 0.3544, + 
"mean_token_accuracy": 0.8774182796478271, + "num_tokens": 763096834.0, + "step": 19998 + }, + { + "epoch": 2.5440783615316116, + "ewc_loss": 0.008541766554117203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541766146663576e-05, + "grad_norm": 4.157712459564209, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8817669153213501, + "num_tokens": 763139611.0, + "step": 19999 + }, + { + "epoch": 2.544205571810202, + "ewc_loss": 0.008470460772514343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470460306853056e-05, + "grad_norm": 4.186838150024414, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8832079172134399, + "num_tokens": 763181085.0, + "step": 20000 + }, + { + "epoch": 2.5443327820887927, + "ewc_loss": 0.008532078936696053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532078936696053e-05, + "grad_norm": 4.250650882720947, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8783782720565796, + "num_tokens": 763219811.0, + "step": 20001 + }, + { + "epoch": 2.544459992367383, + "ewc_loss": 0.00854637660086155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546376193407923e-05, + "grad_norm": 4.257962226867676, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8707215785980225, + "num_tokens": 763257745.0, + "step": 20002 + }, + { + "epoch": 2.5445872026459737, + "ewc_loss": 0.008501922711730003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501923002768308e-05, + "grad_norm": 4.207979202270508, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8675076365470886, + "num_tokens": 763299475.0, + "step": 20003 + }, + { + "epoch": 2.5447144129245642, + "ewc_loss": 0.008481278084218502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481278200633824e-05, + "grad_norm": 4.253037452697754, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8645980954170227, + "num_tokens": 763333960.0, + "step": 20004 + }, + { + "epoch": 
2.5448416232031548, + "ewc_loss": 0.008550194092094898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.550193888368085e-05, + "grad_norm": 4.20019006729126, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8722606897354126, + "num_tokens": 763378646.0, + "step": 20005 + }, + { + "epoch": 2.5449688334817453, + "ewc_loss": 0.00847757887095213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477579103782773e-05, + "grad_norm": 4.233432292938232, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8855701088905334, + "num_tokens": 763412441.0, + "step": 20006 + }, + { + "epoch": 2.545096043760336, + "ewc_loss": 0.008535031229257584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535030792700127e-05, + "grad_norm": 4.235401153564453, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8863771557807922, + "num_tokens": 763446713.0, + "step": 20007 + }, + { + "epoch": 2.5452232540389264, + "ewc_loss": 0.00851945485919714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519455150235444e-05, + "grad_norm": 4.231273651123047, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8913028240203857, + "num_tokens": 763478686.0, + "step": 20008 + }, + { + "epoch": 2.545350464317517, + "ewc_loss": 0.00850448664277792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.504486322635785e-05, + "grad_norm": 4.16506290435791, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8969007730484009, + "num_tokens": 763517258.0, + "step": 20009 + }, + { + "epoch": 2.5454776745961074, + "ewc_loss": 0.008479567244648933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479566895402968e-05, + "grad_norm": 4.2685441970825195, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8771356344223022, + "num_tokens": 763553229.0, + "step": 20010 + }, + { + "epoch": 2.545604884874698, + "ewc_loss": 0.008568156510591507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.568156772525981e-05, + "grad_norm": 4.179200649261475, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8891695737838745, + "num_tokens": 763594274.0, + "step": 20011 + }, + { + "epoch": 2.5457320951532885, + "ewc_loss": 0.00845431536436081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454315684502944e-05, + "grad_norm": 4.212430000305176, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8742324113845825, + "num_tokens": 763632162.0, + "step": 20012 + }, + { + "epoch": 2.545859305431879, + "ewc_loss": 0.008539986796677113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539987175026909e-05, + "grad_norm": 4.269589424133301, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.883172869682312, + "num_tokens": 763666797.0, + "step": 20013 + }, + { + "epoch": 2.5459865157104695, + "ewc_loss": 0.008540239185094833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540238923160359e-05, + "grad_norm": 4.2306227684021, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8806418776512146, + "num_tokens": 763705475.0, + "step": 20014 + }, + { + "epoch": 2.54611372598906, + "ewc_loss": 0.008501635864377022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501636330038309e-05, + "grad_norm": 4.224420547485352, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8967454433441162, + "num_tokens": 763739434.0, + "step": 20015 + }, + { + "epoch": 2.5462409362676506, + "ewc_loss": 0.008513201028108597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513200737070292e-05, + "grad_norm": 4.252910137176514, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8717626333236694, + "num_tokens": 763773262.0, + "step": 20016 + }, + { + "epoch": 2.546368146546241, + "ewc_loss": 0.008528291247785091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528291073162109e-05, + "grad_norm": 4.25007438659668, + "learning_rate": 1e-06, + "loss": 0.3759, + 
"mean_token_accuracy": 0.8717126846313477, + "num_tokens": 763812573.0, + "step": 20017 + }, + { + "epoch": 2.546495356824831, + "ewc_loss": 0.008517441339790821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517441165167838e-05, + "grad_norm": 4.227293968200684, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8841162919998169, + "num_tokens": 763848555.0, + "step": 20018 + }, + { + "epoch": 2.546622567103422, + "ewc_loss": 0.008494514971971512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.494514622725546e-05, + "grad_norm": 4.233309745788574, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8800690770149231, + "num_tokens": 763884608.0, + "step": 20019 + }, + { + "epoch": 2.5467497773820122, + "ewc_loss": 0.008509930223226547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509930194122717e-05, + "grad_norm": 4.190340995788574, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8789365291595459, + "num_tokens": 763922038.0, + "step": 20020 + }, + { + "epoch": 2.546876987660603, + "ewc_loss": 0.008507310412824154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.507310849381611e-05, + "grad_norm": 4.205853462219238, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8794324398040771, + "num_tokens": 763958089.0, + "step": 20021 + }, + { + "epoch": 2.5470041979391933, + "ewc_loss": 0.008532305248081684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532305218977854e-05, + "grad_norm": 4.208911418914795, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8829038143157959, + "num_tokens": 764000802.0, + "step": 20022 + }, + { + "epoch": 2.547131408217784, + "ewc_loss": 0.008528961800038815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528961916454136e-05, + "grad_norm": 4.235440254211426, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8847854733467102, + "num_tokens": 764036628.0, + "step": 20023 + }, + { + "epoch": 
2.5472586184963744, + "ewc_loss": 0.008531080558896065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531080675311387e-05, + "grad_norm": 4.217134475708008, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8854604959487915, + "num_tokens": 764073643.0, + "step": 20024 + }, + { + "epoch": 2.547385828774965, + "ewc_loss": 0.008519514463841915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519514085492119e-05, + "grad_norm": 4.213525772094727, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8885846734046936, + "num_tokens": 764110473.0, + "step": 20025 + }, + { + "epoch": 2.5475130390535554, + "ewc_loss": 0.008514394983649254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514395449310541e-05, + "grad_norm": 4.245964050292969, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8810783624649048, + "num_tokens": 764142941.0, + "step": 20026 + }, + { + "epoch": 2.547640249332146, + "ewc_loss": 0.008535962551832199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535962115274742e-05, + "grad_norm": 4.258547306060791, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8844910264015198, + "num_tokens": 764178752.0, + "step": 20027 + }, + { + "epoch": 2.5477674596107365, + "ewc_loss": 0.008541696704924107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54169629747048e-05, + "grad_norm": 4.202315807342529, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8790944814682007, + "num_tokens": 764217440.0, + "step": 20028 + }, + { + "epoch": 2.547894669889327, + "ewc_loss": 0.008474752306938171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474752394249663e-05, + "grad_norm": 4.205550193786621, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8729327917098999, + "num_tokens": 764257524.0, + "step": 20029 + }, + { + "epoch": 2.5480218801679175, + "ewc_loss": 0.008541366085410118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.541365968994796e-05, + "grad_norm": 4.248668670654297, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8778356313705444, + "num_tokens": 764293645.0, + "step": 20030 + }, + { + "epoch": 2.548149090446508, + "ewc_loss": 0.008540183305740356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54018289828673e-05, + "grad_norm": 4.22951602935791, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8896487951278687, + "num_tokens": 764331660.0, + "step": 20031 + }, + { + "epoch": 2.5482763007250986, + "ewc_loss": 0.008510752581059933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51075237733312e-05, + "grad_norm": 4.147308826446533, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8985925912857056, + "num_tokens": 764376577.0, + "step": 20032 + }, + { + "epoch": 2.548403511003689, + "ewc_loss": 0.008487578481435776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.487578452331945e-05, + "grad_norm": 4.241135120391846, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8803592324256897, + "num_tokens": 764415492.0, + "step": 20033 + }, + { + "epoch": 2.5485307212822796, + "ewc_loss": 0.008545040152966976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545040327589959e-05, + "grad_norm": 4.224682331085205, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.872930109500885, + "num_tokens": 764453948.0, + "step": 20034 + }, + { + "epoch": 2.54865793156087, + "ewc_loss": 0.008516144938766956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516144589520991e-05, + "grad_norm": 4.222723960876465, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.886208713054657, + "num_tokens": 764493906.0, + "step": 20035 + }, + { + "epoch": 2.5487851418394607, + "ewc_loss": 0.00849184300750494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491842891089618e-05, + "grad_norm": 4.333807468414307, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 
0.8778491616249084, + "num_tokens": 764532379.0, + "step": 20036 + }, + { + "epoch": 2.5489123521180512, + "ewc_loss": 0.00854964554309845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549646008759737e-05, + "grad_norm": 4.279937744140625, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8655596375465393, + "num_tokens": 764567726.0, + "step": 20037 + }, + { + "epoch": 2.5490395623966418, + "ewc_loss": 0.008497921749949455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.497921953676268e-05, + "grad_norm": 4.2224507331848145, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.881364643573761, + "num_tokens": 764606229.0, + "step": 20038 + }, + { + "epoch": 2.5491667726752323, + "ewc_loss": 0.008490313775837421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490314212394878e-05, + "grad_norm": 4.205783367156982, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8905004262924194, + "num_tokens": 764641346.0, + "step": 20039 + }, + { + "epoch": 2.549293982953823, + "ewc_loss": 0.008497913368046284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.497913222527131e-05, + "grad_norm": 4.168092250823975, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8834102153778076, + "num_tokens": 764687286.0, + "step": 20040 + }, + { + "epoch": 2.5494211932324133, + "ewc_loss": 0.008482036180794239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482036355417222e-05, + "grad_norm": 4.152205467224121, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8853839635848999, + "num_tokens": 764729235.0, + "step": 20041 + }, + { + "epoch": 2.549548403511004, + "ewc_loss": 0.008505567908287048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.505568257533014e-05, + "grad_norm": 4.206298828125, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8751072287559509, + "num_tokens": 764771834.0, + "step": 20042 + }, + { + "epoch": 2.549675613789594, + "ewc_loss": 
0.00851126853376627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511268970323727e-05, + "grad_norm": 4.2461371421813965, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8816293478012085, + "num_tokens": 764808307.0, + "step": 20043 + }, + { + "epoch": 2.549802824068185, + "ewc_loss": 0.00851947907358408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519479160895571e-05, + "grad_norm": 4.205881118774414, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8752444982528687, + "num_tokens": 764848373.0, + "step": 20044 + }, + { + "epoch": 2.549930034346775, + "ewc_loss": 0.008480251766741276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480251563014463e-05, + "grad_norm": 4.249605655670166, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8782484531402588, + "num_tokens": 764886468.0, + "step": 20045 + }, + { + "epoch": 2.550057244625366, + "ewc_loss": 0.0085245780646801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524578151991591e-05, + "grad_norm": 4.252676010131836, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8833330273628235, + "num_tokens": 764928690.0, + "step": 20046 + }, + { + "epoch": 2.550184454903956, + "ewc_loss": 0.008482144214212894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482144039589912e-05, + "grad_norm": 4.238306999206543, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8521186113357544, + "num_tokens": 764969148.0, + "step": 20047 + }, + { + "epoch": 2.5503116651825466, + "ewc_loss": 0.008492774330079556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492774213664234e-05, + "grad_norm": 4.1933746337890625, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.875747799873352, + "num_tokens": 765010754.0, + "step": 20048 + }, + { + "epoch": 2.550438875461137, + "ewc_loss": 0.008477382361888885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47738265292719e-05, + "grad_norm": 
4.2418670654296875, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8853570222854614, + "num_tokens": 765043562.0, + "step": 20049 + }, + { + "epoch": 2.5505660857397277, + "ewc_loss": 0.008510631509125233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510631596436724e-05, + "grad_norm": 4.2179694175720215, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8645718097686768, + "num_tokens": 765085638.0, + "step": 20050 + }, + { + "epoch": 2.550693296018318, + "ewc_loss": 0.00848372932523489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483729470754042e-05, + "grad_norm": 4.282119274139404, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8891961574554443, + "num_tokens": 765115847.0, + "step": 20051 + }, + { + "epoch": 2.5508205062969087, + "ewc_loss": 0.008531010709702969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53101082611829e-05, + "grad_norm": 4.2889862060546875, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8811097145080566, + "num_tokens": 765146839.0, + "step": 20052 + }, + { + "epoch": 2.5509477165754992, + "ewc_loss": 0.008516868576407433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516868547303602e-05, + "grad_norm": 4.2024922370910645, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8832027316093445, + "num_tokens": 765185921.0, + "step": 20053 + }, + { + "epoch": 2.5510749268540898, + "ewc_loss": 0.008480044081807137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480044198222458e-05, + "grad_norm": 4.222987651824951, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8823479413986206, + "num_tokens": 765224534.0, + "step": 20054 + }, + { + "epoch": 2.5512021371326803, + "ewc_loss": 0.008548468351364136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548468031222001e-05, + "grad_norm": 4.177785396575928, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8892433643341064, + 
"num_tokens": 765267160.0, + "step": 20055 + }, + { + "epoch": 2.551329347411271, + "ewc_loss": 0.00849634874612093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.496348891640082e-05, + "grad_norm": 4.2521162033081055, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.883333683013916, + "num_tokens": 765304770.0, + "step": 20056 + }, + { + "epoch": 2.5514565576898613, + "ewc_loss": 0.008552655577659607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552655344828963e-05, + "grad_norm": 4.252103328704834, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8798412084579468, + "num_tokens": 765341966.0, + "step": 20057 + }, + { + "epoch": 2.551583767968452, + "ewc_loss": 0.008539687842130661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539687405573204e-05, + "grad_norm": 4.173533916473389, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8957555294036865, + "num_tokens": 765382653.0, + "step": 20058 + }, + { + "epoch": 2.5517109782470424, + "ewc_loss": 0.008478917181491852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.478917152388021e-05, + "grad_norm": 4.285828590393066, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8813364505767822, + "num_tokens": 765414940.0, + "step": 20059 + }, + { + "epoch": 2.551838188525633, + "ewc_loss": 0.008572204038500786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572204387746751e-05, + "grad_norm": 4.300060272216797, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.886883556842804, + "num_tokens": 765445549.0, + "step": 20060 + }, + { + "epoch": 2.5519653988042235, + "ewc_loss": 0.008546756580471992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546756725991145e-05, + "grad_norm": 4.184765338897705, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8893626928329468, + "num_tokens": 765484474.0, + "step": 20061 + }, + { + "epoch": 2.552092609082814, + "ewc_loss": 0.008481459692120552, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481459371978417e-05, + "grad_norm": 4.216904163360596, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8834701776504517, + "num_tokens": 765522651.0, + "step": 20062 + }, + { + "epoch": 2.5522198193614045, + "ewc_loss": 0.008540987968444824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54098834679462e-05, + "grad_norm": 4.207226276397705, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.875673770904541, + "num_tokens": 765565983.0, + "step": 20063 + }, + { + "epoch": 2.552347029639995, + "ewc_loss": 0.008513107895851135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513108332408592e-05, + "grad_norm": 4.205340385437012, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8825501203536987, + "num_tokens": 765604056.0, + "step": 20064 + }, + { + "epoch": 2.5524742399185856, + "ewc_loss": 0.008513189852237701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513189823133871e-05, + "grad_norm": 4.184364318847656, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8909829258918762, + "num_tokens": 765646062.0, + "step": 20065 + }, + { + "epoch": 2.5526014501971757, + "ewc_loss": 0.00849681906402111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.496818918501958e-05, + "grad_norm": 4.2288408279418945, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8819090723991394, + "num_tokens": 765687284.0, + "step": 20066 + }, + { + "epoch": 2.5527286604757666, + "ewc_loss": 0.008539082482457161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539082773495466e-05, + "grad_norm": 4.279043197631836, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8696160316467285, + "num_tokens": 765723743.0, + "step": 20067 + }, + { + "epoch": 2.5528558707543567, + "ewc_loss": 0.008544369600713253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544369484297931e-05, + "grad_norm": 4.162557601928711, + 
"learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.880468487739563, + "num_tokens": 765768472.0, + "step": 20068 + }, + { + "epoch": 2.5529830810329477, + "ewc_loss": 0.008443409577012062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443409751635045e-05, + "grad_norm": 4.188168525695801, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8969568014144897, + "num_tokens": 765809040.0, + "step": 20069 + }, + { + "epoch": 2.5531102913115378, + "ewc_loss": 0.008509056642651558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509056351613253e-05, + "grad_norm": 4.231927394866943, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8837013244628906, + "num_tokens": 765845968.0, + "step": 20070 + }, + { + "epoch": 2.5532375015901287, + "ewc_loss": 0.008531869389116764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531869389116764e-05, + "grad_norm": 4.293279647827148, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8873001933097839, + "num_tokens": 765881037.0, + "step": 20071 + }, + { + "epoch": 2.553364711868719, + "ewc_loss": 0.008521078154444695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521078416379169e-05, + "grad_norm": 4.187528610229492, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8862546682357788, + "num_tokens": 765918945.0, + "step": 20072 + }, + { + "epoch": 2.5534919221473094, + "ewc_loss": 0.008447205647826195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447205618722364e-05, + "grad_norm": 4.195401668548584, + "learning_rate": 1e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.8954197764396667, + "num_tokens": 765956128.0, + "step": 20073 + }, + { + "epoch": 2.5536191324259, + "ewc_loss": 0.008463534526526928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.463534322800115e-05, + "grad_norm": 4.169838905334473, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8961372375488281, + "num_tokens": 765997109.0, + 
"step": 20074 + }, + { + "epoch": 2.5537463427044904, + "ewc_loss": 0.008442079648375511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442079706583172e-05, + "grad_norm": 4.230630874633789, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.880642831325531, + "num_tokens": 766032557.0, + "step": 20075 + }, + { + "epoch": 2.553873552983081, + "ewc_loss": 0.008485764265060425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48576455609873e-05, + "grad_norm": 4.225239276885986, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8784169554710388, + "num_tokens": 766075285.0, + "step": 20076 + }, + { + "epoch": 2.5540007632616715, + "ewc_loss": 0.00845123641192913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451236499240622e-05, + "grad_norm": 4.249449253082275, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8795239925384521, + "num_tokens": 766110600.0, + "step": 20077 + }, + { + "epoch": 2.554127973540262, + "ewc_loss": 0.008464821614325047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.464821439702064e-05, + "grad_norm": 4.263832092285156, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.872212290763855, + "num_tokens": 766146877.0, + "step": 20078 + }, + { + "epoch": 2.5542551838188525, + "ewc_loss": 0.008465228602290154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.465228165732697e-05, + "grad_norm": 4.203237533569336, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8779245615005493, + "num_tokens": 766185164.0, + "step": 20079 + }, + { + "epoch": 2.554382394097443, + "ewc_loss": 0.008433191105723381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433190669165924e-05, + "grad_norm": 4.20717716217041, + "learning_rate": 1e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.9008887410163879, + "num_tokens": 766221442.0, + "step": 20080 + }, + { + "epoch": 2.5545096043760336, + "ewc_loss": 0.008454354479908943, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.4543542470783e-05, + "grad_norm": 4.294070720672607, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8604340553283691, + "num_tokens": 766255258.0, + "step": 20081 + }, + { + "epoch": 2.554636814654624, + "ewc_loss": 0.008500291034579277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500291005475447e-05, + "grad_norm": 4.2182817459106445, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8811992406845093, + "num_tokens": 766292392.0, + "step": 20082 + }, + { + "epoch": 2.5547640249332146, + "ewc_loss": 0.008437201380729675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437201177002862e-05, + "grad_norm": 4.198540687561035, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8725990056991577, + "num_tokens": 766335312.0, + "step": 20083 + }, + { + "epoch": 2.554891235211805, + "ewc_loss": 0.008471368812024593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471369073959067e-05, + "grad_norm": 4.237101078033447, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8830966949462891, + "num_tokens": 766372189.0, + "step": 20084 + }, + { + "epoch": 2.5550184454903957, + "ewc_loss": 0.00848979502916336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489794709021226e-05, + "grad_norm": 4.195140838623047, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8732030391693115, + "num_tokens": 766414021.0, + "step": 20085 + }, + { + "epoch": 2.5551456557689862, + "ewc_loss": 0.008454489521682262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454489579889923e-05, + "grad_norm": 4.208016872406006, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8944171667098999, + "num_tokens": 766452005.0, + "step": 20086 + }, + { + "epoch": 2.5552728660475768, + "ewc_loss": 0.00848359800875187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483597775921226e-05, + "grad_norm": 4.194736480712891, + "learning_rate": 1e-06, + "loss": 
0.3466, + "mean_token_accuracy": 0.877600371837616, + "num_tokens": 766492150.0, + "step": 20087 + }, + { + "epoch": 2.5554000763261673, + "ewc_loss": 0.008470200933516026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470201282761991e-05, + "grad_norm": 4.168905735015869, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.892785370349884, + "num_tokens": 766533889.0, + "step": 20088 + }, + { + "epoch": 2.555527286604758, + "ewc_loss": 0.008450287394225597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450286986771971e-05, + "grad_norm": 4.245815753936768, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8829418420791626, + "num_tokens": 766565008.0, + "step": 20089 + }, + { + "epoch": 2.5556544968833483, + "ewc_loss": 0.008524667471647263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524667646270245e-05, + "grad_norm": 4.2061333656311035, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8825792074203491, + "num_tokens": 766605744.0, + "step": 20090 + }, + { + "epoch": 2.5557817071619384, + "ewc_loss": 0.008473347872495651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473348134430125e-05, + "grad_norm": 4.220498561859131, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8903765678405762, + "num_tokens": 766645782.0, + "step": 20091 + }, + { + "epoch": 2.5559089174405294, + "ewc_loss": 0.008498585782945156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498585521010682e-05, + "grad_norm": 4.272675037384033, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8723399043083191, + "num_tokens": 766683135.0, + "step": 20092 + }, + { + "epoch": 2.5560361277191195, + "ewc_loss": 0.008503831923007965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50383221404627e-05, + "grad_norm": 4.242182731628418, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8836634755134583, + "num_tokens": 766720691.0, + "step": 20093 + }, + { + "epoch": 
2.5561633379977104, + "ewc_loss": 0.008475705981254578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.475706272292882e-05, + "grad_norm": 4.2351861000061035, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8785099983215332, + "num_tokens": 766755426.0, + "step": 20094 + }, + { + "epoch": 2.5562905482763005, + "ewc_loss": 0.008496839553117752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.496839291183278e-05, + "grad_norm": 4.28564453125, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8874753713607788, + "num_tokens": 766788315.0, + "step": 20095 + }, + { + "epoch": 2.5564177585548915, + "ewc_loss": 0.008536669425666332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536669338354841e-05, + "grad_norm": 4.1713361740112305, + "learning_rate": 1e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.8968561887741089, + "num_tokens": 766825664.0, + "step": 20096 + }, + { + "epoch": 2.5565449688334816, + "ewc_loss": 0.008437501266598701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.437500946456566e-05, + "grad_norm": 4.160488605499268, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8928687572479248, + "num_tokens": 766868145.0, + "step": 20097 + }, + { + "epoch": 2.556672179112072, + "ewc_loss": 0.008492378517985344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492378401570022e-05, + "grad_norm": 4.241523742675781, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8711576461791992, + "num_tokens": 766907278.0, + "step": 20098 + }, + { + "epoch": 2.5567993893906626, + "ewc_loss": 0.008525392040610313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525392331648618e-05, + "grad_norm": 4.179884910583496, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8935574889183044, + "num_tokens": 766941662.0, + "step": 20099 + }, + { + "epoch": 2.556926599669253, + "ewc_loss": 0.008469918742775917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.469918975606561e-05, + "grad_norm": 4.335518836975098, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8663888573646545, + "num_tokens": 766977725.0, + "step": 20100 + }, + { + "epoch": 2.5570538099478437, + "ewc_loss": 0.008590447716414928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590448123868555e-05, + "grad_norm": 4.269694805145264, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8786607384681702, + "num_tokens": 767011011.0, + "step": 20101 + }, + { + "epoch": 2.5571810202264342, + "ewc_loss": 0.008500403724610806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500403782818466e-05, + "grad_norm": 4.201160430908203, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8792304992675781, + "num_tokens": 767047078.0, + "step": 20102 + }, + { + "epoch": 2.5573082305050248, + "ewc_loss": 0.008502204902470112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502204582327977e-05, + "grad_norm": 4.179903984069824, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8799037337303162, + "num_tokens": 767089208.0, + "step": 20103 + }, + { + "epoch": 2.5574354407836153, + "ewc_loss": 0.008517072536051273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517072274116799e-05, + "grad_norm": 4.211775779724121, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8668739795684814, + "num_tokens": 767131225.0, + "step": 20104 + }, + { + "epoch": 2.557562651062206, + "ewc_loss": 0.008540804497897625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540804265066981e-05, + "grad_norm": 4.279207229614258, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8726348876953125, + "num_tokens": 767165222.0, + "step": 20105 + }, + { + "epoch": 2.5576898613407963, + "ewc_loss": 0.008582215756177902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582216105423868e-05, + "grad_norm": 4.24161434173584, + "learning_rate": 1e-06, + "loss": 0.3397, + 
"mean_token_accuracy": 0.8824588656425476, + "num_tokens": 767200870.0, + "step": 20106 + }, + { + "epoch": 2.557817071619387, + "ewc_loss": 0.008528579957783222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528579928679392e-05, + "grad_norm": 4.218517303466797, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8803210258483887, + "num_tokens": 767236881.0, + "step": 20107 + }, + { + "epoch": 2.5579442818979774, + "ewc_loss": 0.008542753756046295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542753494111821e-05, + "grad_norm": 4.2635087966918945, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8879114389419556, + "num_tokens": 767269716.0, + "step": 20108 + }, + { + "epoch": 2.558071492176568, + "ewc_loss": 0.008579122833907604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579123095842078e-05, + "grad_norm": 4.2211198806762695, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8748384714126587, + "num_tokens": 767306840.0, + "step": 20109 + }, + { + "epoch": 2.5581987024551585, + "ewc_loss": 0.008563117124140263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.563116716686636e-05, + "grad_norm": 4.163736343383789, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8929859399795532, + "num_tokens": 767346966.0, + "step": 20110 + }, + { + "epoch": 2.558325912733749, + "ewc_loss": 0.008556692861020565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.556692773709074e-05, + "grad_norm": 4.245052814483643, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8779294490814209, + "num_tokens": 767387084.0, + "step": 20111 + }, + { + "epoch": 2.5584531230123395, + "ewc_loss": 0.008619721978902817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61972221173346e-05, + "grad_norm": 4.381757736206055, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8657297492027283, + "num_tokens": 767417857.0, + "step": 20112 + }, + { + "epoch": 
2.55858033329093, + "ewc_loss": 0.008661084808409214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.66108457557857e-05, + "grad_norm": 4.235978126525879, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8839755058288574, + "num_tokens": 767455771.0, + "step": 20113 + }, + { + "epoch": 2.5587075435695206, + "ewc_loss": 0.008530068211257458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530067862011492e-05, + "grad_norm": 4.2446513175964355, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8695166110992432, + "num_tokens": 767493187.0, + "step": 20114 + }, + { + "epoch": 2.558834753848111, + "ewc_loss": 0.008624646812677383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62464657984674e-05, + "grad_norm": 4.211669921875, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8735631704330444, + "num_tokens": 767534526.0, + "step": 20115 + }, + { + "epoch": 2.558961964126701, + "ewc_loss": 0.008579764515161514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579764107707888e-05, + "grad_norm": 4.267177104949951, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8812688589096069, + "num_tokens": 767569297.0, + "step": 20116 + }, + { + "epoch": 2.559089174405292, + "ewc_loss": 0.008615648373961449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615648403065279e-05, + "grad_norm": 4.234033107757568, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8789851069450378, + "num_tokens": 767605860.0, + "step": 20117 + }, + { + "epoch": 2.5592163846838822, + "ewc_loss": 0.008581516332924366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581516158301383e-05, + "grad_norm": 4.16439151763916, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8830780386924744, + "num_tokens": 767645252.0, + "step": 20118 + }, + { + "epoch": 2.559343594962473, + "ewc_loss": 0.008547138422727585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547137986170128e-05, 
+ "grad_norm": 4.1894965171813965, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8820465207099915, + "num_tokens": 767681942.0, + "step": 20119 + }, + { + "epoch": 2.5594708052410633, + "ewc_loss": 0.00860777497291565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607775089330971e-05, + "grad_norm": 4.218990325927734, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8788769841194153, + "num_tokens": 767720631.0, + "step": 20120 + }, + { + "epoch": 2.559598015519654, + "ewc_loss": 0.00858918484300375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589185017626733e-05, + "grad_norm": 4.1879096031188965, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8853510618209839, + "num_tokens": 767757018.0, + "step": 20121 + }, + { + "epoch": 2.5597252257982444, + "ewc_loss": 0.00856888946145773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.56888946145773e-05, + "grad_norm": 4.212501049041748, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8847436904907227, + "num_tokens": 767796046.0, + "step": 20122 + }, + { + "epoch": 2.559852436076835, + "ewc_loss": 0.00859568826854229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59568826854229e-05, + "grad_norm": 4.220678329467773, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8921332359313965, + "num_tokens": 767835678.0, + "step": 20123 + }, + { + "epoch": 2.5599796463554254, + "ewc_loss": 0.008569439873099327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569440251449123e-05, + "grad_norm": 4.14717435836792, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.894904375076294, + "num_tokens": 767870515.0, + "step": 20124 + }, + { + "epoch": 2.560106856634016, + "ewc_loss": 0.008539865724742413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539865666534752e-05, + "grad_norm": 4.205728054046631, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.86707603931427, + 
"num_tokens": 767911573.0, + "step": 20125 + }, + { + "epoch": 2.5602340669126065, + "ewc_loss": 0.008590183220803738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590183279011399e-05, + "grad_norm": 4.201638698577881, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8809998035430908, + "num_tokens": 767950161.0, + "step": 20126 + }, + { + "epoch": 2.560361277191197, + "ewc_loss": 0.00854526087641716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545260789105669e-05, + "grad_norm": 4.2145609855651855, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8796026110649109, + "num_tokens": 767989431.0, + "step": 20127 + }, + { + "epoch": 2.5604884874697875, + "ewc_loss": 0.008552576415240765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55257676448673e-05, + "grad_norm": 4.219067573547363, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8920152187347412, + "num_tokens": 768023551.0, + "step": 20128 + }, + { + "epoch": 2.560615697748378, + "ewc_loss": 0.008534794673323631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.534795051673427e-05, + "grad_norm": 4.268265247344971, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8886329531669617, + "num_tokens": 768055317.0, + "step": 20129 + }, + { + "epoch": 2.5607429080269686, + "ewc_loss": 0.008565855212509632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565855387132615e-05, + "grad_norm": 4.249041557312012, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8763527870178223, + "num_tokens": 768089575.0, + "step": 20130 + }, + { + "epoch": 2.560870118305559, + "ewc_loss": 0.008549639023840427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549639460397884e-05, + "grad_norm": 4.220844745635986, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8737543821334839, + "num_tokens": 768131795.0, + "step": 20131 + }, + { + "epoch": 2.5609973285841496, + "ewc_loss": 0.008533031679689884, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53303208714351e-05, + "grad_norm": 4.211521148681641, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8861371278762817, + "num_tokens": 768165714.0, + "step": 20132 + }, + { + "epoch": 2.56112453886274, + "ewc_loss": 0.008510980755090714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510980842402205e-05, + "grad_norm": 4.1761298179626465, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.891501784324646, + "num_tokens": 768203475.0, + "step": 20133 + }, + { + "epoch": 2.5612517491413307, + "ewc_loss": 0.008530277758836746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530278137186542e-05, + "grad_norm": 4.187507152557373, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8959177732467651, + "num_tokens": 768244841.0, + "step": 20134 + }, + { + "epoch": 2.561378959419921, + "ewc_loss": 0.008519133552908897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519133552908897e-05, + "grad_norm": 4.214580535888672, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8773701190948486, + "num_tokens": 768286502.0, + "step": 20135 + }, + { + "epoch": 2.5615061696985117, + "ewc_loss": 0.008513141423463821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513141074217856e-05, + "grad_norm": 4.238025188446045, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8895179033279419, + "num_tokens": 768320628.0, + "step": 20136 + }, + { + "epoch": 2.5616333799771023, + "ewc_loss": 0.00852102693170309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521026757080108e-05, + "grad_norm": 4.287057399749756, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8753840327262878, + "num_tokens": 768353630.0, + "step": 20137 + }, + { + "epoch": 2.561760590255693, + "ewc_loss": 0.0085375364869833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537536632502452e-05, + "grad_norm": 4.247334957122803, + "learning_rate": 
1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8819230794906616, + "num_tokens": 768388170.0, + "step": 20138 + }, + { + "epoch": 2.5618878005342833, + "ewc_loss": 0.00848403200507164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.484032150590792e-05, + "grad_norm": 4.157954216003418, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8993826508522034, + "num_tokens": 768428238.0, + "step": 20139 + }, + { + "epoch": 2.562015010812874, + "ewc_loss": 0.008468899875879288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.468900341540575e-05, + "grad_norm": 4.2552289962768555, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8950201869010925, + "num_tokens": 768460075.0, + "step": 20140 + }, + { + "epoch": 2.562142221091464, + "ewc_loss": 0.00856612715870142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566126780351624e-05, + "grad_norm": 4.214108467102051, + "learning_rate": 1e-06, + "loss": 0.2839, + "mean_token_accuracy": 0.8987585306167603, + "num_tokens": 768493982.0, + "step": 20141 + }, + { + "epoch": 2.562269431370055, + "ewc_loss": 0.008484735153615475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.484735008096322e-05, + "grad_norm": 4.303704738616943, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8851144909858704, + "num_tokens": 768529121.0, + "step": 20142 + }, + { + "epoch": 2.562396641648645, + "ewc_loss": 0.008568978868424892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.568978955736384e-05, + "grad_norm": 4.304526329040527, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8813643455505371, + "num_tokens": 768560730.0, + "step": 20143 + }, + { + "epoch": 2.562523851927236, + "ewc_loss": 0.008523267693817616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523267752025276e-05, + "grad_norm": 4.1651458740234375, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8925586342811584, + "num_tokens": 768603015.0, + "step": 20144 + }, 
+ { + "epoch": 2.562651062205826, + "ewc_loss": 0.008451072499155998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451072062598541e-05, + "grad_norm": 4.229378700256348, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8830834627151489, + "num_tokens": 768639847.0, + "step": 20145 + }, + { + "epoch": 2.5627782724844166, + "ewc_loss": 0.008562483824789524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562483708374202e-05, + "grad_norm": 4.205150604248047, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8796463012695312, + "num_tokens": 768681614.0, + "step": 20146 + }, + { + "epoch": 2.562905482763007, + "ewc_loss": 0.008501444011926651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501444244757295e-05, + "grad_norm": 4.197841167449951, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8884262442588806, + "num_tokens": 768722052.0, + "step": 20147 + }, + { + "epoch": 2.5630326930415976, + "ewc_loss": 0.008506213314831257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50621290737763e-05, + "grad_norm": 4.272712230682373, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8765605688095093, + "num_tokens": 768754191.0, + "step": 20148 + }, + { + "epoch": 2.563159903320188, + "ewc_loss": 0.008548146113753319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548146433895454e-05, + "grad_norm": 4.235438346862793, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8723996877670288, + "num_tokens": 768792981.0, + "step": 20149 + }, + { + "epoch": 2.5632871135987787, + "ewc_loss": 0.00848911702632904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489116589771584e-05, + "grad_norm": 4.244767189025879, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8741041421890259, + "num_tokens": 768827040.0, + "step": 20150 + }, + { + "epoch": 2.5634143238773692, + "ewc_loss": 0.008539518341422081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.539517875760794e-05, + "grad_norm": 4.2580037117004395, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8804408311843872, + "num_tokens": 768865669.0, + "step": 20151 + }, + { + "epoch": 2.5635415341559598, + "ewc_loss": 0.00852565560489893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525655721314251e-05, + "grad_norm": 4.214874744415283, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8814330101013184, + "num_tokens": 768903306.0, + "step": 20152 + }, + { + "epoch": 2.5636687444345503, + "ewc_loss": 0.00852163415402174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521634299540892e-05, + "grad_norm": 4.1918864250183105, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8861909508705139, + "num_tokens": 768946322.0, + "step": 20153 + }, + { + "epoch": 2.563795954713141, + "ewc_loss": 0.008494707755744457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.494708163198084e-05, + "grad_norm": 4.1403727531433105, + "learning_rate": 1e-06, + "loss": 0.2823, + "mean_token_accuracy": 0.8989728093147278, + "num_tokens": 768987350.0, + "step": 20154 + }, + { + "epoch": 2.5639231649917313, + "ewc_loss": 0.008502442389726639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50244250614196e-05, + "grad_norm": 4.247735023498535, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8900836706161499, + "num_tokens": 769030762.0, + "step": 20155 + }, + { + "epoch": 2.564050375270322, + "ewc_loss": 0.008542025461792946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542025898350403e-05, + "grad_norm": 4.211147308349609, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8922991156578064, + "num_tokens": 769069827.0, + "step": 20156 + }, + { + "epoch": 2.5641775855489124, + "ewc_loss": 0.008464489132165909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.464488928439096e-05, + "grad_norm": 4.214038848876953, + "learning_rate": 1e-06, + "loss": 0.3547, + 
"mean_token_accuracy": 0.8822426795959473, + "num_tokens": 769107742.0, + "step": 20157 + }, + { + "epoch": 2.564304795827503, + "ewc_loss": 0.008486012928187847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486012666253373e-05, + "grad_norm": 4.186560153961182, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8822628855705261, + "num_tokens": 769153286.0, + "step": 20158 + }, + { + "epoch": 2.5644320061060935, + "ewc_loss": 0.008463882841169834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.463882841169834e-05, + "grad_norm": 4.2343220710754395, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8841469287872314, + "num_tokens": 769192646.0, + "step": 20159 + }, + { + "epoch": 2.564559216384684, + "ewc_loss": 0.008498811163008213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498811075696722e-05, + "grad_norm": 4.258423328399658, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8617708683013916, + "num_tokens": 769230360.0, + "step": 20160 + }, + { + "epoch": 2.5646864266632745, + "ewc_loss": 0.00849106814712286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491068001603708e-05, + "grad_norm": 4.186943054199219, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8967058658599854, + "num_tokens": 769270754.0, + "step": 20161 + }, + { + "epoch": 2.564813636941865, + "ewc_loss": 0.008439619094133377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439618977718055e-05, + "grad_norm": 4.278416156768799, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8614743947982788, + "num_tokens": 769310109.0, + "step": 20162 + }, + { + "epoch": 2.5649408472204556, + "ewc_loss": 0.008499872870743275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499872637912631e-05, + "grad_norm": 4.229163646697998, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8801182508468628, + "num_tokens": 769352101.0, + "step": 20163 + }, + { + "epoch": 
2.5650680574990457, + "ewc_loss": 0.008432761766016483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432762115262449e-05, + "grad_norm": 4.217087745666504, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8963785171508789, + "num_tokens": 769386570.0, + "step": 20164 + }, + { + "epoch": 2.5651952677776366, + "ewc_loss": 0.00845217052847147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.452170732198283e-05, + "grad_norm": 4.208885192871094, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8918883800506592, + "num_tokens": 769427167.0, + "step": 20165 + }, + { + "epoch": 2.5653224780562267, + "ewc_loss": 0.008442418649792671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442418766207993e-05, + "grad_norm": 4.253406524658203, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8731741905212402, + "num_tokens": 769465456.0, + "step": 20166 + }, + { + "epoch": 2.5654496883348177, + "ewc_loss": 0.008456114679574966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45611430122517e-05, + "grad_norm": 4.259486198425293, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8956823348999023, + "num_tokens": 769497135.0, + "step": 20167 + }, + { + "epoch": 2.5655768986134078, + "ewc_loss": 0.008435123600065708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.435123891104013e-05, + "grad_norm": 4.23480749130249, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8775568008422852, + "num_tokens": 769533706.0, + "step": 20168 + }, + { + "epoch": 2.5657041088919987, + "ewc_loss": 0.008426759392023087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.426759450230747e-05, + "grad_norm": 4.166068077087402, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8771138191223145, + "num_tokens": 769578698.0, + "step": 20169 + }, + { + "epoch": 2.565831319170589, + "ewc_loss": 0.008407682180404663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.407681889366359e-05, + "grad_norm": 4.179069519042969, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8947142362594604, + "num_tokens": 769619799.0, + "step": 20170 + }, + { + "epoch": 2.5659585294491793, + "ewc_loss": 0.008436132222414017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.436132338829339e-05, + "grad_norm": 4.233314037322998, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8828489780426025, + "num_tokens": 769655973.0, + "step": 20171 + }, + { + "epoch": 2.56608573972777, + "ewc_loss": 0.00846592616289854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.465925930067897e-05, + "grad_norm": 4.246835231781006, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.878239095211029, + "num_tokens": 769692333.0, + "step": 20172 + }, + { + "epoch": 2.5662129500063604, + "ewc_loss": 0.008439124561846256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439124212600291e-05, + "grad_norm": 4.165980339050293, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8862446546554565, + "num_tokens": 769731804.0, + "step": 20173 + }, + { + "epoch": 2.566340160284951, + "ewc_loss": 0.008424145169556141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.424145198659971e-05, + "grad_norm": 4.23726224899292, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8884105682373047, + "num_tokens": 769767409.0, + "step": 20174 + }, + { + "epoch": 2.5664673705635415, + "ewc_loss": 0.00847079511731863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470795000903308e-05, + "grad_norm": 4.1897358894348145, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.880098819732666, + "num_tokens": 769806468.0, + "step": 20175 + }, + { + "epoch": 2.566594580842132, + "ewc_loss": 0.008446728810667992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446729043498635e-05, + "grad_norm": 4.229036808013916, + "learning_rate": 1e-06, + "loss": 0.3046, + 
"mean_token_accuracy": 0.8907837867736816, + "num_tokens": 769842025.0, + "step": 20176 + }, + { + "epoch": 2.5667217911207225, + "ewc_loss": 0.008486038073897362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486038132105023e-05, + "grad_norm": 4.189179420471191, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8783224821090698, + "num_tokens": 769884125.0, + "step": 20177 + }, + { + "epoch": 2.566849001399313, + "ewc_loss": 0.008430222049355507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.430222078459337e-05, + "grad_norm": 4.257697105407715, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8739511966705322, + "num_tokens": 769920523.0, + "step": 20178 + }, + { + "epoch": 2.5669762116779036, + "ewc_loss": 0.008512976579368114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512976637575775e-05, + "grad_norm": 4.212994575500488, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8813176155090332, + "num_tokens": 769960141.0, + "step": 20179 + }, + { + "epoch": 2.567103421956494, + "ewc_loss": 0.008436612784862518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.436612552031875e-05, + "grad_norm": 4.2216081619262695, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8854042291641235, + "num_tokens": 769995281.0, + "step": 20180 + }, + { + "epoch": 2.5672306322350846, + "ewc_loss": 0.008463850244879723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46385009936057e-05, + "grad_norm": 4.253396511077881, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8871127367019653, + "num_tokens": 770028230.0, + "step": 20181 + }, + { + "epoch": 2.567357842513675, + "ewc_loss": 0.00847669318318367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476693619741127e-05, + "grad_norm": 4.164396286010742, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8886522054672241, + "num_tokens": 770068996.0, + "step": 20182 + }, + { + "epoch": 
2.5674850527922657, + "ewc_loss": 0.008423345163464546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.423345570918173e-05, + "grad_norm": 4.2244110107421875, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.873115599155426, + "num_tokens": 770107018.0, + "step": 20183 + }, + { + "epoch": 2.567612263070856, + "ewc_loss": 0.008500657044351101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50065698614344e-05, + "grad_norm": 4.256821632385254, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8820088505744934, + "num_tokens": 770140251.0, + "step": 20184 + }, + { + "epoch": 2.5677394733494467, + "ewc_loss": 0.008496064692735672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.496065129293129e-05, + "grad_norm": 4.233893871307373, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8761028051376343, + "num_tokens": 770182241.0, + "step": 20185 + }, + { + "epoch": 2.5678666836280373, + "ewc_loss": 0.008464516140520573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.464515849482268e-05, + "grad_norm": 4.205719947814941, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8948085308074951, + "num_tokens": 770223267.0, + "step": 20186 + }, + { + "epoch": 2.567993893906628, + "ewc_loss": 0.008454694412648678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454694761894643e-05, + "grad_norm": 4.2403717041015625, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.872127890586853, + "num_tokens": 770261856.0, + "step": 20187 + }, + { + "epoch": 2.5681211041852183, + "ewc_loss": 0.00851806253194809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51806253194809e-05, + "grad_norm": 4.217933654785156, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8718865513801575, + "num_tokens": 770299762.0, + "step": 20188 + }, + { + "epoch": 2.5682483144638084, + "ewc_loss": 0.008489170111715794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.489169704262167e-05, + "grad_norm": 4.251402854919434, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8771747350692749, + "num_tokens": 770333690.0, + "step": 20189 + }, + { + "epoch": 2.5683755247423994, + "ewc_loss": 0.008512916974723339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512916974723339e-05, + "grad_norm": 4.179479598999023, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.889047384262085, + "num_tokens": 770375755.0, + "step": 20190 + }, + { + "epoch": 2.5685027350209895, + "ewc_loss": 0.008477323688566685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477323717670515e-05, + "grad_norm": 4.164926528930664, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8912017345428467, + "num_tokens": 770417805.0, + "step": 20191 + }, + { + "epoch": 2.5686299452995804, + "ewc_loss": 0.008501432836055756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501432603225112e-05, + "grad_norm": 4.2389235496521, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8809959292411804, + "num_tokens": 770458183.0, + "step": 20192 + }, + { + "epoch": 2.5687571555781705, + "ewc_loss": 0.00854081753641367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540817361790687e-05, + "grad_norm": 4.277247428894043, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8856383562088013, + "num_tokens": 770488973.0, + "step": 20193 + }, + { + "epoch": 2.5688843658567615, + "ewc_loss": 0.008532753214240074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532753417966887e-05, + "grad_norm": 4.1636433601379395, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8771638870239258, + "num_tokens": 770529030.0, + "step": 20194 + }, + { + "epoch": 2.5690115761353516, + "ewc_loss": 0.00846256036311388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462560072075576e-05, + "grad_norm": 4.190666198730469, + "learning_rate": 1e-06, + "loss": 0.3727, + 
"mean_token_accuracy": 0.8715783357620239, + "num_tokens": 770574697.0, + "step": 20195 + }, + { + "epoch": 2.569138786413942, + "ewc_loss": 0.00851733610033989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517335663782433e-05, + "grad_norm": 4.178135871887207, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8872639536857605, + "num_tokens": 770615417.0, + "step": 20196 + }, + { + "epoch": 2.5692659966925326, + "ewc_loss": 0.008502255193889141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502254786435515e-05, + "grad_norm": 4.294212818145752, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8702265620231628, + "num_tokens": 770653228.0, + "step": 20197 + }, + { + "epoch": 2.569393206971123, + "ewc_loss": 0.008554649539291859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.554649684811011e-05, + "grad_norm": 4.172027111053467, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8962280750274658, + "num_tokens": 770693876.0, + "step": 20198 + }, + { + "epoch": 2.5695204172497137, + "ewc_loss": 0.008459764532744884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.459764649160206e-05, + "grad_norm": 4.268556594848633, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8811363577842712, + "num_tokens": 770728754.0, + "step": 20199 + }, + { + "epoch": 2.5696476275283042, + "ewc_loss": 0.00853237695991993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532376523362473e-05, + "grad_norm": 4.238370895385742, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8796216249465942, + "num_tokens": 770767961.0, + "step": 20200 + }, + { + "epoch": 2.5697748378068948, + "ewc_loss": 0.008476709946990013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476709626847878e-05, + "grad_norm": 4.225938320159912, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8573746085166931, + "num_tokens": 770808880.0, + "step": 20201 + }, + { + "epoch": 
2.5699020480854853, + "ewc_loss": 0.008495922200381756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49592179292813e-05, + "grad_norm": 4.238552570343018, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8773319721221924, + "num_tokens": 770846483.0, + "step": 20202 + }, + { + "epoch": 2.570029258364076, + "ewc_loss": 0.008513727225363255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513727516401559e-05, + "grad_norm": 4.232547283172607, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8618045449256897, + "num_tokens": 770884234.0, + "step": 20203 + }, + { + "epoch": 2.5701564686426663, + "ewc_loss": 0.008499914780259132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499914838466793e-05, + "grad_norm": 4.262890338897705, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.86812824010849, + "num_tokens": 770918785.0, + "step": 20204 + }, + { + "epoch": 2.570283678921257, + "ewc_loss": 0.008535383269190788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535383676644415e-05, + "grad_norm": 4.175930976867676, + "learning_rate": 1e-06, + "loss": 0.2797, + "mean_token_accuracy": 0.9011914730072021, + "num_tokens": 770956889.0, + "step": 20205 + }, + { + "epoch": 2.5704108891998474, + "ewc_loss": 0.00847038347274065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470383181702346e-05, + "grad_norm": 4.2726731300354, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.868445634841919, + "num_tokens": 770994937.0, + "step": 20206 + }, + { + "epoch": 2.570538099478438, + "ewc_loss": 0.008554359897971153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.554360101697966e-05, + "grad_norm": 4.231585502624512, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8909896612167358, + "num_tokens": 771031451.0, + "step": 20207 + }, + { + "epoch": 2.5706653097570284, + "ewc_loss": 0.008485021069645882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48502095323056e-05, + 
"grad_norm": 4.254688739776611, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.889606773853302, + "num_tokens": 771061575.0, + "step": 20208 + }, + { + "epoch": 2.570792520035619, + "ewc_loss": 0.008529835380613804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5298357589636e-05, + "grad_norm": 4.201314449310303, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8807240724563599, + "num_tokens": 771103366.0, + "step": 20209 + }, + { + "epoch": 2.5709197303142095, + "ewc_loss": 0.008502471260726452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502471609972417e-05, + "grad_norm": 4.239418983459473, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8855537176132202, + "num_tokens": 771142178.0, + "step": 20210 + }, + { + "epoch": 2.5710469405928, + "ewc_loss": 0.008531092666089535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531092316843569e-05, + "grad_norm": 4.2563323974609375, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8822070956230164, + "num_tokens": 771179296.0, + "step": 20211 + }, + { + "epoch": 2.5711741508713906, + "ewc_loss": 0.008544370532035828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544370939489454e-05, + "grad_norm": 4.211601734161377, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8752779960632324, + "num_tokens": 771219084.0, + "step": 20212 + }, + { + "epoch": 2.571301361149981, + "ewc_loss": 0.008495284244418144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495284419041127e-05, + "grad_norm": 4.249426364898682, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.888085126876831, + "num_tokens": 771250609.0, + "step": 20213 + }, + { + "epoch": 2.571428571428571, + "ewc_loss": 0.008551498875021935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551498467568308e-05, + "grad_norm": 4.247773170471191, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8817664980888367, + 
"num_tokens": 771286191.0, + "step": 20214 + }, + { + "epoch": 2.571555781707162, + "ewc_loss": 0.00855169352144003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551693463232368e-05, + "grad_norm": 4.244429588317871, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8801476955413818, + "num_tokens": 771320898.0, + "step": 20215 + }, + { + "epoch": 2.5716829919857522, + "ewc_loss": 0.008544002659618855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544002776034176e-05, + "grad_norm": 4.311390399932861, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8745555877685547, + "num_tokens": 771353977.0, + "step": 20216 + }, + { + "epoch": 2.571810202264343, + "ewc_loss": 0.008595149032771587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595149120083079e-05, + "grad_norm": 4.219549179077148, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8858704566955566, + "num_tokens": 771391960.0, + "step": 20217 + }, + { + "epoch": 2.5719374125429333, + "ewc_loss": 0.008528747595846653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52874800330028e-05, + "grad_norm": 4.256937503814697, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8745384216308594, + "num_tokens": 771427764.0, + "step": 20218 + }, + { + "epoch": 2.572064622821524, + "ewc_loss": 0.008583735674619675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58373605296947e-05, + "grad_norm": 4.228092670440674, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8826030492782593, + "num_tokens": 771469706.0, + "step": 20219 + }, + { + "epoch": 2.5721918331001143, + "ewc_loss": 0.008531300351023674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531300409231335e-05, + "grad_norm": 4.1915740966796875, + "learning_rate": 1e-06, + "loss": 0.279, + "mean_token_accuracy": 0.9010228514671326, + "num_tokens": 771504296.0, + "step": 20220 + }, + { + "epoch": 2.572319043378705, + "ewc_loss": 0.008538607507944107, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53860765346326e-05, + "grad_norm": 4.211589336395264, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8819783926010132, + "num_tokens": 771544496.0, + "step": 20221 + }, + { + "epoch": 2.5724462536572954, + "ewc_loss": 0.008541509509086609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541509305359796e-05, + "grad_norm": 4.197234630584717, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8891230821609497, + "num_tokens": 771584322.0, + "step": 20222 + }, + { + "epoch": 2.572573463935886, + "ewc_loss": 0.008521568030118942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521568088326603e-05, + "grad_norm": 4.192343711853027, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8903648257255554, + "num_tokens": 771621424.0, + "step": 20223 + }, + { + "epoch": 2.5727006742144765, + "ewc_loss": 0.008525463752448559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525463636033237e-05, + "grad_norm": 4.234038352966309, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8839235305786133, + "num_tokens": 771658173.0, + "step": 20224 + }, + { + "epoch": 2.572827884493067, + "ewc_loss": 0.008548498153686523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54849859024398e-05, + "grad_norm": 4.230932235717773, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8754860162734985, + "num_tokens": 771698182.0, + "step": 20225 + }, + { + "epoch": 2.5729550947716575, + "ewc_loss": 0.008542724885046482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542725117877126e-05, + "grad_norm": 4.326888561248779, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8689736127853394, + "num_tokens": 771731559.0, + "step": 20226 + }, + { + "epoch": 2.573082305050248, + "ewc_loss": 0.008584683761000633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.584683382650837e-05, + "grad_norm": 4.174647331237793, + "learning_rate": 
1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8765971064567566, + "num_tokens": 771774362.0, + "step": 20227 + }, + { + "epoch": 2.5732095153288386, + "ewc_loss": 0.008463161997497082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.463161793770269e-05, + "grad_norm": 4.264904022216797, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8720095753669739, + "num_tokens": 771810540.0, + "step": 20228 + }, + { + "epoch": 2.573336725607429, + "ewc_loss": 0.0085474057123065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54740574141033e-05, + "grad_norm": 4.1980414390563965, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8858522772789001, + "num_tokens": 771846221.0, + "step": 20229 + }, + { + "epoch": 2.5734639358860196, + "ewc_loss": 0.008506196551024914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.506196172675118e-05, + "grad_norm": 4.261545181274414, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8922007083892822, + "num_tokens": 771882268.0, + "step": 20230 + }, + { + "epoch": 2.57359114616461, + "ewc_loss": 0.008545075543224812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545075979782268e-05, + "grad_norm": 4.289156436920166, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8810611963272095, + "num_tokens": 771913176.0, + "step": 20231 + }, + { + "epoch": 2.5737183564432007, + "ewc_loss": 0.008553124964237213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553124644095078e-05, + "grad_norm": 4.260648727416992, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.876560628414154, + "num_tokens": 771948163.0, + "step": 20232 + }, + { + "epoch": 2.573845566721791, + "ewc_loss": 0.008529814891517162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529814658686519e-05, + "grad_norm": 4.2150559425354, + "learning_rate": 1e-06, + "loss": 0.3004, + "mean_token_accuracy": 0.8930802345275879, + "num_tokens": 771988296.0, + "step": 20233 + }, + { 
+ "epoch": 2.5739727770003817, + "ewc_loss": 0.00852547213435173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525472367182374e-05, + "grad_norm": 4.224105358123779, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8895013928413391, + "num_tokens": 772024330.0, + "step": 20234 + }, + { + "epoch": 2.5740999872789723, + "ewc_loss": 0.008549096994102001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549097401555628e-05, + "grad_norm": 4.250360488891602, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8711927533149719, + "num_tokens": 772062291.0, + "step": 20235 + }, + { + "epoch": 2.574227197557563, + "ewc_loss": 0.00855077151209116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.550771599402651e-05, + "grad_norm": 4.150346755981445, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8838993310928345, + "num_tokens": 772110239.0, + "step": 20236 + }, + { + "epoch": 2.5743544078361533, + "ewc_loss": 0.008480369113385677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480369433527812e-05, + "grad_norm": 4.220768451690674, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8767000436782837, + "num_tokens": 772153201.0, + "step": 20237 + }, + { + "epoch": 2.574481618114744, + "ewc_loss": 0.008548391982913017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548392361262813e-05, + "grad_norm": 4.243418216705322, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8837506175041199, + "num_tokens": 772186816.0, + "step": 20238 + }, + { + "epoch": 2.574608828393334, + "ewc_loss": 0.008541887626051903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541887655155733e-05, + "grad_norm": 4.210803031921387, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8921067118644714, + "num_tokens": 772224238.0, + "step": 20239 + }, + { + "epoch": 2.574736038671925, + "ewc_loss": 0.008506599813699722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.506599988322705e-05, + "grad_norm": 4.177769660949707, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.890192985534668, + "num_tokens": 772263505.0, + "step": 20240 + }, + { + "epoch": 2.574863248950515, + "ewc_loss": 0.008509189821779728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509190229233354e-05, + "grad_norm": 4.213295936584473, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.890691876411438, + "num_tokens": 772303205.0, + "step": 20241 + }, + { + "epoch": 2.574990459229106, + "ewc_loss": 0.008531918749213219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53191886562854e-05, + "grad_norm": 4.199173450469971, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8860102295875549, + "num_tokens": 772344495.0, + "step": 20242 + }, + { + "epoch": 2.575117669507696, + "ewc_loss": 0.008495374582707882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495374640915543e-05, + "grad_norm": 4.292861461639404, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.877490758895874, + "num_tokens": 772380296.0, + "step": 20243 + }, + { + "epoch": 2.5752448797862866, + "ewc_loss": 0.008577127009630203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577126573072746e-05, + "grad_norm": 4.203062534332275, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8943972587585449, + "num_tokens": 772423933.0, + "step": 20244 + }, + { + "epoch": 2.575372090064877, + "ewc_loss": 0.008462470956146717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462471305392683e-05, + "grad_norm": 4.188926696777344, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8697518110275269, + "num_tokens": 772464514.0, + "step": 20245 + }, + { + "epoch": 2.5754993003434676, + "ewc_loss": 0.008521230891346931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521231211489066e-05, + "grad_norm": 4.217281341552734, + "learning_rate": 1e-06, + "loss": 0.2813, + 
"mean_token_accuracy": 0.8983617424964905, + "num_tokens": 772498501.0, + "step": 20246 + }, + { + "epoch": 2.575626510622058, + "ewc_loss": 0.008536084555089474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53608435136266e-05, + "grad_norm": 4.223536968231201, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8782908916473389, + "num_tokens": 772538072.0, + "step": 20247 + }, + { + "epoch": 2.5757537209006487, + "ewc_loss": 0.008518688380718231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51868826430291e-05, + "grad_norm": 4.233637809753418, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8821702599525452, + "num_tokens": 772577434.0, + "step": 20248 + }, + { + "epoch": 2.575880931179239, + "ewc_loss": 0.008521594107151031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521594281774014e-05, + "grad_norm": 4.18809700012207, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8942264318466187, + "num_tokens": 772612917.0, + "step": 20249 + }, + { + "epoch": 2.5760081414578297, + "ewc_loss": 0.008487624116241932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.487624290864915e-05, + "grad_norm": 4.230121612548828, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8933095932006836, + "num_tokens": 772652939.0, + "step": 20250 + }, + { + "epoch": 2.5761353517364203, + "ewc_loss": 0.008530069142580032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530069317203015e-05, + "grad_norm": 4.252899646759033, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8883501291275024, + "num_tokens": 772688018.0, + "step": 20251 + }, + { + "epoch": 2.576262562015011, + "ewc_loss": 0.008516648784279823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516648813383654e-05, + "grad_norm": 4.225939750671387, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8727484345436096, + "num_tokens": 772728071.0, + "step": 20252 + }, + { + "epoch": 
2.5763897722936013, + "ewc_loss": 0.008471425622701645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471425826428458e-05, + "grad_norm": 4.24360466003418, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8850193023681641, + "num_tokens": 772762730.0, + "step": 20253 + }, + { + "epoch": 2.576516982572192, + "ewc_loss": 0.008514514192938805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514514047419652e-05, + "grad_norm": 4.214273929595947, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8861315846443176, + "num_tokens": 772799125.0, + "step": 20254 + }, + { + "epoch": 2.5766441928507824, + "ewc_loss": 0.00848165899515152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481658733217046e-05, + "grad_norm": 4.1868696212768555, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8948459625244141, + "num_tokens": 772838657.0, + "step": 20255 + }, + { + "epoch": 2.576771403129373, + "ewc_loss": 0.008474974893033504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474975038552657e-05, + "grad_norm": 4.2350263595581055, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8908138871192932, + "num_tokens": 772876470.0, + "step": 20256 + }, + { + "epoch": 2.5768986134079634, + "ewc_loss": 0.008508764207363129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.508763858117163e-05, + "grad_norm": 4.286163330078125, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8756519556045532, + "num_tokens": 772910369.0, + "step": 20257 + }, + { + "epoch": 2.577025823686554, + "ewc_loss": 0.008501218631863594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501218690071255e-05, + "grad_norm": 4.188933849334717, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8885622620582581, + "num_tokens": 772948129.0, + "step": 20258 + }, + { + "epoch": 2.5771530339651445, + "ewc_loss": 0.008436613716185093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.436614007223397e-05, + "grad_norm": 4.1627421379089355, + "learning_rate": 1e-06, + "loss": 0.2649, + "mean_token_accuracy": 0.9081280827522278, + "num_tokens": 772985359.0, + "step": 20259 + }, + { + "epoch": 2.577280244243735, + "ewc_loss": 0.008475788868963718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.475788490613922e-05, + "grad_norm": 4.285959720611572, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8856209516525269, + "num_tokens": 773019402.0, + "step": 20260 + }, + { + "epoch": 2.5774074545223256, + "ewc_loss": 0.00854824110865593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548241021344438e-05, + "grad_norm": 4.271997451782227, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8834383487701416, + "num_tokens": 773059176.0, + "step": 20261 + }, + { + "epoch": 2.5775346648009156, + "ewc_loss": 0.008503399789333344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.503400022163987e-05, + "grad_norm": 4.2312397956848145, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.886995792388916, + "num_tokens": 773094274.0, + "step": 20262 + }, + { + "epoch": 2.5776618750795066, + "ewc_loss": 0.008467775769531727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467775478493422e-05, + "grad_norm": 4.232359409332275, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8811639547348022, + "num_tokens": 773135679.0, + "step": 20263 + }, + { + "epoch": 2.5777890853580967, + "ewc_loss": 0.008511140011250973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511140185873955e-05, + "grad_norm": 4.224798679351807, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8786323070526123, + "num_tokens": 773174270.0, + "step": 20264 + }, + { + "epoch": 2.5779162956366877, + "ewc_loss": 0.008485141210258007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485141006531194e-05, + "grad_norm": 4.23892068862915, + "learning_rate": 1e-06, + "loss": 0.3334, + 
"mean_token_accuracy": 0.8836249709129333, + "num_tokens": 773213582.0, + "step": 20265 + }, + { + "epoch": 2.5780435059152778, + "ewc_loss": 0.008507389575242996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.507389429723844e-05, + "grad_norm": 4.314321041107178, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8792398571968079, + "num_tokens": 773244980.0, + "step": 20266 + }, + { + "epoch": 2.5781707161938687, + "ewc_loss": 0.008547987788915634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547987818019465e-05, + "grad_norm": 4.182684898376465, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8960084915161133, + "num_tokens": 773287395.0, + "step": 20267 + }, + { + "epoch": 2.578297926472459, + "ewc_loss": 0.00845483597368002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454835915472358e-05, + "grad_norm": 4.256506443023682, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8797425627708435, + "num_tokens": 773324593.0, + "step": 20268 + }, + { + "epoch": 2.5784251367510493, + "ewc_loss": 0.008539548143744469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539547707187012e-05, + "grad_norm": 4.223239421844482, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8791148662567139, + "num_tokens": 773361802.0, + "step": 20269 + }, + { + "epoch": 2.57855234702964, + "ewc_loss": 0.008488494902849197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.488495222991332e-05, + "grad_norm": 4.2408447265625, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.886929452419281, + "num_tokens": 773401348.0, + "step": 20270 + }, + { + "epoch": 2.5786795573082304, + "ewc_loss": 0.008503584191203117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.503584103891626e-05, + "grad_norm": 4.213870525360107, + "learning_rate": 1e-06, + "loss": 0.2965, + "mean_token_accuracy": 0.8980189561843872, + "num_tokens": 773439040.0, + "step": 20271 + }, + { + "epoch": 
2.578806767586821, + "ewc_loss": 0.008487080223858356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.487080049235374e-05, + "grad_norm": 4.289347171783447, + "learning_rate": 1e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.89445561170578, + "num_tokens": 773471382.0, + "step": 20272 + }, + { + "epoch": 2.5789339778654115, + "ewc_loss": 0.008524766191840172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524766599293798e-05, + "grad_norm": 4.237325668334961, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.872600257396698, + "num_tokens": 773508924.0, + "step": 20273 + }, + { + "epoch": 2.579061188144002, + "ewc_loss": 0.008478643372654915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.478643576381728e-05, + "grad_norm": 4.1537909507751465, + "learning_rate": 1e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.8965436220169067, + "num_tokens": 773550219.0, + "step": 20274 + }, + { + "epoch": 2.5791883984225925, + "ewc_loss": 0.008468739688396454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.468739542877302e-05, + "grad_norm": 4.23151159286499, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8788666129112244, + "num_tokens": 773589363.0, + "step": 20275 + }, + { + "epoch": 2.579315608701183, + "ewc_loss": 0.008512559346854687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512559725204483e-05, + "grad_norm": 4.203610897064209, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8822864890098572, + "num_tokens": 773630289.0, + "step": 20276 + }, + { + "epoch": 2.5794428189797736, + "ewc_loss": 0.008471034467220306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471034379908815e-05, + "grad_norm": 4.219815731048584, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8833599090576172, + "num_tokens": 773668660.0, + "step": 20277 + }, + { + "epoch": 2.579570029258364, + "ewc_loss": 0.008491678163409233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.491678454447538e-05, + "grad_norm": 4.230128765106201, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8804178833961487, + "num_tokens": 773704318.0, + "step": 20278 + }, + { + "epoch": 2.5796972395369546, + "ewc_loss": 0.008489489555358887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489489846397191e-05, + "grad_norm": 4.2476959228515625, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8680694699287415, + "num_tokens": 773745821.0, + "step": 20279 + }, + { + "epoch": 2.579824449815545, + "ewc_loss": 0.008503010496497154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.503010030835867e-05, + "grad_norm": 4.200488090515137, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8908420205116272, + "num_tokens": 773782880.0, + "step": 20280 + }, + { + "epoch": 2.5799516600941357, + "ewc_loss": 0.008458626456558704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.458626689389348e-05, + "grad_norm": 4.2348198890686035, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8703164458274841, + "num_tokens": 773824880.0, + "step": 20281 + }, + { + "epoch": 2.580078870372726, + "ewc_loss": 0.008495262823998928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495262591168284e-05, + "grad_norm": 4.202702045440674, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8866893649101257, + "num_tokens": 773863210.0, + "step": 20282 + }, + { + "epoch": 2.5802060806513167, + "ewc_loss": 0.008453874848783016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.453874761471525e-05, + "grad_norm": 4.222810745239258, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8884195685386658, + "num_tokens": 773903432.0, + "step": 20283 + }, + { + "epoch": 2.5803332909299073, + "ewc_loss": 0.008474228903651237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474229252897203e-05, + "grad_norm": 4.230644226074219, + "learning_rate": 1e-06, + "loss": 0.3136, + 
"mean_token_accuracy": 0.8905438780784607, + "num_tokens": 773942424.0, + "step": 20284 + }, + { + "epoch": 2.580460501208498, + "ewc_loss": 0.00846412219107151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.464122220175341e-05, + "grad_norm": 4.257772445678711, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8819867372512817, + "num_tokens": 773976598.0, + "step": 20285 + }, + { + "epoch": 2.5805877114870883, + "ewc_loss": 0.008463157340884209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4631574281957e-05, + "grad_norm": 4.241657257080078, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8765187859535217, + "num_tokens": 774016696.0, + "step": 20286 + }, + { + "epoch": 2.5807149217656784, + "ewc_loss": 0.0084372004494071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4372004494071e-05, + "grad_norm": 4.165555953979492, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8770298361778259, + "num_tokens": 774059046.0, + "step": 20287 + }, + { + "epoch": 2.5808421320442694, + "ewc_loss": 0.008410505019128323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.410504960920662e-05, + "grad_norm": 4.257601261138916, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8798972368240356, + "num_tokens": 774095322.0, + "step": 20288 + }, + { + "epoch": 2.5809693423228595, + "ewc_loss": 0.008474133908748627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474133937852457e-05, + "grad_norm": 4.202317714691162, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8725495338439941, + "num_tokens": 774134506.0, + "step": 20289 + }, + { + "epoch": 2.5810965526014504, + "ewc_loss": 0.008422993123531342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.422992686973885e-05, + "grad_norm": 4.169920444488525, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8970491886138916, + "num_tokens": 774176997.0, + "step": 20290 + }, + { + "epoch": 
2.5812237628800405, + "ewc_loss": 0.008418681100010872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.41868095449172e-05, + "grad_norm": 4.304148197174072, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8788713216781616, + "num_tokens": 774213900.0, + "step": 20291 + }, + { + "epoch": 2.5813509731586315, + "ewc_loss": 0.008516977541148663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516977686667815e-05, + "grad_norm": 4.216660976409912, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8742773532867432, + "num_tokens": 774259687.0, + "step": 20292 + }, + { + "epoch": 2.5814781834372216, + "ewc_loss": 0.008380630053579807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.380629878956825e-05, + "grad_norm": 4.216495990753174, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8862582445144653, + "num_tokens": 774297200.0, + "step": 20293 + }, + { + "epoch": 2.581605393715812, + "ewc_loss": 0.008454586379230022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454586350126192e-05, + "grad_norm": 4.231932640075684, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8940222859382629, + "num_tokens": 774336642.0, + "step": 20294 + }, + { + "epoch": 2.5817326039944026, + "ewc_loss": 0.008448443375527859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448443259112537e-05, + "grad_norm": 4.249118328094482, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8688976168632507, + "num_tokens": 774376762.0, + "step": 20295 + }, + { + "epoch": 2.581859814272993, + "ewc_loss": 0.008421003818511963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421003440162167e-05, + "grad_norm": 4.1736249923706055, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8910537958145142, + "num_tokens": 774421360.0, + "step": 20296 + }, + { + "epoch": 2.5819870245515837, + "ewc_loss": 0.008403005078434944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.403004903811961e-05, + "grad_norm": 4.2097368240356445, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8890641927719116, + "num_tokens": 774460850.0, + "step": 20297 + }, + { + "epoch": 2.582114234830174, + "ewc_loss": 0.008446086198091507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446085848845541e-05, + "grad_norm": 4.236851692199707, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8802855014801025, + "num_tokens": 774497542.0, + "step": 20298 + }, + { + "epoch": 2.5822414451087647, + "ewc_loss": 0.008414657786488533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.414657349931076e-05, + "grad_norm": 4.201144218444824, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8759346008300781, + "num_tokens": 774538164.0, + "step": 20299 + }, + { + "epoch": 2.5823686553873553, + "ewc_loss": 0.00839496124535799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.394961332669482e-05, + "grad_norm": 4.2776198387146, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.877031147480011, + "num_tokens": 774576118.0, + "step": 20300 + }, + { + "epoch": 2.582495865665946, + "ewc_loss": 0.008459527976810932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.459528180537745e-05, + "grad_norm": 4.1680779457092285, + "learning_rate": 1e-06, + "loss": 0.2828, + "mean_token_accuracy": 0.9025028944015503, + "num_tokens": 774617506.0, + "step": 20301 + }, + { + "epoch": 2.5826230759445363, + "ewc_loss": 0.008354698307812214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.354698366019875e-05, + "grad_norm": 4.339457988739014, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8605694770812988, + "num_tokens": 774653720.0, + "step": 20302 + }, + { + "epoch": 2.582750286223127, + "ewc_loss": 0.008511468768119812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511468331562355e-05, + "grad_norm": 4.206548690795898, + "learning_rate": 1e-06, + "loss": 0.2767, + 
"mean_token_accuracy": 0.9020766615867615, + "num_tokens": 774691056.0, + "step": 20303 + }, + { + "epoch": 2.5828774965017174, + "ewc_loss": 0.008385753259062767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.385753608308733e-05, + "grad_norm": 4.252727031707764, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8783767223358154, + "num_tokens": 774726514.0, + "step": 20304 + }, + { + "epoch": 2.583004706780308, + "ewc_loss": 0.008436824195086956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.436824282398447e-05, + "grad_norm": 4.275292873382568, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8751993179321289, + "num_tokens": 774760831.0, + "step": 20305 + }, + { + "epoch": 2.5831319170588984, + "ewc_loss": 0.0084349000826478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.434899791609496e-05, + "grad_norm": 4.195457458496094, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.891631007194519, + "num_tokens": 774799399.0, + "step": 20306 + }, + { + "epoch": 2.583259127337489, + "ewc_loss": 0.008404402993619442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.404403342865407e-05, + "grad_norm": 4.259655952453613, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8749277591705322, + "num_tokens": 774839910.0, + "step": 20307 + }, + { + "epoch": 2.5833863376160795, + "ewc_loss": 0.00846320390701294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.463203994324431e-05, + "grad_norm": 4.250615119934082, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.882826566696167, + "num_tokens": 774875694.0, + "step": 20308 + }, + { + "epoch": 2.58351354789467, + "ewc_loss": 0.00845396239310503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.453962800558656e-05, + "grad_norm": 4.331092357635498, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.884964644908905, + "num_tokens": 774908020.0, + "step": 20309 + }, + { + "epoch": 2.5836407581732606, 
+ "ewc_loss": 0.008481652475893497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481652912450954e-05, + "grad_norm": 4.171205520629883, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8743569850921631, + "num_tokens": 774946716.0, + "step": 20310 + }, + { + "epoch": 2.583767968451851, + "ewc_loss": 0.008397458121180534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.39745844132267e-05, + "grad_norm": 4.27428674697876, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8773418664932251, + "num_tokens": 774981732.0, + "step": 20311 + }, + { + "epoch": 2.583895178730441, + "ewc_loss": 0.008515316061675549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515315857948735e-05, + "grad_norm": 4.203094959259033, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8883214592933655, + "num_tokens": 775015280.0, + "step": 20312 + }, + { + "epoch": 2.584022389009032, + "ewc_loss": 0.008451193571090698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451193571090698e-05, + "grad_norm": 4.219328880310059, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8900414705276489, + "num_tokens": 775054518.0, + "step": 20313 + }, + { + "epoch": 2.5841495992876222, + "ewc_loss": 0.008489325642585754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489325409755111e-05, + "grad_norm": 4.187518119812012, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8879018425941467, + "num_tokens": 775091548.0, + "step": 20314 + }, + { + "epoch": 2.584276809566213, + "ewc_loss": 0.008483055047690868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483054989483207e-05, + "grad_norm": 4.233831405639648, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8712455630302429, + "num_tokens": 775130916.0, + "step": 20315 + }, + { + "epoch": 2.5844040198448033, + "ewc_loss": 0.008513052016496658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513051579939201e-05, + "grad_norm": 
4.222959995269775, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8952544927597046, + "num_tokens": 775165875.0, + "step": 20316 + }, + { + "epoch": 2.584531230123394, + "ewc_loss": 0.008501428179442883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501428237650543e-05, + "grad_norm": 4.200437545776367, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8694103360176086, + "num_tokens": 775211463.0, + "step": 20317 + }, + { + "epoch": 2.5846584404019843, + "ewc_loss": 0.008485655300319195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485655416734517e-05, + "grad_norm": 4.250528812408447, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8899258375167847, + "num_tokens": 775248338.0, + "step": 20318 + }, + { + "epoch": 2.584785650680575, + "ewc_loss": 0.008525258861482143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525258454028517e-05, + "grad_norm": 4.283855438232422, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.872925341129303, + "num_tokens": 775285707.0, + "step": 20319 + }, + { + "epoch": 2.5849128609591654, + "ewc_loss": 0.008522610180079937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522610005456954e-05, + "grad_norm": 4.2366719245910645, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8910790085792542, + "num_tokens": 775322682.0, + "step": 20320 + }, + { + "epoch": 2.585040071237756, + "ewc_loss": 0.00849184487015009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491845073876902e-05, + "grad_norm": 4.187677383422852, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8917692303657532, + "num_tokens": 775358274.0, + "step": 20321 + }, + { + "epoch": 2.5851672815163464, + "ewc_loss": 0.00849592313170433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495923248119652e-05, + "grad_norm": 4.267971038818359, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8802825808525085, + "num_tokens": 
775394472.0, + "step": 20322 + }, + { + "epoch": 2.585294491794937, + "ewc_loss": 0.008544481359422207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54448153404519e-05, + "grad_norm": 4.223460674285889, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8920539617538452, + "num_tokens": 775433647.0, + "step": 20323 + }, + { + "epoch": 2.5854217020735275, + "ewc_loss": 0.008480655029416084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480654651066288e-05, + "grad_norm": 4.272271633148193, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8744056224822998, + "num_tokens": 775469159.0, + "step": 20324 + }, + { + "epoch": 2.585548912352118, + "ewc_loss": 0.008541690185666084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541689749108627e-05, + "grad_norm": 4.2255167961120605, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8910055160522461, + "num_tokens": 775507329.0, + "step": 20325 + }, + { + "epoch": 2.5856761226307086, + "ewc_loss": 0.008492778986692429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492778579238802e-05, + "grad_norm": 4.309665679931641, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8794292211532593, + "num_tokens": 775544638.0, + "step": 20326 + }, + { + "epoch": 2.585803332909299, + "ewc_loss": 0.00854556541889906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545565651729703e-05, + "grad_norm": 4.15464973449707, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8839872479438782, + "num_tokens": 775584918.0, + "step": 20327 + }, + { + "epoch": 2.5859305431878896, + "ewc_loss": 0.008464782498776913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.464782877126709e-05, + "grad_norm": 4.212801933288574, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8766475915908813, + "num_tokens": 775624214.0, + "step": 20328 + }, + { + "epoch": 2.58605775346648, + "ewc_loss": 0.00854012742638588, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 8.540127601008862e-05, + "grad_norm": 4.208659648895264, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8709287643432617, + "num_tokens": 775668372.0, + "step": 20329 + }, + { + "epoch": 2.5861849637450707, + "ewc_loss": 0.008522089570760727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52208977448754e-05, + "grad_norm": 4.31899881362915, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8571048974990845, + "num_tokens": 775700578.0, + "step": 20330 + }, + { + "epoch": 2.586312174023661, + "ewc_loss": 0.008580132387578487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580132271163166e-05, + "grad_norm": 4.168724060058594, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8887811899185181, + "num_tokens": 775746980.0, + "step": 20331 + }, + { + "epoch": 2.5864393843022517, + "ewc_loss": 0.008442402817308903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442402759101242e-05, + "grad_norm": 4.2621049880981445, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8778254985809326, + "num_tokens": 775780762.0, + "step": 20332 + }, + { + "epoch": 2.5865665945808423, + "ewc_loss": 0.00857914611697197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579145651310682e-05, + "grad_norm": 4.249698638916016, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8892694711685181, + "num_tokens": 775815591.0, + "step": 20333 + }, + { + "epoch": 2.586693804859433, + "ewc_loss": 0.008515770547091961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515770605299622e-05, + "grad_norm": 4.194636821746826, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8735496997833252, + "num_tokens": 775856414.0, + "step": 20334 + }, + { + "epoch": 2.5868210151380233, + "ewc_loss": 0.008504653349518776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.504653669660911e-05, + "grad_norm": 4.1900248527526855, + "learning_rate": 1e-06, + 
"loss": 0.3309, + "mean_token_accuracy": 0.8832029104232788, + "num_tokens": 775896939.0, + "step": 20335 + }, + { + "epoch": 2.586948225416614, + "ewc_loss": 0.00851613748818636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516137313563377e-05, + "grad_norm": 4.187612533569336, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8772845268249512, + "num_tokens": 775941778.0, + "step": 20336 + }, + { + "epoch": 2.587075435695204, + "ewc_loss": 0.00852216873317957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522168354829773e-05, + "grad_norm": 4.249466419219971, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8828603029251099, + "num_tokens": 775979346.0, + "step": 20337 + }, + { + "epoch": 2.587202645973795, + "ewc_loss": 0.008536291308701038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536290988558903e-05, + "grad_norm": 4.280570030212402, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8806958198547363, + "num_tokens": 776013793.0, + "step": 20338 + }, + { + "epoch": 2.587329856252385, + "ewc_loss": 0.008549579419195652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549579069949687e-05, + "grad_norm": 4.173629283905029, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8943265676498413, + "num_tokens": 776058513.0, + "step": 20339 + }, + { + "epoch": 2.587457066530976, + "ewc_loss": 0.00847609806805849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476097718812525e-05, + "grad_norm": 4.182684421539307, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8830852508544922, + "num_tokens": 776102790.0, + "step": 20340 + }, + { + "epoch": 2.587584276809566, + "ewc_loss": 0.008533833548426628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533833170076832e-05, + "grad_norm": 4.27982759475708, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.8979417085647583, + "num_tokens": 776133898.0, + "step": 20341 + }, + { + "epoch": 
2.5877114870881566, + "ewc_loss": 0.00855852011591196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558520494261757e-05, + "grad_norm": 4.274998664855957, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8804425001144409, + "num_tokens": 776167590.0, + "step": 20342 + }, + { + "epoch": 2.587838697366747, + "ewc_loss": 0.008495282381772995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495282236253843e-05, + "grad_norm": 4.194432258605957, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8769817352294922, + "num_tokens": 776207966.0, + "step": 20343 + }, + { + "epoch": 2.5879659076453376, + "ewc_loss": 0.008489524945616722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48952477099374e-05, + "grad_norm": 4.240683078765869, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8835458755493164, + "num_tokens": 776245601.0, + "step": 20344 + }, + { + "epoch": 2.588093117923928, + "ewc_loss": 0.008527659811079502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.527660247636959e-05, + "grad_norm": 4.212271690368652, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.890905499458313, + "num_tokens": 776285377.0, + "step": 20345 + }, + { + "epoch": 2.5882203282025187, + "ewc_loss": 0.008474218659102917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474218338960782e-05, + "grad_norm": 4.197187900543213, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8748126029968262, + "num_tokens": 776328116.0, + "step": 20346 + }, + { + "epoch": 2.588347538481109, + "ewc_loss": 0.008502999320626259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502999116899446e-05, + "grad_norm": 4.240237236022949, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.868284285068512, + "num_tokens": 776369008.0, + "step": 20347 + }, + { + "epoch": 2.5884747487596997, + "ewc_loss": 0.008497808128595352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.497807721141726e-05, + "grad_norm": 4.214247703552246, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8738599419593811, + "num_tokens": 776410151.0, + "step": 20348 + }, + { + "epoch": 2.5886019590382903, + "ewc_loss": 0.008489716798067093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489716856274754e-05, + "grad_norm": 4.211794376373291, + "learning_rate": 1e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.9059531688690186, + "num_tokens": 776445944.0, + "step": 20349 + }, + { + "epoch": 2.588729169316881, + "ewc_loss": 0.008510565385222435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510565385222435e-05, + "grad_norm": 4.258590221405029, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8870065212249756, + "num_tokens": 776485388.0, + "step": 20350 + }, + { + "epoch": 2.5888563795954713, + "ewc_loss": 0.008517159149050713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51715958560817e-05, + "grad_norm": 4.218630313873291, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8656612634658813, + "num_tokens": 776529135.0, + "step": 20351 + }, + { + "epoch": 2.588983589874062, + "ewc_loss": 0.008474962785840034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474962669424713e-05, + "grad_norm": 4.232434272766113, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.875363826751709, + "num_tokens": 776564310.0, + "step": 20352 + }, + { + "epoch": 2.5891108001526524, + "ewc_loss": 0.008489438332617283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48943818709813e-05, + "grad_norm": 4.148067474365234, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8945900797843933, + "num_tokens": 776609004.0, + "step": 20353 + }, + { + "epoch": 2.589238010431243, + "ewc_loss": 0.00844465009868145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.444650302408263e-05, + "grad_norm": 4.25112247467041, + "learning_rate": 1e-06, + "loss": 0.321, + 
"mean_token_accuracy": 0.8886755704879761, + "num_tokens": 776645796.0, + "step": 20354 + }, + { + "epoch": 2.5893652207098334, + "ewc_loss": 0.008522900752723217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522901043761522e-05, + "grad_norm": 4.225783348083496, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8736072778701782, + "num_tokens": 776684248.0, + "step": 20355 + }, + { + "epoch": 2.589492430988424, + "ewc_loss": 0.008471496403217316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471496403217316e-05, + "grad_norm": 4.2196364402771, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8855046033859253, + "num_tokens": 776722684.0, + "step": 20356 + }, + { + "epoch": 2.5896196412670145, + "ewc_loss": 0.00848808791488409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.488087769364938e-05, + "grad_norm": 4.20781946182251, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8863198757171631, + "num_tokens": 776764635.0, + "step": 20357 + }, + { + "epoch": 2.589746851545605, + "ewc_loss": 0.008492117747664452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492117922287434e-05, + "grad_norm": 4.254920959472656, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8724232316017151, + "num_tokens": 776805683.0, + "step": 20358 + }, + { + "epoch": 2.5898740618241956, + "ewc_loss": 0.008499245159327984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499245450366288e-05, + "grad_norm": 4.24466609954834, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8853259086608887, + "num_tokens": 776841229.0, + "step": 20359 + }, + { + "epoch": 2.5900012721027856, + "ewc_loss": 0.008484448306262493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.484448335366324e-05, + "grad_norm": 4.1817240715026855, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8900056481361389, + "num_tokens": 776881319.0, + "step": 20360 + }, + { + "epoch": 
2.5901284823813766, + "ewc_loss": 0.008448208682239056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448208245681599e-05, + "grad_norm": 4.230440616607666, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8870722651481628, + "num_tokens": 776919436.0, + "step": 20361 + }, + { + "epoch": 2.5902556926599667, + "ewc_loss": 0.008489472791552544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489473111694679e-05, + "grad_norm": 4.250028610229492, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8668543100357056, + "num_tokens": 776958644.0, + "step": 20362 + }, + { + "epoch": 2.5903829029385577, + "ewc_loss": 0.008494398556649685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.494398934999481e-05, + "grad_norm": 4.2494659423828125, + "learning_rate": 1e-06, + "loss": 0.2857, + "mean_token_accuracy": 0.8999689221382141, + "num_tokens": 776994509.0, + "step": 20363 + }, + { + "epoch": 2.5905101132171477, + "ewc_loss": 0.00846023391932249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460233948426321e-05, + "grad_norm": 4.16460657119751, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8880466222763062, + "num_tokens": 777037557.0, + "step": 20364 + }, + { + "epoch": 2.5906373234957387, + "ewc_loss": 0.008427335880696774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.427335706073791e-05, + "grad_norm": 4.270900249481201, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8641108870506287, + "num_tokens": 777077237.0, + "step": 20365 + }, + { + "epoch": 2.590764533774329, + "ewc_loss": 0.00850578024983406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.505780715495348e-05, + "grad_norm": 4.203192234039307, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8904076814651489, + "num_tokens": 777120813.0, + "step": 20366 + }, + { + "epoch": 2.5908917440529193, + "ewc_loss": 0.008429757319390774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.429757144767791e-05, + "grad_norm": 4.2828569412231445, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8714622259140015, + "num_tokens": 777155809.0, + "step": 20367 + }, + { + "epoch": 2.59101895433151, + "ewc_loss": 0.008494488894939423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.494488429278135e-05, + "grad_norm": 4.232865333557129, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8826214075088501, + "num_tokens": 777195868.0, + "step": 20368 + }, + { + "epoch": 2.5911461646101004, + "ewc_loss": 0.008439593948423862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439593511866406e-05, + "grad_norm": 4.224827289581299, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8866733312606812, + "num_tokens": 777233406.0, + "step": 20369 + }, + { + "epoch": 2.591273374888691, + "ewc_loss": 0.008471168577671051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471168257528916e-05, + "grad_norm": 4.252110958099365, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8858433961868286, + "num_tokens": 777272817.0, + "step": 20370 + }, + { + "epoch": 2.5914005851672814, + "ewc_loss": 0.00847333949059248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473339403280988e-05, + "grad_norm": 4.202805995941162, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8793638944625854, + "num_tokens": 777312398.0, + "step": 20371 + }, + { + "epoch": 2.591527795445872, + "ewc_loss": 0.008447539992630482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447539585176855e-05, + "grad_norm": 4.265372276306152, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8918533325195312, + "num_tokens": 777349375.0, + "step": 20372 + }, + { + "epoch": 2.5916550057244625, + "ewc_loss": 0.008521213196218014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521213021595031e-05, + "grad_norm": 4.181033134460449, + "learning_rate": 1e-06, + "loss": 0.3093, + 
"mean_token_accuracy": 0.8928039073944092, + "num_tokens": 777394806.0, + "step": 20373 + }, + { + "epoch": 2.591782216003053, + "ewc_loss": 0.00844557210803032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44557216623798e-05, + "grad_norm": 4.202124118804932, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8866785168647766, + "num_tokens": 777434350.0, + "step": 20374 + }, + { + "epoch": 2.5919094262816436, + "ewc_loss": 0.008470717817544937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470717875752598e-05, + "grad_norm": 4.312487602233887, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8589332103729248, + "num_tokens": 777469690.0, + "step": 20375 + }, + { + "epoch": 2.592036636560234, + "ewc_loss": 0.008526943624019623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.526943565811962e-05, + "grad_norm": 4.297814846038818, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8765693306922913, + "num_tokens": 777502095.0, + "step": 20376 + }, + { + "epoch": 2.5921638468388246, + "ewc_loss": 0.008474410511553288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474410424241796e-05, + "grad_norm": 4.201271057128906, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8939770460128784, + "num_tokens": 777540387.0, + "step": 20377 + }, + { + "epoch": 2.592291057117415, + "ewc_loss": 0.00847089383751154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470893953926861e-05, + "grad_norm": 4.189962863922119, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8883538246154785, + "num_tokens": 777581855.0, + "step": 20378 + }, + { + "epoch": 2.5924182673960057, + "ewc_loss": 0.008485551923513412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485552098136395e-05, + "grad_norm": 4.207835674285889, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.884825587272644, + "num_tokens": 777619852.0, + "step": 20379 + }, + { + "epoch": 
2.592545477674596, + "ewc_loss": 0.00849786214530468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.497862290823832e-05, + "grad_norm": 4.214293956756592, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8763487935066223, + "num_tokens": 777662772.0, + "step": 20380 + }, + { + "epoch": 2.5926726879531867, + "ewc_loss": 0.008503509685397148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.503509889123961e-05, + "grad_norm": 4.244184494018555, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.867633581161499, + "num_tokens": 777701171.0, + "step": 20381 + }, + { + "epoch": 2.5927998982317773, + "ewc_loss": 0.008512581698596478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512581553077325e-05, + "grad_norm": 4.242086410522461, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8704610466957092, + "num_tokens": 777738245.0, + "step": 20382 + }, + { + "epoch": 2.592927108510368, + "ewc_loss": 0.008485857397317886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485857688356191e-05, + "grad_norm": 4.196230888366699, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8821517825126648, + "num_tokens": 777775732.0, + "step": 20383 + }, + { + "epoch": 2.5930543187889583, + "ewc_loss": 0.00849014613777399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49014613777399e-05, + "grad_norm": 4.2387919425964355, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8842375874519348, + "num_tokens": 777811077.0, + "step": 20384 + }, + { + "epoch": 2.5931815290675484, + "ewc_loss": 0.008536587469279766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5365871200338e-05, + "grad_norm": 4.193548202514648, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8816142082214355, + "num_tokens": 777854107.0, + "step": 20385 + }, + { + "epoch": 2.5933087393461394, + "ewc_loss": 0.008476154878735542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.476155198877677e-05, + "grad_norm": 4.24442195892334, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8810935020446777, + "num_tokens": 777891950.0, + "step": 20386 + }, + { + "epoch": 2.5934359496247295, + "ewc_loss": 0.008536816574633121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536816312698647e-05, + "grad_norm": 4.173608303070068, + "learning_rate": 1e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.8987981677055359, + "num_tokens": 777932638.0, + "step": 20387 + }, + { + "epoch": 2.5935631599033204, + "ewc_loss": 0.008471338078379631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471338514937088e-05, + "grad_norm": 4.180091381072998, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8808581233024597, + "num_tokens": 777977733.0, + "step": 20388 + }, + { + "epoch": 2.5936903701819105, + "ewc_loss": 0.008510231971740723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510232146363705e-05, + "grad_norm": 4.194340229034424, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8865066766738892, + "num_tokens": 778020451.0, + "step": 20389 + }, + { + "epoch": 2.5938175804605015, + "ewc_loss": 0.008496453054249287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.496452937833965e-05, + "grad_norm": 4.201108932495117, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8881045579910278, + "num_tokens": 778059594.0, + "step": 20390 + }, + { + "epoch": 2.5939447907390916, + "ewc_loss": 0.008481626398861408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481626719003543e-05, + "grad_norm": 4.209261417388916, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8860559463500977, + "num_tokens": 778099235.0, + "step": 20391 + }, + { + "epoch": 2.594072001017682, + "ewc_loss": 0.00847969762980938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479697862640023e-05, + "grad_norm": 4.228528022766113, + "learning_rate": 1e-06, + "loss": 0.3054, + 
"mean_token_accuracy": 0.8943528532981873, + "num_tokens": 778135430.0, + "step": 20392 + }, + { + "epoch": 2.5941992112962726, + "ewc_loss": 0.008483435027301311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483434794470668e-05, + "grad_norm": 4.23916482925415, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8762094378471375, + "num_tokens": 778175111.0, + "step": 20393 + }, + { + "epoch": 2.594326421574863, + "ewc_loss": 0.008485558442771435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485558646498248e-05, + "grad_norm": 4.240591526031494, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8740142583847046, + "num_tokens": 778211954.0, + "step": 20394 + }, + { + "epoch": 2.5944536318534537, + "ewc_loss": 0.008479641750454903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479641837766394e-05, + "grad_norm": 4.302235126495361, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8695195317268372, + "num_tokens": 778246580.0, + "step": 20395 + }, + { + "epoch": 2.594580842132044, + "ewc_loss": 0.008515777066349983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515777153661475e-05, + "grad_norm": 4.1973700523376465, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8903554677963257, + "num_tokens": 778288659.0, + "step": 20396 + }, + { + "epoch": 2.5947080524106347, + "ewc_loss": 0.008445984683930874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.445984713034704e-05, + "grad_norm": 4.233189582824707, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8813768029212952, + "num_tokens": 778325365.0, + "step": 20397 + }, + { + "epoch": 2.5948352626892253, + "ewc_loss": 0.008518239483237267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518239337718114e-05, + "grad_norm": 4.22447395324707, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8883165717124939, + "num_tokens": 778362982.0, + "step": 20398 + }, + { + "epoch": 
2.594962472967816, + "ewc_loss": 0.008488253690302372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.488253661198542e-05, + "grad_norm": 4.285560607910156, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8699859976768494, + "num_tokens": 778394230.0, + "step": 20399 + }, + { + "epoch": 2.5950896832464063, + "ewc_loss": 0.008540437556803226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540437556803226e-05, + "grad_norm": 4.246451377868652, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8716036081314087, + "num_tokens": 778434345.0, + "step": 20400 + }, + { + "epoch": 2.595216893524997, + "ewc_loss": 0.008493983186781406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.493983477819711e-05, + "grad_norm": 4.212006568908691, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8778989911079407, + "num_tokens": 778478303.0, + "step": 20401 + }, + { + "epoch": 2.5953441038035874, + "ewc_loss": 0.008491996675729752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491996413795277e-05, + "grad_norm": 4.264990329742432, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8873987197875977, + "num_tokens": 778513915.0, + "step": 20402 + }, + { + "epoch": 2.595471314082178, + "ewc_loss": 0.008536646142601967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536646055290475e-05, + "grad_norm": 4.154353141784668, + "learning_rate": 1e-06, + "loss": 0.2884, + "mean_token_accuracy": 0.898551344871521, + "num_tokens": 778554294.0, + "step": 20403 + }, + { + "epoch": 2.5955985243607684, + "ewc_loss": 0.008466792292892933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.466792496619746e-05, + "grad_norm": 4.24280309677124, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8803757429122925, + "num_tokens": 778592836.0, + "step": 20404 + }, + { + "epoch": 2.595725734639359, + "ewc_loss": 0.008519197814166546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.519197581335902e-05, + "grad_norm": 4.2399163246154785, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8747074007987976, + "num_tokens": 778629316.0, + "step": 20405 + }, + { + "epoch": 2.5958529449179495, + "ewc_loss": 0.008504543453454971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.504543075105175e-05, + "grad_norm": 4.357879638671875, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8799266219139099, + "num_tokens": 778659065.0, + "step": 20406 + }, + { + "epoch": 2.59598015519654, + "ewc_loss": 0.008574225939810276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574225648771971e-05, + "grad_norm": 4.22256326675415, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8973066806793213, + "num_tokens": 778693949.0, + "step": 20407 + }, + { + "epoch": 2.5961073654751305, + "ewc_loss": 0.008457181043922901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45718095661141e-05, + "grad_norm": 4.17210054397583, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8870709538459778, + "num_tokens": 778738045.0, + "step": 20408 + }, + { + "epoch": 2.596234575753721, + "ewc_loss": 0.008486466482281685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486466686008498e-05, + "grad_norm": 4.386114120483398, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8434388041496277, + "num_tokens": 778769578.0, + "step": 20409 + }, + { + "epoch": 2.596361786032311, + "ewc_loss": 0.00861917994916439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619180152891204e-05, + "grad_norm": 4.204096794128418, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8786783814430237, + "num_tokens": 778810052.0, + "step": 20410 + }, + { + "epoch": 2.596488996310902, + "ewc_loss": 0.008443093858659267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443093975074589e-05, + "grad_norm": 4.208929538726807, + "learning_rate": 1e-06, + "loss": 0.3566, + 
"mean_token_accuracy": 0.8779435753822327, + "num_tokens": 778850868.0, + "step": 20411 + }, + { + "epoch": 2.596616206589492, + "ewc_loss": 0.008545534685254097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545534365111962e-05, + "grad_norm": 4.187137126922607, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8880817890167236, + "num_tokens": 778890129.0, + "step": 20412 + }, + { + "epoch": 2.596743416868083, + "ewc_loss": 0.00852425117045641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524251461494714e-05, + "grad_norm": 4.2645487785339355, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.887315034866333, + "num_tokens": 778923181.0, + "step": 20413 + }, + { + "epoch": 2.5968706271466733, + "ewc_loss": 0.008558663539588451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558663830626756e-05, + "grad_norm": 4.219307899475098, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8736042976379395, + "num_tokens": 778963328.0, + "step": 20414 + }, + { + "epoch": 2.596997837425264, + "ewc_loss": 0.008522438816726208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522439020453021e-05, + "grad_norm": 4.1960320472717285, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8928182721138, + "num_tokens": 779003932.0, + "step": 20415 + }, + { + "epoch": 2.5971250477038543, + "ewc_loss": 0.008528809063136578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528809121344239e-05, + "grad_norm": 4.286197662353516, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8742436170578003, + "num_tokens": 779043913.0, + "step": 20416 + }, + { + "epoch": 2.597252257982445, + "ewc_loss": 0.008572557009756565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572557271691039e-05, + "grad_norm": 4.2534942626953125, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8812163472175598, + "num_tokens": 779077126.0, + "step": 20417 + }, + { + "epoch": 
2.5973794682610354, + "ewc_loss": 0.008528491482138634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528491161996499e-05, + "grad_norm": 4.238576412200928, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8875124454498291, + "num_tokens": 779111544.0, + "step": 20418 + }, + { + "epoch": 2.597506678539626, + "ewc_loss": 0.008539962582290173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53996243677102e-05, + "grad_norm": 4.312394142150879, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8810031414031982, + "num_tokens": 779145493.0, + "step": 20419 + }, + { + "epoch": 2.5976338888182164, + "ewc_loss": 0.008581098169088364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581097790738568e-05, + "grad_norm": 4.196904182434082, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8755037188529968, + "num_tokens": 779189216.0, + "step": 20420 + }, + { + "epoch": 2.597761099096807, + "ewc_loss": 0.00849330797791481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.493308268953115e-05, + "grad_norm": 4.241296291351318, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8885681629180908, + "num_tokens": 779225914.0, + "step": 20421 + }, + { + "epoch": 2.5978883093753975, + "ewc_loss": 0.008560904301702976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560904097976163e-05, + "grad_norm": 4.241733551025391, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8877702951431274, + "num_tokens": 779264118.0, + "step": 20422 + }, + { + "epoch": 2.598015519653988, + "ewc_loss": 0.008526289835572243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.526290184818208e-05, + "grad_norm": 4.254096508026123, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8863627314567566, + "num_tokens": 779301694.0, + "step": 20423 + }, + { + "epoch": 2.5981427299325786, + "ewc_loss": 0.008520697243511677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.520697156200185e-05, + "grad_norm": 4.238157272338867, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8833498954772949, + "num_tokens": 779339012.0, + "step": 20424 + }, + { + "epoch": 2.598269940211169, + "ewc_loss": 0.008509183302521706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50918295327574e-05, + "grad_norm": 4.2211151123046875, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8865398168563843, + "num_tokens": 779376358.0, + "step": 20425 + }, + { + "epoch": 2.5983971504897596, + "ewc_loss": 0.008501616306602955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50161595735699e-05, + "grad_norm": 4.207608699798584, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8815491795539856, + "num_tokens": 779416344.0, + "step": 20426 + }, + { + "epoch": 2.59852436076835, + "ewc_loss": 0.008506371639668941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50637152325362e-05, + "grad_norm": 4.297507286071777, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8731610774993896, + "num_tokens": 779452189.0, + "step": 20427 + }, + { + "epoch": 2.5986515710469407, + "ewc_loss": 0.008549490943551064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549491030862555e-05, + "grad_norm": 4.232519149780273, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8812159299850464, + "num_tokens": 779493513.0, + "step": 20428 + }, + { + "epoch": 2.598778781325531, + "ewc_loss": 0.008483964949846268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483965211780742e-05, + "grad_norm": 4.220600605010986, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8877462148666382, + "num_tokens": 779533061.0, + "step": 20429 + }, + { + "epoch": 2.5989059916041217, + "ewc_loss": 0.00850367359817028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50367359817028e-05, + "grad_norm": 4.2599029541015625, + "learning_rate": 1e-06, + "loss": 0.3481, + 
"mean_token_accuracy": 0.8748698234558105, + "num_tokens": 779570159.0, + "step": 20430 + }, + { + "epoch": 2.5990332018827123, + "ewc_loss": 0.008523913100361824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523913129465654e-05, + "grad_norm": 4.232110500335693, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8805766701698303, + "num_tokens": 779609066.0, + "step": 20431 + }, + { + "epoch": 2.599160412161303, + "ewc_loss": 0.008477890864014626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477891242364421e-05, + "grad_norm": 4.192989826202393, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8833082914352417, + "num_tokens": 779651087.0, + "step": 20432 + }, + { + "epoch": 2.5992876224398933, + "ewc_loss": 0.008484511636197567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.484511636197567e-05, + "grad_norm": 4.241659164428711, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8954513072967529, + "num_tokens": 779689531.0, + "step": 20433 + }, + { + "epoch": 2.599414832718484, + "ewc_loss": 0.008501325733959675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501325646648183e-05, + "grad_norm": 4.29226016998291, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8772911429405212, + "num_tokens": 779724062.0, + "step": 20434 + }, + { + "epoch": 2.599542042997074, + "ewc_loss": 0.008508824743330479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.508824976161122e-05, + "grad_norm": 4.237604141235352, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8698768615722656, + "num_tokens": 779766029.0, + "step": 20435 + }, + { + "epoch": 2.599669253275665, + "ewc_loss": 0.0084519162774086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451916073681787e-05, + "grad_norm": 4.216528415679932, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8676143884658813, + "num_tokens": 779811065.0, + "step": 20436 + }, + { + "epoch": 
2.599796463554255, + "ewc_loss": 0.008467459119856358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467458974337205e-05, + "grad_norm": 4.228265762329102, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.870101809501648, + "num_tokens": 779852442.0, + "step": 20437 + }, + { + "epoch": 2.599923673832846, + "ewc_loss": 0.008476103655993938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476103539578617e-05, + "grad_norm": 4.2614850997924805, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8857120275497437, + "num_tokens": 779885993.0, + "step": 20438 + }, + { + "epoch": 2.600050884111436, + "ewc_loss": 0.008499331772327423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499332034261897e-05, + "grad_norm": 4.205072402954102, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8858113288879395, + "num_tokens": 779927791.0, + "step": 20439 + }, + { + "epoch": 2.6001780943900266, + "ewc_loss": 0.008462945930659771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462945697829127e-05, + "grad_norm": 4.253666877746582, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8847672939300537, + "num_tokens": 779961517.0, + "step": 20440 + }, + { + "epoch": 2.600305304668617, + "ewc_loss": 0.008510995656251907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510996121913195e-05, + "grad_norm": 4.3363447189331055, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8718944787979126, + "num_tokens": 779992902.0, + "step": 20441 + }, + { + "epoch": 2.6004325149472076, + "ewc_loss": 0.008528672158718109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528672333341092e-05, + "grad_norm": 4.193981170654297, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8824470043182373, + "num_tokens": 780032849.0, + "step": 20442 + }, + { + "epoch": 2.600559725225798, + "ewc_loss": 0.008448904380202293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.448904554825276e-05, + "grad_norm": 4.2044758796691895, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8936679363250732, + "num_tokens": 780072563.0, + "step": 20443 + }, + { + "epoch": 2.6006869355043887, + "ewc_loss": 0.008514578454196453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514578803442419e-05, + "grad_norm": 4.230041027069092, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8825593590736389, + "num_tokens": 780115299.0, + "step": 20444 + }, + { + "epoch": 2.600814145782979, + "ewc_loss": 0.00850543100386858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.505430741934106e-05, + "grad_norm": 4.2408037185668945, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8805760145187378, + "num_tokens": 780155792.0, + "step": 20445 + }, + { + "epoch": 2.6009413560615697, + "ewc_loss": 0.008489738218486309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489737956551835e-05, + "grad_norm": 4.252695560455322, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8782411217689514, + "num_tokens": 780189040.0, + "step": 20446 + }, + { + "epoch": 2.6010685663401603, + "ewc_loss": 0.008513129316270351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513129432685673e-05, + "grad_norm": 4.229319095611572, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8811945915222168, + "num_tokens": 780229463.0, + "step": 20447 + }, + { + "epoch": 2.601195776618751, + "ewc_loss": 0.008449876680970192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.449876622762531e-05, + "grad_norm": 4.197344779968262, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8949607610702515, + "num_tokens": 780265961.0, + "step": 20448 + }, + { + "epoch": 2.6013229868973413, + "ewc_loss": 0.00843882281333208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438822987955064e-05, + "grad_norm": 4.2792768478393555, + "learning_rate": 1e-06, + "loss": 0.3541, + 
"mean_token_accuracy": 0.8777405619621277, + "num_tokens": 780303220.0, + "step": 20449 + }, + { + "epoch": 2.601450197175932, + "ewc_loss": 0.008511305786669254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511305350111797e-05, + "grad_norm": 4.252618312835693, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8781795501708984, + "num_tokens": 780340456.0, + "step": 20450 + }, + { + "epoch": 2.6015774074545224, + "ewc_loss": 0.008456215262413025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.456215437036008e-05, + "grad_norm": 4.244326591491699, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8814404010772705, + "num_tokens": 780381402.0, + "step": 20451 + }, + { + "epoch": 2.601704617733113, + "ewc_loss": 0.008455111645162106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455111674265936e-05, + "grad_norm": 4.211400985717773, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8865048885345459, + "num_tokens": 780422571.0, + "step": 20452 + }, + { + "epoch": 2.6018318280117034, + "ewc_loss": 0.008440814912319183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.440815145149827e-05, + "grad_norm": 4.227601051330566, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8776301145553589, + "num_tokens": 780462447.0, + "step": 20453 + }, + { + "epoch": 2.601959038290294, + "ewc_loss": 0.008473780006170273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.473779598716646e-05, + "grad_norm": 4.248532295227051, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8801026344299316, + "num_tokens": 780499183.0, + "step": 20454 + }, + { + "epoch": 2.6020862485688845, + "ewc_loss": 0.008474072441458702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474072819808498e-05, + "grad_norm": 4.247686386108398, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8707479238510132, + "num_tokens": 780539216.0, + "step": 20455 + }, + { + "epoch": 
2.602213458847475, + "ewc_loss": 0.008470610715448856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47061091917567e-05, + "grad_norm": 4.232041358947754, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8887679576873779, + "num_tokens": 780578725.0, + "step": 20456 + }, + { + "epoch": 2.6023406691260655, + "ewc_loss": 0.008454378694295883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454378985334188e-05, + "grad_norm": 4.254234790802002, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.883318305015564, + "num_tokens": 780615175.0, + "step": 20457 + }, + { + "epoch": 2.6024678794046556, + "ewc_loss": 0.00848837848752737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.488378080073744e-05, + "grad_norm": 4.279330730438232, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8855897188186646, + "num_tokens": 780651186.0, + "step": 20458 + }, + { + "epoch": 2.6025950896832466, + "ewc_loss": 0.008498153649270535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498154056724161e-05, + "grad_norm": 4.284661293029785, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8774722814559937, + "num_tokens": 780685550.0, + "step": 20459 + }, + { + "epoch": 2.6027222999618367, + "ewc_loss": 0.008474019356071949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474019705317914e-05, + "grad_norm": 4.167540073394775, + "learning_rate": 1e-06, + "loss": 0.2782, + "mean_token_accuracy": 0.8993984460830688, + "num_tokens": 780725477.0, + "step": 20460 + }, + { + "epoch": 2.6028495102404277, + "ewc_loss": 0.008427316322922707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.427316060988232e-05, + "grad_norm": 4.252922058105469, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.884024977684021, + "num_tokens": 780762481.0, + "step": 20461 + }, + { + "epoch": 2.6029767205190177, + "ewc_loss": 0.008508360013365746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.508360042469576e-05, + "grad_norm": 4.289045333862305, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8631329536437988, + "num_tokens": 780800305.0, + "step": 20462 + }, + { + "epoch": 2.6031039307976087, + "ewc_loss": 0.008495263755321503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495264046359807e-05, + "grad_norm": 4.230535507202148, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8830204010009766, + "num_tokens": 780836271.0, + "step": 20463 + }, + { + "epoch": 2.603231141076199, + "ewc_loss": 0.008481265977025032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48126583150588e-05, + "grad_norm": 4.1933207511901855, + "learning_rate": 1e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.8937352895736694, + "num_tokens": 780877363.0, + "step": 20464 + }, + { + "epoch": 2.6033583513547893, + "ewc_loss": 0.00846864189952612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.468642045045272e-05, + "grad_norm": 4.186340808868408, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.871097207069397, + "num_tokens": 780922982.0, + "step": 20465 + }, + { + "epoch": 2.60348556163338, + "ewc_loss": 0.008503381162881851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50338110467419e-05, + "grad_norm": 4.277544021606445, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8820446133613586, + "num_tokens": 780957925.0, + "step": 20466 + }, + { + "epoch": 2.6036127719119704, + "ewc_loss": 0.008551714941859245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55171456350945e-05, + "grad_norm": 4.248904228210449, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8650674223899841, + "num_tokens": 780996922.0, + "step": 20467 + }, + { + "epoch": 2.603739982190561, + "ewc_loss": 0.008480400778353214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480400720145553e-05, + "grad_norm": 4.263306140899658, + "learning_rate": 1e-06, + "loss": 0.399, + 
"mean_token_accuracy": 0.8663831949234009, + "num_tokens": 781037209.0, + "step": 20468 + }, + { + "epoch": 2.6038671924691514, + "ewc_loss": 0.00851577427238226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515774243278429e-05, + "grad_norm": 4.228446006774902, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8767182230949402, + "num_tokens": 781078077.0, + "step": 20469 + }, + { + "epoch": 2.603994402747742, + "ewc_loss": 0.008492431603372097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492431516060606e-05, + "grad_norm": 4.209286689758301, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8800822496414185, + "num_tokens": 781122516.0, + "step": 20470 + }, + { + "epoch": 2.6041216130263325, + "ewc_loss": 0.008495072834193707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495072688674554e-05, + "grad_norm": 4.34922981262207, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8753316402435303, + "num_tokens": 781156290.0, + "step": 20471 + }, + { + "epoch": 2.604248823304923, + "ewc_loss": 0.008593854494392872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.593854727223516e-05, + "grad_norm": 4.224560260772705, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8879623413085938, + "num_tokens": 781192769.0, + "step": 20472 + }, + { + "epoch": 2.6043760335835135, + "ewc_loss": 0.008477249182760715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47724950290285e-05, + "grad_norm": 4.258997440338135, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.886083722114563, + "num_tokens": 781227594.0, + "step": 20473 + }, + { + "epoch": 2.604503243862104, + "ewc_loss": 0.008537456393241882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537456596968696e-05, + "grad_norm": 4.244604587554932, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8935964703559875, + "num_tokens": 781259369.0, + "step": 20474 + }, + { + "epoch": 
2.6046304541406946, + "ewc_loss": 0.00853298231959343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532981883035973e-05, + "grad_norm": 4.2575578689575195, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8824664354324341, + "num_tokens": 781296788.0, + "step": 20475 + }, + { + "epoch": 2.604757664419285, + "ewc_loss": 0.00854950025677681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549500489607453e-05, + "grad_norm": 4.2311506271362305, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8838692903518677, + "num_tokens": 781333103.0, + "step": 20476 + }, + { + "epoch": 2.6048848746978757, + "ewc_loss": 0.008517539128661156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517538662999868e-05, + "grad_norm": 4.213216304779053, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8889958262443542, + "num_tokens": 781372691.0, + "step": 20477 + }, + { + "epoch": 2.605012084976466, + "ewc_loss": 0.008530654944479465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530655031790957e-05, + "grad_norm": 4.290058135986328, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8799744248390198, + "num_tokens": 781410480.0, + "step": 20478 + }, + { + "epoch": 2.6051392952550567, + "ewc_loss": 0.008572320453822613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572320803068578e-05, + "grad_norm": 4.229573726654053, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8717191219329834, + "num_tokens": 781454110.0, + "step": 20479 + }, + { + "epoch": 2.6052665055336472, + "ewc_loss": 0.0085222152993083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522215648554265e-05, + "grad_norm": 4.218778133392334, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8786051869392395, + "num_tokens": 781494034.0, + "step": 20480 + }, + { + "epoch": 2.6053937158122378, + "ewc_loss": 0.008537069894373417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.537070243619382e-05, + "grad_norm": 4.1899614334106445, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8811358213424683, + "num_tokens": 781536225.0, + "step": 20481 + }, + { + "epoch": 2.6055209260908283, + "ewc_loss": 0.008534812368452549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.534812513971701e-05, + "grad_norm": 4.215633869171143, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8873380422592163, + "num_tokens": 781573987.0, + "step": 20482 + }, + { + "epoch": 2.6056481363694184, + "ewc_loss": 0.008544822223484516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544822048861533e-05, + "grad_norm": 4.282461643218994, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8867099285125732, + "num_tokens": 781608987.0, + "step": 20483 + }, + { + "epoch": 2.6057753466480094, + "ewc_loss": 0.008535876870155334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535876986570656e-05, + "grad_norm": 4.231173515319824, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8882358074188232, + "num_tokens": 781646185.0, + "step": 20484 + }, + { + "epoch": 2.6059025569265994, + "ewc_loss": 0.008516760542988777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516760135535151e-05, + "grad_norm": 4.277741432189941, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8716975450515747, + "num_tokens": 781682999.0, + "step": 20485 + }, + { + "epoch": 2.6060297672051904, + "ewc_loss": 0.008535989560186863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535989763913676e-05, + "grad_norm": 4.173798561096191, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8800020217895508, + "num_tokens": 781726052.0, + "step": 20486 + }, + { + "epoch": 2.6061569774837805, + "ewc_loss": 0.008493317291140556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.493317000102252e-05, + "grad_norm": 4.288881778717041, + "learning_rate": 1e-06, + "loss": 0.2967, + 
"mean_token_accuracy": 0.8949774503707886, + "num_tokens": 781760562.0, + "step": 20487 + }, + { + "epoch": 2.6062841877623715, + "ewc_loss": 0.008574606850743294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574606908950955e-05, + "grad_norm": 4.211848735809326, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.872048556804657, + "num_tokens": 781803765.0, + "step": 20488 + }, + { + "epoch": 2.6064113980409616, + "ewc_loss": 0.008481597527861595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481597615173087e-05, + "grad_norm": 4.207399368286133, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8844908475875854, + "num_tokens": 781845419.0, + "step": 20489 + }, + { + "epoch": 2.606538608319552, + "ewc_loss": 0.008508441038429737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.508440805599093e-05, + "grad_norm": 4.280456066131592, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8891555070877075, + "num_tokens": 781883371.0, + "step": 20490 + }, + { + "epoch": 2.6066658185981426, + "ewc_loss": 0.008517861366271973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517860987922177e-05, + "grad_norm": 4.202334880828857, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8823594450950623, + "num_tokens": 781919669.0, + "step": 20491 + }, + { + "epoch": 2.606793028876733, + "ewc_loss": 0.008484111167490482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.484111458528787e-05, + "grad_norm": 4.266772747039795, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8875190019607544, + "num_tokens": 781953212.0, + "step": 20492 + }, + { + "epoch": 2.6069202391553237, + "ewc_loss": 0.008541091345250607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54109093779698e-05, + "grad_norm": 4.1722283363342285, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8856672644615173, + "num_tokens": 781996824.0, + "step": 20493 + }, + { + "epoch": 
2.607047449433914, + "ewc_loss": 0.008467237465083599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467237785225734e-05, + "grad_norm": 4.183298110961914, + "learning_rate": 1e-06, + "loss": 0.2806, + "mean_token_accuracy": 0.9003239870071411, + "num_tokens": 782035350.0, + "step": 20494 + }, + { + "epoch": 2.6071746597125047, + "ewc_loss": 0.00850985012948513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509850158588961e-05, + "grad_norm": 4.282796859741211, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8778731822967529, + "num_tokens": 782073217.0, + "step": 20495 + }, + { + "epoch": 2.6073018699910953, + "ewc_loss": 0.008559885434806347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559885463910177e-05, + "grad_norm": 4.295284748077393, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8758441209793091, + "num_tokens": 782110777.0, + "step": 20496 + }, + { + "epoch": 2.607429080269686, + "ewc_loss": 0.008517307229340076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517307287547737e-05, + "grad_norm": 4.253152370452881, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8757351040840149, + "num_tokens": 782153300.0, + "step": 20497 + }, + { + "epoch": 2.6075562905482763, + "ewc_loss": 0.008498434908688068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498434908688068e-05, + "grad_norm": 4.2410664558410645, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8880336284637451, + "num_tokens": 782191945.0, + "step": 20498 + }, + { + "epoch": 2.607683500826867, + "ewc_loss": 0.008497321978211403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.497321687173098e-05, + "grad_norm": 4.254792213439941, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8922345042228699, + "num_tokens": 782230472.0, + "step": 20499 + }, + { + "epoch": 2.6078107111054574, + "ewc_loss": 0.008514638058841228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.514637738699093e-05, + "grad_norm": 4.299540042877197, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8728876709938049, + "num_tokens": 782263868.0, + "step": 20500 + }, + { + "epoch": 2.607937921384048, + "ewc_loss": 0.008538013324141502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53801320772618e-05, + "grad_norm": 4.261849880218506, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8849847316741943, + "num_tokens": 782299781.0, + "step": 20501 + }, + { + "epoch": 2.6080651316626384, + "ewc_loss": 0.008494282141327858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.494282519677654e-05, + "grad_norm": 4.215731620788574, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.876482367515564, + "num_tokens": 782343376.0, + "step": 20502 + }, + { + "epoch": 2.608192341941229, + "ewc_loss": 0.008496358059346676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49635835038498e-05, + "grad_norm": 4.159542560577393, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8921685218811035, + "num_tokens": 782389894.0, + "step": 20503 + }, + { + "epoch": 2.6083195522198195, + "ewc_loss": 0.008482730016112328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482730481773615e-05, + "grad_norm": 4.213469982147217, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8876201510429382, + "num_tokens": 782430055.0, + "step": 20504 + }, + { + "epoch": 2.60844676249841, + "ewc_loss": 0.008501139469444752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50113938213326e-05, + "grad_norm": 4.227452278137207, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8837974071502686, + "num_tokens": 782467211.0, + "step": 20505 + }, + { + "epoch": 2.6085739727770005, + "ewc_loss": 0.008519510738551617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519511175109074e-05, + "grad_norm": 4.270565986633301, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 
0.8738027811050415, + "num_tokens": 782505913.0, + "step": 20506 + }, + { + "epoch": 2.608701183055591, + "ewc_loss": 0.008524530567228794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524530130671337e-05, + "grad_norm": 4.222960948944092, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.881727933883667, + "num_tokens": 782541858.0, + "step": 20507 + }, + { + "epoch": 2.608828393334181, + "ewc_loss": 0.008481323719024658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481324039166793e-05, + "grad_norm": 4.274673938751221, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8805551528930664, + "num_tokens": 782575854.0, + "step": 20508 + }, + { + "epoch": 2.608955603612772, + "ewc_loss": 0.008523821830749512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523821452399716e-05, + "grad_norm": 4.217338562011719, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8832964897155762, + "num_tokens": 782614005.0, + "step": 20509 + }, + { + "epoch": 2.609082813891362, + "ewc_loss": 0.008481084369122982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481083932565525e-05, + "grad_norm": 4.244052886962891, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8888567686080933, + "num_tokens": 782650147.0, + "step": 20510 + }, + { + "epoch": 2.609210024169953, + "ewc_loss": 0.00852114986628294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521149720763788e-05, + "grad_norm": 4.265082836151123, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8819786310195923, + "num_tokens": 782683215.0, + "step": 20511 + }, + { + "epoch": 2.6093372344485433, + "ewc_loss": 0.008546104654669762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546104800188914e-05, + "grad_norm": 4.213793754577637, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8868610858917236, + "num_tokens": 782721184.0, + "step": 20512 + }, + { + "epoch": 2.609464444727134, + "ewc_loss": 
0.00848773680627346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.487736340612173e-05, + "grad_norm": 4.229681491851807, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8771302700042725, + "num_tokens": 782759891.0, + "step": 20513 + }, + { + "epoch": 2.6095916550057243, + "ewc_loss": 0.008523736149072647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52373632369563e-05, + "grad_norm": 4.264900207519531, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8726223111152649, + "num_tokens": 782796398.0, + "step": 20514 + }, + { + "epoch": 2.609718865284315, + "ewc_loss": 0.008538027293980122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538027032045648e-05, + "grad_norm": 4.193081855773926, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8907772898674011, + "num_tokens": 782835968.0, + "step": 20515 + }, + { + "epoch": 2.6098460755629054, + "ewc_loss": 0.008479723706841469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479724056087434e-05, + "grad_norm": 4.2220001220703125, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8930149674415588, + "num_tokens": 782871367.0, + "step": 20516 + }, + { + "epoch": 2.609973285841496, + "ewc_loss": 0.008528311736881733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52831217343919e-05, + "grad_norm": 4.231276988983154, + "learning_rate": 1e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.9008523225784302, + "num_tokens": 782905766.0, + "step": 20517 + }, + { + "epoch": 2.6101004961200864, + "ewc_loss": 0.00851205550134182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51205550134182e-05, + "grad_norm": 4.2251877784729, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.885993480682373, + "num_tokens": 782942256.0, + "step": 20518 + }, + { + "epoch": 2.610227706398677, + "ewc_loss": 0.008509703911840916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509703911840916e-05, + "grad_norm": 4.238584041595459, 
+ "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8924413919448853, + "num_tokens": 782976156.0, + "step": 20519 + }, + { + "epoch": 2.6103549166772675, + "ewc_loss": 0.008519218303263187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519218681612983e-05, + "grad_norm": 4.225460052490234, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8909865617752075, + "num_tokens": 783011450.0, + "step": 20520 + }, + { + "epoch": 2.610482126955858, + "ewc_loss": 0.008528213948011398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528213948011398e-05, + "grad_norm": 4.222136974334717, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8885908126831055, + "num_tokens": 783050355.0, + "step": 20521 + }, + { + "epoch": 2.6106093372344485, + "ewc_loss": 0.00851601641625166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516016532666981e-05, + "grad_norm": 4.185446739196777, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8930181264877319, + "num_tokens": 783091621.0, + "step": 20522 + }, + { + "epoch": 2.610736547513039, + "ewc_loss": 0.00849016010761261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490159962093458e-05, + "grad_norm": 4.265664100646973, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8725271224975586, + "num_tokens": 783130471.0, + "step": 20523 + }, + { + "epoch": 2.6108637577916296, + "ewc_loss": 0.008556757122278214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55675752973184e-05, + "grad_norm": 4.199098587036133, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8870235085487366, + "num_tokens": 783168340.0, + "step": 20524 + }, + { + "epoch": 2.61099096807022, + "ewc_loss": 0.008491275832057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491276093991473e-05, + "grad_norm": 4.271759986877441, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8651934862136841, + "num_tokens": 783207279.0, + 
"step": 20525 + }, + { + "epoch": 2.6111181783488107, + "ewc_loss": 0.008543932810425758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54393292684108e-05, + "grad_norm": 4.235828876495361, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8773130178451538, + "num_tokens": 783243010.0, + "step": 20526 + }, + { + "epoch": 2.611245388627401, + "ewc_loss": 0.008511451072990894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511450869264081e-05, + "grad_norm": 4.210552215576172, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8741259574890137, + "num_tokens": 783287548.0, + "step": 20527 + }, + { + "epoch": 2.6113725989059917, + "ewc_loss": 0.008504344150424004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.504344441462308e-05, + "grad_norm": 4.190466403961182, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8797017335891724, + "num_tokens": 783331549.0, + "step": 20528 + }, + { + "epoch": 2.6114998091845822, + "ewc_loss": 0.008523710072040558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523710130248219e-05, + "grad_norm": 4.259581565856934, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8929647207260132, + "num_tokens": 783369445.0, + "step": 20529 + }, + { + "epoch": 2.6116270194631728, + "ewc_loss": 0.008553058840334415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55305916047655e-05, + "grad_norm": 4.2246246337890625, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8778403997421265, + "num_tokens": 783413218.0, + "step": 20530 + }, + { + "epoch": 2.6117542297417633, + "ewc_loss": 0.008496853522956371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.496853843098506e-05, + "grad_norm": 4.289264678955078, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8830046653747559, + "num_tokens": 783449944.0, + "step": 20531 + }, + { + "epoch": 2.611881440020354, + "ewc_loss": 0.008547886274755001, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.547886682208627e-05, + "grad_norm": 4.231531143188477, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8838763236999512, + "num_tokens": 783486142.0, + "step": 20532 + }, + { + "epoch": 2.612008650298944, + "ewc_loss": 0.008465205319225788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.465205610264093e-05, + "grad_norm": 4.249749183654785, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8900094628334045, + "num_tokens": 783521170.0, + "step": 20533 + }, + { + "epoch": 2.612135860577535, + "ewc_loss": 0.008510550484061241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510550105711445e-05, + "grad_norm": 4.217637538909912, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8825934529304504, + "num_tokens": 783563902.0, + "step": 20534 + }, + { + "epoch": 2.612263070856125, + "ewc_loss": 0.008486315608024597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486316073685884e-05, + "grad_norm": 4.265639305114746, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8867052793502808, + "num_tokens": 783597655.0, + "step": 20535 + }, + { + "epoch": 2.612390281134716, + "ewc_loss": 0.008520328439772129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520328265149146e-05, + "grad_norm": 4.269840240478516, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8735021352767944, + "num_tokens": 783635233.0, + "step": 20536 + }, + { + "epoch": 2.612517491413306, + "ewc_loss": 0.00852007046341896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520070696249604e-05, + "grad_norm": 4.189572334289551, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8827146291732788, + "num_tokens": 783679677.0, + "step": 20537 + }, + { + "epoch": 2.6126447016918966, + "ewc_loss": 0.00847108755260706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471087494399399e-05, + "grad_norm": 4.208949565887451, + "learning_rate": 1e-06, + "loss": 0.3143, + 
"mean_token_accuracy": 0.8882483243942261, + "num_tokens": 783720608.0, + "step": 20538 + }, + { + "epoch": 2.612771911970487, + "ewc_loss": 0.008522098883986473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522098505636677e-05, + "grad_norm": 4.268049716949463, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8935317993164062, + "num_tokens": 783753198.0, + "step": 20539 + }, + { + "epoch": 2.6128991222490776, + "ewc_loss": 0.008539261296391487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539261762052774e-05, + "grad_norm": 4.2364959716796875, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.883833110332489, + "num_tokens": 783791068.0, + "step": 20540 + }, + { + "epoch": 2.613026332527668, + "ewc_loss": 0.008497297763824463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.497297676512972e-05, + "grad_norm": 4.243211269378662, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8829071521759033, + "num_tokens": 783831319.0, + "step": 20541 + }, + { + "epoch": 2.6131535428062587, + "ewc_loss": 0.00851539894938469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515398803865537e-05, + "grad_norm": 4.264832496643066, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8871870040893555, + "num_tokens": 783867537.0, + "step": 20542 + }, + { + "epoch": 2.613280753084849, + "ewc_loss": 0.008530799299478531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530799095751718e-05, + "grad_norm": 4.205596923828125, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8760294318199158, + "num_tokens": 783907787.0, + "step": 20543 + }, + { + "epoch": 2.6134079633634397, + "ewc_loss": 0.008486858569085598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486858132528141e-05, + "grad_norm": 4.240980625152588, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8761330246925354, + "num_tokens": 783948558.0, + "step": 20544 + }, + { + "epoch": 
2.6135351736420303, + "ewc_loss": 0.008510733023285866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510732732247561e-05, + "grad_norm": 4.200416564941406, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.9010587930679321, + "num_tokens": 783986726.0, + "step": 20545 + }, + { + "epoch": 2.613662383920621, + "ewc_loss": 0.008476197719573975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47619739943184e-05, + "grad_norm": 4.233054161071777, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8669683933258057, + "num_tokens": 784023449.0, + "step": 20546 + }, + { + "epoch": 2.6137895941992113, + "ewc_loss": 0.00853215716779232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532157517038286e-05, + "grad_norm": 4.2041215896606445, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8983525037765503, + "num_tokens": 784059529.0, + "step": 20547 + }, + { + "epoch": 2.613916804477802, + "ewc_loss": 0.00849939789623022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499397517880425e-05, + "grad_norm": 4.2001633644104, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.883332371711731, + "num_tokens": 784103530.0, + "step": 20548 + }, + { + "epoch": 2.6140440147563924, + "ewc_loss": 0.008529050275683403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529049955541268e-05, + "grad_norm": 4.233330249786377, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8773763179779053, + "num_tokens": 784141721.0, + "step": 20549 + }, + { + "epoch": 2.614171225034983, + "ewc_loss": 0.00852091796696186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520917617715895e-05, + "grad_norm": 4.286046504974365, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8842425346374512, + "num_tokens": 784175868.0, + "step": 20550 + }, + { + "epoch": 2.6142984353135734, + "ewc_loss": 0.008532825857400894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.532825449947268e-05, + "grad_norm": 4.249109745025635, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.894658625125885, + "num_tokens": 784210646.0, + "step": 20551 + }, + { + "epoch": 2.614425645592164, + "ewc_loss": 0.008497515693306923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.497515227645636e-05, + "grad_norm": 4.162839889526367, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8897602558135986, + "num_tokens": 784255588.0, + "step": 20552 + }, + { + "epoch": 2.6145528558707545, + "ewc_loss": 0.008472021669149399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47202172735706e-05, + "grad_norm": 4.2914934158325195, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8852013349533081, + "num_tokens": 784292877.0, + "step": 20553 + }, + { + "epoch": 2.614680066149345, + "ewc_loss": 0.008538234047591686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53823366924189e-05, + "grad_norm": 4.199183940887451, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8859502077102661, + "num_tokens": 784336150.0, + "step": 20554 + }, + { + "epoch": 2.6148072764279355, + "ewc_loss": 0.008469355292618275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.469355088891461e-05, + "grad_norm": 4.235498428344727, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8948063850402832, + "num_tokens": 784371547.0, + "step": 20555 + }, + { + "epoch": 2.6149344867065256, + "ewc_loss": 0.008525724522769451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525724842911586e-05, + "grad_norm": 4.20340633392334, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8840464949607849, + "num_tokens": 784411473.0, + "step": 20556 + }, + { + "epoch": 2.6150616969851166, + "ewc_loss": 0.008491751737892628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491751941619441e-05, + "grad_norm": 4.218689918518066, + "learning_rate": 1e-06, + "loss": 0.3603, + 
"mean_token_accuracy": 0.8753536343574524, + "num_tokens": 784456713.0, + "step": 20557 + }, + { + "epoch": 2.6151889072637067, + "ewc_loss": 0.008493246510624886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.493246423313394e-05, + "grad_norm": 4.178824424743652, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8983858823776245, + "num_tokens": 784493685.0, + "step": 20558 + }, + { + "epoch": 2.6153161175422976, + "ewc_loss": 0.008481350727379322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481350960209966e-05, + "grad_norm": 4.218667984008789, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8938544392585754, + "num_tokens": 784534641.0, + "step": 20559 + }, + { + "epoch": 2.6154433278208877, + "ewc_loss": 0.008495168760418892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495168731315061e-05, + "grad_norm": 4.227181911468506, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8609681129455566, + "num_tokens": 784576184.0, + "step": 20560 + }, + { + "epoch": 2.6155705380994787, + "ewc_loss": 0.008482825011014938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482825069222599e-05, + "grad_norm": 4.268399715423584, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8877308368682861, + "num_tokens": 784613851.0, + "step": 20561 + }, + { + "epoch": 2.615697748378069, + "ewc_loss": 0.008516700938344002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516701200278476e-05, + "grad_norm": 4.203751087188721, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8854219913482666, + "num_tokens": 784654133.0, + "step": 20562 + }, + { + "epoch": 2.6158249586566593, + "ewc_loss": 0.00844633486121893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.446334686595947e-05, + "grad_norm": 4.234071731567383, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8843655586242676, + "num_tokens": 784690529.0, + "step": 20563 + }, + { + "epoch": 
2.61595216893525, + "ewc_loss": 0.008498781360685825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498781244270504e-05, + "grad_norm": 4.205048084259033, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8822408318519592, + "num_tokens": 784731972.0, + "step": 20564 + }, + { + "epoch": 2.6160793792138404, + "ewc_loss": 0.008466341532766819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.466341387247667e-05, + "grad_norm": 4.207174301147461, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.877669632434845, + "num_tokens": 784773275.0, + "step": 20565 + }, + { + "epoch": 2.616206589492431, + "ewc_loss": 0.008452815935015678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.452816109638661e-05, + "grad_norm": 4.2414655685424805, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.869793176651001, + "num_tokens": 784811468.0, + "step": 20566 + }, + { + "epoch": 2.6163337997710214, + "ewc_loss": 0.008497845381498337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49784555612132e-05, + "grad_norm": 4.212322235107422, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8855531811714172, + "num_tokens": 784850611.0, + "step": 20567 + }, + { + "epoch": 2.616461010049612, + "ewc_loss": 0.008438089862465858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438089571427554e-05, + "grad_norm": 4.245484828948975, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.888620138168335, + "num_tokens": 784886502.0, + "step": 20568 + }, + { + "epoch": 2.6165882203282025, + "ewc_loss": 0.008492855355143547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492854976793751e-05, + "grad_norm": 4.21898078918457, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8762523531913757, + "num_tokens": 784924489.0, + "step": 20569 + }, + { + "epoch": 2.616715430606793, + "ewc_loss": 0.008472301065921783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.472301124129444e-05, + "grad_norm": 4.282673358917236, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8885424733161926, + "num_tokens": 784957224.0, + "step": 20570 + }, + { + "epoch": 2.6168426408853835, + "ewc_loss": 0.008508272469043732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.508272730978206e-05, + "grad_norm": 4.240509033203125, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.861517608165741, + "num_tokens": 784997754.0, + "step": 20571 + }, + { + "epoch": 2.616969851163974, + "ewc_loss": 0.008471420034766197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471420005662367e-05, + "grad_norm": 4.2213544845581055, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8693720102310181, + "num_tokens": 785033932.0, + "step": 20572 + }, + { + "epoch": 2.6170970614425646, + "ewc_loss": 0.008513683453202248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513683860655874e-05, + "grad_norm": 4.23504638671875, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8801745176315308, + "num_tokens": 785073190.0, + "step": 20573 + }, + { + "epoch": 2.617224271721155, + "ewc_loss": 0.00851812306791544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518122922396287e-05, + "grad_norm": 4.228885650634766, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.8983098268508911, + "num_tokens": 785108397.0, + "step": 20574 + }, + { + "epoch": 2.6173514819997457, + "ewc_loss": 0.008495070040225983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49507050588727e-05, + "grad_norm": 4.226676940917969, + "learning_rate": 1e-06, + "loss": 0.2701, + "mean_token_accuracy": 0.9075073003768921, + "num_tokens": 785142671.0, + "step": 20575 + }, + { + "epoch": 2.617478692278336, + "ewc_loss": 0.008509662002325058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509661711286753e-05, + "grad_norm": 4.201138496398926, + "learning_rate": 1e-06, + "loss": 0.3353, + 
"mean_token_accuracy": 0.8815187811851501, + "num_tokens": 785183467.0, + "step": 20576 + }, + { + "epoch": 2.6176059025569267, + "ewc_loss": 0.008501002565026283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501002594130114e-05, + "grad_norm": 4.249170303344727, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.874522864818573, + "num_tokens": 785222072.0, + "step": 20577 + }, + { + "epoch": 2.6177331128355172, + "ewc_loss": 0.00851785484701395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517854439560324e-05, + "grad_norm": 4.205658435821533, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8726312518119812, + "num_tokens": 785260458.0, + "step": 20578 + }, + { + "epoch": 2.6178603231141078, + "ewc_loss": 0.008483730256557465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483730198349804e-05, + "grad_norm": 4.232299327850342, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.871212363243103, + "num_tokens": 785302434.0, + "step": 20579 + }, + { + "epoch": 2.6179875333926983, + "ewc_loss": 0.00853018183261156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530182094546035e-05, + "grad_norm": 4.306502819061279, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8713026642799377, + "num_tokens": 785338251.0, + "step": 20580 + }, + { + "epoch": 2.6181147436712884, + "ewc_loss": 0.008551981300115585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55198159115389e-05, + "grad_norm": 4.178572177886963, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8788270354270935, + "num_tokens": 785384789.0, + "step": 20581 + }, + { + "epoch": 2.6182419539498794, + "ewc_loss": 0.008450922556221485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450922177871689e-05, + "grad_norm": 4.217652320861816, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.881793737411499, + "num_tokens": 785424978.0, + "step": 20582 + }, + { + "epoch": 
2.6183691642284694, + "ewc_loss": 0.008538450114428997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538450492778793e-05, + "grad_norm": 4.297036647796631, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8760248422622681, + "num_tokens": 785464421.0, + "step": 20583 + }, + { + "epoch": 2.6184963745070604, + "ewc_loss": 0.008534126915037632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.534127118764445e-05, + "grad_norm": 4.215360641479492, + "learning_rate": 1e-06, + "loss": 0.2797, + "mean_token_accuracy": 0.9007776975631714, + "num_tokens": 785496166.0, + "step": 20584 + }, + { + "epoch": 2.6186235847856505, + "ewc_loss": 0.008480006828904152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480007090838626e-05, + "grad_norm": 4.342497825622559, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8687265515327454, + "num_tokens": 785528504.0, + "step": 20585 + }, + { + "epoch": 2.6187507950642415, + "ewc_loss": 0.008577057160437107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57705672387965e-05, + "grad_norm": 4.18349027633667, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8860423564910889, + "num_tokens": 785570896.0, + "step": 20586 + }, + { + "epoch": 2.6188780053428315, + "ewc_loss": 0.008459663949906826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.459663513349369e-05, + "grad_norm": 4.21092414855957, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8791501522064209, + "num_tokens": 785612464.0, + "step": 20587 + }, + { + "epoch": 2.619005215621422, + "ewc_loss": 0.00855213776230812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552137296646833e-05, + "grad_norm": 4.198978900909424, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8819383382797241, + "num_tokens": 785655500.0, + "step": 20588 + }, + { + "epoch": 2.6191324259000126, + "ewc_loss": 0.0085219731554389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.521973359165713e-05, + "grad_norm": 4.2450032234191895, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8857077360153198, + "num_tokens": 785691186.0, + "step": 20589 + }, + { + "epoch": 2.619259636178603, + "ewc_loss": 0.00852938648313284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529386832378805e-05, + "grad_norm": 4.25205659866333, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.8947697877883911, + "num_tokens": 785728575.0, + "step": 20590 + }, + { + "epoch": 2.6193868464571937, + "ewc_loss": 0.008525642566382885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525642624590546e-05, + "grad_norm": 4.250744342803955, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8808821439743042, + "num_tokens": 785765782.0, + "step": 20591 + }, + { + "epoch": 2.619514056735784, + "ewc_loss": 0.00853053294122219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530532795703039e-05, + "grad_norm": 4.296656131744385, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8874090909957886, + "num_tokens": 785800130.0, + "step": 20592 + }, + { + "epoch": 2.6196412670143747, + "ewc_loss": 0.008556263521313667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.556263492209837e-05, + "grad_norm": 4.219210147857666, + "learning_rate": 1e-06, + "loss": 0.2615, + "mean_token_accuracy": 0.9070240259170532, + "num_tokens": 785836647.0, + "step": 20593 + }, + { + "epoch": 2.6197684772929652, + "ewc_loss": 0.008494362235069275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.494361827615649e-05, + "grad_norm": 4.230101585388184, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.877905011177063, + "num_tokens": 785873897.0, + "step": 20594 + }, + { + "epoch": 2.6198956875715558, + "ewc_loss": 0.008520558476448059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520558185409755e-05, + "grad_norm": 4.240386486053467, + "learning_rate": 1e-06, + "loss": 0.3488, + 
"mean_token_accuracy": 0.8792757987976074, + "num_tokens": 785914296.0, + "step": 20595 + }, + { + "epoch": 2.6200228978501463, + "ewc_loss": 0.008524486795067787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524487202521414e-05, + "grad_norm": 4.269009113311768, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8711013793945312, + "num_tokens": 785951426.0, + "step": 20596 + }, + { + "epoch": 2.620150108128737, + "ewc_loss": 0.008524470031261444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52446974022314e-05, + "grad_norm": 4.2241902351379395, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8848856687545776, + "num_tokens": 785989906.0, + "step": 20597 + }, + { + "epoch": 2.6202773184073274, + "ewc_loss": 0.008494878187775612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.494878420606256e-05, + "grad_norm": 4.251901149749756, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8775968551635742, + "num_tokens": 786024763.0, + "step": 20598 + }, + { + "epoch": 2.620404528685918, + "ewc_loss": 0.008531425148248672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531425555702299e-05, + "grad_norm": 4.274292469024658, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8780046701431274, + "num_tokens": 786063055.0, + "step": 20599 + }, + { + "epoch": 2.6205317389645084, + "ewc_loss": 0.008511727675795555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51172735565342e-05, + "grad_norm": 4.33802604675293, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8591947555541992, + "num_tokens": 786097675.0, + "step": 20600 + }, + { + "epoch": 2.620658949243099, + "ewc_loss": 0.008565610274672508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565610187361017e-05, + "grad_norm": 4.2401652336120605, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8952693939208984, + "num_tokens": 786133721.0, + "step": 20601 + }, + { + "epoch": 
2.6207861595216895, + "ewc_loss": 0.008483942598104477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483942656312138e-05, + "grad_norm": 4.1688385009765625, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8660572171211243, + "num_tokens": 786179615.0, + "step": 20602 + }, + { + "epoch": 2.62091336980028, + "ewc_loss": 0.008514652028679848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51465156301856e-05, + "grad_norm": 4.235827922821045, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8956652879714966, + "num_tokens": 786214945.0, + "step": 20603 + }, + { + "epoch": 2.6210405800788705, + "ewc_loss": 0.008574633859097958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574633829994127e-05, + "grad_norm": 4.2396392822265625, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8854966163635254, + "num_tokens": 786250880.0, + "step": 20604 + }, + { + "epoch": 2.621167790357461, + "ewc_loss": 0.008532268926501274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532268839189783e-05, + "grad_norm": 4.258571147918701, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8832045793533325, + "num_tokens": 786286095.0, + "step": 20605 + }, + { + "epoch": 2.621295000636051, + "ewc_loss": 0.008568081073462963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.568081102566794e-05, + "grad_norm": 4.267135143280029, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8874876499176025, + "num_tokens": 786319547.0, + "step": 20606 + }, + { + "epoch": 2.621422210914642, + "ewc_loss": 0.008569537661969662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569537749281153e-05, + "grad_norm": 4.187788963317871, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8832148313522339, + "num_tokens": 786358699.0, + "step": 20607 + }, + { + "epoch": 2.621549421193232, + "ewc_loss": 0.008529147133231163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.529146725777537e-05, + "grad_norm": 4.233299255371094, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8881151080131531, + "num_tokens": 786399732.0, + "step": 20608 + }, + { + "epoch": 2.621676631471823, + "ewc_loss": 0.008574917912483215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57491759234108e-05, + "grad_norm": 4.220299243927002, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8742215633392334, + "num_tokens": 786437375.0, + "step": 20609 + }, + { + "epoch": 2.6218038417504133, + "ewc_loss": 0.008565947413444519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565947791794315e-05, + "grad_norm": 4.25784969329834, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8731388449668884, + "num_tokens": 786475538.0, + "step": 20610 + }, + { + "epoch": 2.621931052029004, + "ewc_loss": 0.008565401658415794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565402094973251e-05, + "grad_norm": 4.178985118865967, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8842868804931641, + "num_tokens": 786516802.0, + "step": 20611 + }, + { + "epoch": 2.6220582623075943, + "ewc_loss": 0.00853216927498579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532169158570468e-05, + "grad_norm": 4.221761226654053, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8838210105895996, + "num_tokens": 786558470.0, + "step": 20612 + }, + { + "epoch": 2.622185472586185, + "ewc_loss": 0.008575175888836384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575175888836384e-05, + "grad_norm": 4.202356815338135, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8771862983703613, + "num_tokens": 786601163.0, + "step": 20613 + }, + { + "epoch": 2.6223126828647754, + "ewc_loss": 0.008559034205973148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559034176869318e-05, + "grad_norm": 4.307712554931641, + "learning_rate": 1e-06, + "loss": 0.32, + 
"mean_token_accuracy": 0.8899824619293213, + "num_tokens": 786636741.0, + "step": 20614 + }, + { + "epoch": 2.622439893143366, + "ewc_loss": 0.00858835969120264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.588359924033284e-05, + "grad_norm": 4.22647762298584, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8736074566841125, + "num_tokens": 786672931.0, + "step": 20615 + }, + { + "epoch": 2.6225671034219564, + "ewc_loss": 0.008499312214553356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499312389176339e-05, + "grad_norm": 4.2537031173706055, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.895580530166626, + "num_tokens": 786709310.0, + "step": 20616 + }, + { + "epoch": 2.622694313700547, + "ewc_loss": 0.008536400273442268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536400127923116e-05, + "grad_norm": 4.238994598388672, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8883205056190491, + "num_tokens": 786747113.0, + "step": 20617 + }, + { + "epoch": 2.6228215239791375, + "ewc_loss": 0.008535613305866718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535613596905023e-05, + "grad_norm": 4.281985282897949, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8817956447601318, + "num_tokens": 786782033.0, + "step": 20618 + }, + { + "epoch": 2.622948734257728, + "ewc_loss": 0.008544426411390305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544426236767322e-05, + "grad_norm": 4.165348052978516, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8833864331245422, + "num_tokens": 786827229.0, + "step": 20619 + }, + { + "epoch": 2.6230759445363185, + "ewc_loss": 0.00847310945391655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47310948302038e-05, + "grad_norm": 4.248291015625, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.890863299369812, + "num_tokens": 786861722.0, + "step": 20620 + }, + { + "epoch": 2.623203154814909, 
+ "ewc_loss": 0.008574006147682667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574005914852023e-05, + "grad_norm": 4.2684831619262695, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8861898183822632, + "num_tokens": 786896202.0, + "step": 20621 + }, + { + "epoch": 2.6233303650934996, + "ewc_loss": 0.008551009930670261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551010250812396e-05, + "grad_norm": 4.256048202514648, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8853035569190979, + "num_tokens": 786932154.0, + "step": 20622 + }, + { + "epoch": 2.62345757537209, + "ewc_loss": 0.008515630848705769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51563090691343e-05, + "grad_norm": 4.21465539932251, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8742971420288086, + "num_tokens": 786972184.0, + "step": 20623 + }, + { + "epoch": 2.6235847856506807, + "ewc_loss": 0.008518761023879051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518761023879051e-05, + "grad_norm": 4.216506481170654, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8775414228439331, + "num_tokens": 787015514.0, + "step": 20624 + }, + { + "epoch": 2.623711995929271, + "ewc_loss": 0.008515398018062115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515398076269776e-05, + "grad_norm": 4.22912073135376, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8841196894645691, + "num_tokens": 787054023.0, + "step": 20625 + }, + { + "epoch": 2.6238392062078617, + "ewc_loss": 0.00852965284138918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529653132427484e-05, + "grad_norm": 4.277934551239014, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8790783882141113, + "num_tokens": 787088858.0, + "step": 20626 + }, + { + "epoch": 2.6239664164864522, + "ewc_loss": 0.008543474599719048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.543474541511387e-05, + "grad_norm": 
4.2381744384765625, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8849109411239624, + "num_tokens": 787124805.0, + "step": 20627 + }, + { + "epoch": 2.6240936267650428, + "ewc_loss": 0.008511703461408615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511703344993293e-05, + "grad_norm": 4.219865798950195, + "learning_rate": 1e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8942809104919434, + "num_tokens": 787164922.0, + "step": 20628 + }, + { + "epoch": 2.6242208370436333, + "ewc_loss": 0.008502276614308357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502276614308357e-05, + "grad_norm": 4.180065631866455, + "learning_rate": 1e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8939290046691895, + "num_tokens": 787204936.0, + "step": 20629 + }, + { + "epoch": 2.624348047322224, + "ewc_loss": 0.008509568870067596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509568579029292e-05, + "grad_norm": 4.25875997543335, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8933089971542358, + "num_tokens": 787238292.0, + "step": 20630 + }, + { + "epoch": 2.624475257600814, + "ewc_loss": 0.008558112196624279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558112313039601e-05, + "grad_norm": 4.276360034942627, + "learning_rate": 1e-06, + "loss": 0.2791, + "mean_token_accuracy": 0.9012879729270935, + "num_tokens": 787269054.0, + "step": 20631 + }, + { + "epoch": 2.624602467879405, + "ewc_loss": 0.008538954891264439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538954716641456e-05, + "grad_norm": 4.210001468658447, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8824090957641602, + "num_tokens": 787310022.0, + "step": 20632 + }, + { + "epoch": 2.624729678157995, + "ewc_loss": 0.008492425084114075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492424967698753e-05, + "grad_norm": 4.219448089599609, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8744968771934509, + 
"num_tokens": 787350917.0, + "step": 20633 + }, + { + "epoch": 2.624856888436586, + "ewc_loss": 0.00852765329182148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.527652971679345e-05, + "grad_norm": 4.221396446228027, + "learning_rate": 1e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8905999660491943, + "num_tokens": 787388187.0, + "step": 20634 + }, + { + "epoch": 2.624984098715176, + "ewc_loss": 0.008535442873835564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535443339496851e-05, + "grad_norm": 4.235886573791504, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8891090750694275, + "num_tokens": 787424998.0, + "step": 20635 + }, + { + "epoch": 2.6251113089937665, + "ewc_loss": 0.008529257029294968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529257320333272e-05, + "grad_norm": 4.190619468688965, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8942482471466064, + "num_tokens": 787465456.0, + "step": 20636 + }, + { + "epoch": 2.625238519272357, + "ewc_loss": 0.008485364727675915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.485364378429949e-05, + "grad_norm": 4.237432479858398, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8814189434051514, + "num_tokens": 787504797.0, + "step": 20637 + }, + { + "epoch": 2.6253657295509476, + "ewc_loss": 0.00851426925510168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514268847648054e-05, + "grad_norm": 4.194652557373047, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8926794528961182, + "num_tokens": 787542911.0, + "step": 20638 + }, + { + "epoch": 2.625492939829538, + "ewc_loss": 0.008479542098939419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47954215714708e-05, + "grad_norm": 4.313563346862793, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8751790523529053, + "num_tokens": 787576143.0, + "step": 20639 + }, + { + "epoch": 2.6256201501081287, + "ewc_loss": 0.008562840521335602, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562840957893059e-05, + "grad_norm": 4.196828842163086, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8855410814285278, + "num_tokens": 787619912.0, + "step": 20640 + }, + { + "epoch": 2.625747360386719, + "ewc_loss": 0.008430992253124714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.430991874774918e-05, + "grad_norm": 4.234679698944092, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8736162185668945, + "num_tokens": 787658181.0, + "step": 20641 + }, + { + "epoch": 2.6258745706653097, + "ewc_loss": 0.008504370227456093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.504370634909719e-05, + "grad_norm": 4.306462287902832, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8760929703712463, + "num_tokens": 787694889.0, + "step": 20642 + }, + { + "epoch": 2.6260017809439002, + "ewc_loss": 0.008519252762198448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51925287861377e-05, + "grad_norm": 4.188840389251709, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8956290483474731, + "num_tokens": 787737633.0, + "step": 20643 + }, + { + "epoch": 2.6261289912224908, + "ewc_loss": 0.008421655744314194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421655365964398e-05, + "grad_norm": 4.227957248687744, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8733484745025635, + "num_tokens": 787779069.0, + "step": 20644 + }, + { + "epoch": 2.6262562015010813, + "ewc_loss": 0.008493253961205482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49325442686677e-05, + "grad_norm": 4.304780006408691, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8713140487670898, + "num_tokens": 787815544.0, + "step": 20645 + }, + { + "epoch": 2.626383411779672, + "ewc_loss": 0.00850798562169075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.507985330652446e-05, + "grad_norm": 4.223507404327393, + "learning_rate": 
1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8785877227783203, + "num_tokens": 787852684.0, + "step": 20646 + }, + { + "epoch": 2.6265106220582624, + "ewc_loss": 0.008444339968264103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.444340346613899e-05, + "grad_norm": 4.233325958251953, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8778844475746155, + "num_tokens": 787893452.0, + "step": 20647 + }, + { + "epoch": 2.626637832336853, + "ewc_loss": 0.008509056642651558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509056351613253e-05, + "grad_norm": 4.256406784057617, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8835620880126953, + "num_tokens": 787929430.0, + "step": 20648 + }, + { + "epoch": 2.6267650426154434, + "ewc_loss": 0.00851150881499052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511509076924995e-05, + "grad_norm": 4.200103759765625, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8807253241539001, + "num_tokens": 787969768.0, + "step": 20649 + }, + { + "epoch": 2.626892252894034, + "ewc_loss": 0.008457520976662636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.457520743831992e-05, + "grad_norm": 4.22257661819458, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8834130764007568, + "num_tokens": 788009605.0, + "step": 20650 + }, + { + "epoch": 2.6270194631726245, + "ewc_loss": 0.008509245701134205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509245526511222e-05, + "grad_norm": 4.2455010414123535, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8946856260299683, + "num_tokens": 788044652.0, + "step": 20651 + }, + { + "epoch": 2.627146673451215, + "ewc_loss": 0.008510936051607132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510935731464997e-05, + "grad_norm": 4.259838104248047, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8591218590736389, + "num_tokens": 788085981.0, + "step": 20652 + }, 
+ { + "epoch": 2.6272738837298055, + "ewc_loss": 0.008512132801115513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51213262649253e-05, + "grad_norm": 4.215533256530762, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8799835443496704, + "num_tokens": 788132196.0, + "step": 20653 + }, + { + "epoch": 2.6274010940083956, + "ewc_loss": 0.00850596185773611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.505961886839941e-05, + "grad_norm": 4.309994220733643, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8950545787811279, + "num_tokens": 788160099.0, + "step": 20654 + }, + { + "epoch": 2.6275283042869866, + "ewc_loss": 0.008562401868402958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562401490053162e-05, + "grad_norm": 4.24053955078125, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8834770917892456, + "num_tokens": 788197264.0, + "step": 20655 + }, + { + "epoch": 2.6276555145655767, + "ewc_loss": 0.008481931872665882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481932309223339e-05, + "grad_norm": 4.244204044342041, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8940713405609131, + "num_tokens": 788232900.0, + "step": 20656 + }, + { + "epoch": 2.6277827248441676, + "ewc_loss": 0.00853455625474453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.534556400263682e-05, + "grad_norm": 4.233485698699951, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8773276805877686, + "num_tokens": 788273720.0, + "step": 20657 + }, + { + "epoch": 2.6279099351227577, + "ewc_loss": 0.008519237861037254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51923759910278e-05, + "grad_norm": 4.331399917602539, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8856143355369568, + "num_tokens": 788308425.0, + "step": 20658 + }, + { + "epoch": 2.6280371454013487, + "ewc_loss": 0.00854195561259985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.541955321561545e-05, + "grad_norm": 4.270496845245361, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8765525817871094, + "num_tokens": 788343701.0, + "step": 20659 + }, + { + "epoch": 2.628164355679939, + "ewc_loss": 0.008484148420393467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.484148565912619e-05, + "grad_norm": 4.3980512619018555, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8685954809188843, + "num_tokens": 788372789.0, + "step": 20660 + }, + { + "epoch": 2.6282915659585293, + "ewc_loss": 0.008584607392549515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58460771269165e-05, + "grad_norm": 4.252635478973389, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8864096403121948, + "num_tokens": 788408422.0, + "step": 20661 + }, + { + "epoch": 2.62841877623712, + "ewc_loss": 0.00847618468105793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476184302708134e-05, + "grad_norm": 4.290042400360107, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8864725232124329, + "num_tokens": 788446113.0, + "step": 20662 + }, + { + "epoch": 2.6285459865157104, + "ewc_loss": 0.00854365061968565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54365061968565e-05, + "grad_norm": 4.236632347106934, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8767846822738647, + "num_tokens": 788485009.0, + "step": 20663 + }, + { + "epoch": 2.628673196794301, + "ewc_loss": 0.008518771268427372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518771210219711e-05, + "grad_norm": 4.229053974151611, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.884778618812561, + "num_tokens": 788523446.0, + "step": 20664 + }, + { + "epoch": 2.6288004070728914, + "ewc_loss": 0.0085374154150486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537415851606056e-05, + "grad_norm": 4.234485626220703, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 
0.891209065914154, + "num_tokens": 788557423.0, + "step": 20665 + }, + { + "epoch": 2.628927617351482, + "ewc_loss": 0.00855239573866129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552395593142137e-05, + "grad_norm": 4.306891918182373, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8812761306762695, + "num_tokens": 788588177.0, + "step": 20666 + }, + { + "epoch": 2.6290548276300725, + "ewc_loss": 0.008581585250794888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581585279898718e-05, + "grad_norm": 4.258550643920898, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8847797513008118, + "num_tokens": 788624877.0, + "step": 20667 + }, + { + "epoch": 2.629182037908663, + "ewc_loss": 0.00853034108877182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530340710422024e-05, + "grad_norm": 4.210994243621826, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8927457332611084, + "num_tokens": 788662532.0, + "step": 20668 + }, + { + "epoch": 2.6293092481872535, + "ewc_loss": 0.008539700880646706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539701229892671e-05, + "grad_norm": 4.213103294372559, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8831672072410583, + "num_tokens": 788702734.0, + "step": 20669 + }, + { + "epoch": 2.629436458465844, + "ewc_loss": 0.008551369421184063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551369683118537e-05, + "grad_norm": 4.246933460235596, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8837863206863403, + "num_tokens": 788738041.0, + "step": 20670 + }, + { + "epoch": 2.6295636687444346, + "ewc_loss": 0.00856624636799097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566246106056497e-05, + "grad_norm": 4.273869514465332, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8847382068634033, + "num_tokens": 788772998.0, + "step": 20671 + }, + { + "epoch": 2.629690879023025, + "ewc_loss": 
0.00857479777187109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574797539040446e-05, + "grad_norm": 4.244443893432617, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.868077278137207, + "num_tokens": 788814310.0, + "step": 20672 + }, + { + "epoch": 2.6298180893016156, + "ewc_loss": 0.008538706228137016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538706606486812e-05, + "grad_norm": 4.208953380584717, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8873381018638611, + "num_tokens": 788852811.0, + "step": 20673 + }, + { + "epoch": 2.629945299580206, + "ewc_loss": 0.008548744954168797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54874451761134e-05, + "grad_norm": 4.282701015472412, + "learning_rate": 1e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8910889029502869, + "num_tokens": 788888756.0, + "step": 20674 + }, + { + "epoch": 2.6300725098587967, + "ewc_loss": 0.008568528108298779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.568527846364304e-05, + "grad_norm": 4.244706630706787, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8875710368156433, + "num_tokens": 788926292.0, + "step": 20675 + }, + { + "epoch": 2.6301997201373872, + "ewc_loss": 0.008530506864190102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530507329851389e-05, + "grad_norm": 4.229942321777344, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8925044536590576, + "num_tokens": 788965625.0, + "step": 20676 + }, + { + "epoch": 2.6303269304159778, + "ewc_loss": 0.008547106757760048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547106699552387e-05, + "grad_norm": 4.238119125366211, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.888158917427063, + "num_tokens": 789007390.0, + "step": 20677 + }, + { + "epoch": 2.6304541406945683, + "ewc_loss": 0.008542818017303944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542818250134587e-05, + "grad_norm": 
4.232940196990967, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8860120177268982, + "num_tokens": 789047940.0, + "step": 20678 + }, + { + "epoch": 2.6305813509731584, + "ewc_loss": 0.008516192436218262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516192610841244e-05, + "grad_norm": 4.2553181648254395, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8832385540008545, + "num_tokens": 789086321.0, + "step": 20679 + }, + { + "epoch": 2.6307085612517493, + "ewc_loss": 0.008538419380784035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538419206161052e-05, + "grad_norm": 4.234933853149414, + "learning_rate": 1e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8935942053794861, + "num_tokens": 789121757.0, + "step": 20680 + }, + { + "epoch": 2.6308357715303394, + "ewc_loss": 0.008514555171132088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514554792782292e-05, + "grad_norm": 4.236166000366211, + "learning_rate": 1e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8971647024154663, + "num_tokens": 789160076.0, + "step": 20681 + }, + { + "epoch": 2.6309629818089304, + "ewc_loss": 0.008515779860317707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51578006404452e-05, + "grad_norm": 4.288818836212158, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.874261736869812, + "num_tokens": 789197080.0, + "step": 20682 + }, + { + "epoch": 2.6310901920875205, + "ewc_loss": 0.00853737723082304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537377289030701e-05, + "grad_norm": 4.157376289367676, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8774261474609375, + "num_tokens": 789249876.0, + "step": 20683 + }, + { + "epoch": 2.6312174023661115, + "ewc_loss": 0.008444994688034058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.444994455203414e-05, + "grad_norm": 4.2368245124816895, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8801923990249634, + 
"num_tokens": 789290146.0, + "step": 20684 + }, + { + "epoch": 2.6313446126447015, + "ewc_loss": 0.008529147133231163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529147453373298e-05, + "grad_norm": 4.179391860961914, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8865392804145813, + "num_tokens": 789335610.0, + "step": 20685 + }, + { + "epoch": 2.631471822923292, + "ewc_loss": 0.008441733196377754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.441733371000737e-05, + "grad_norm": 4.2490997314453125, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8784545063972473, + "num_tokens": 789371956.0, + "step": 20686 + }, + { + "epoch": 2.6315990332018826, + "ewc_loss": 0.00852371845394373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523718861397356e-05, + "grad_norm": 4.227394104003906, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8782577514648438, + "num_tokens": 789415257.0, + "step": 20687 + }, + { + "epoch": 2.631726243480473, + "ewc_loss": 0.00845456775277853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454567432636395e-05, + "grad_norm": 4.228277206420898, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8829302191734314, + "num_tokens": 789452533.0, + "step": 20688 + }, + { + "epoch": 2.6318534537590637, + "ewc_loss": 0.00846448726952076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.464487473247573e-05, + "grad_norm": 4.251725196838379, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8801828622817993, + "num_tokens": 789488780.0, + "step": 20689 + }, + { + "epoch": 2.631980664037654, + "ewc_loss": 0.008462401106953621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462400728603825e-05, + "grad_norm": 4.215439319610596, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8792106509208679, + "num_tokens": 789528169.0, + "step": 20690 + }, + { + "epoch": 2.6321078743162447, + "ewc_loss": 0.008447036147117615, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447036088909954e-05, + "grad_norm": 4.255782604217529, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8762258291244507, + "num_tokens": 789568382.0, + "step": 20691 + }, + { + "epoch": 2.6322350845948352, + "ewc_loss": 0.008475827984511852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.475827780785039e-05, + "grad_norm": 4.20058012008667, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8809492588043213, + "num_tokens": 789606895.0, + "step": 20692 + }, + { + "epoch": 2.6323622948734258, + "ewc_loss": 0.008421136997640133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.421136590186507e-05, + "grad_norm": 4.226371765136719, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8686644434928894, + "num_tokens": 789649106.0, + "step": 20693 + }, + { + "epoch": 2.6324895051520163, + "ewc_loss": 0.008463460020720959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.46346010803245e-05, + "grad_norm": 4.21757698059082, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8860774040222168, + "num_tokens": 789689449.0, + "step": 20694 + }, + { + "epoch": 2.632616715430607, + "ewc_loss": 0.008432349190115929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.432349568465725e-05, + "grad_norm": 4.265749931335449, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8931617736816406, + "num_tokens": 789726179.0, + "step": 20695 + }, + { + "epoch": 2.6327439257091974, + "ewc_loss": 0.008459367789328098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.459368109470233e-05, + "grad_norm": 4.256250381469727, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8675406575202942, + "num_tokens": 789764858.0, + "step": 20696 + }, + { + "epoch": 2.632871135987788, + "ewc_loss": 0.008438131771981716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438131771981716e-05, + "grad_norm": 4.276798248291016, + "learning_rate": 
1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8713624477386475, + "num_tokens": 789799196.0, + "step": 20697 + }, + { + "epoch": 2.6329983462663784, + "ewc_loss": 0.008460847660899162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.460848039248958e-05, + "grad_norm": 4.1943359375, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.880206823348999, + "num_tokens": 789840498.0, + "step": 20698 + }, + { + "epoch": 2.633125556544969, + "ewc_loss": 0.00841367244720459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413672912865877e-05, + "grad_norm": 4.2510504722595215, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8863697052001953, + "num_tokens": 789874269.0, + "step": 20699 + }, + { + "epoch": 2.6332527668235595, + "ewc_loss": 0.008465217426419258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.465217251796275e-05, + "grad_norm": 4.203385829925537, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8831933736801147, + "num_tokens": 789914938.0, + "step": 20700 + }, + { + "epoch": 2.63337997710215, + "ewc_loss": 0.008415077812969685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.415077900281176e-05, + "grad_norm": 4.202724456787109, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8826004266738892, + "num_tokens": 789959219.0, + "step": 20701 + }, + { + "epoch": 2.6335071873807405, + "ewc_loss": 0.008462892845273018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462892583338544e-05, + "grad_norm": 4.317352294921875, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8762603998184204, + "num_tokens": 789993707.0, + "step": 20702 + }, + { + "epoch": 2.633634397659331, + "ewc_loss": 0.008500365540385246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500365947838873e-05, + "grad_norm": 4.221061706542969, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8650816679000854, + "num_tokens": 790034285.0, + "step": 20703 + }, + { + 
"epoch": 2.633761607937921, + "ewc_loss": 0.008419531397521496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.419531513936818e-05, + "grad_norm": 4.259162902832031, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8871008157730103, + "num_tokens": 790069714.0, + "step": 20704 + }, + { + "epoch": 2.633888818216512, + "ewc_loss": 0.008501709438860416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501709817210212e-05, + "grad_norm": 4.240509033203125, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8807233572006226, + "num_tokens": 790110733.0, + "step": 20705 + }, + { + "epoch": 2.634016028495102, + "ewc_loss": 0.008455649949610233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455650095129386e-05, + "grad_norm": 4.263425350189209, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8879956007003784, + "num_tokens": 790144468.0, + "step": 20706 + }, + { + "epoch": 2.634143238773693, + "ewc_loss": 0.00850063469260931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500634430674836e-05, + "grad_norm": 4.208703994750977, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8802686929702759, + "num_tokens": 790186881.0, + "step": 20707 + }, + { + "epoch": 2.6342704490522832, + "ewc_loss": 0.00845808070152998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.458080264972523e-05, + "grad_norm": 4.208978176116943, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.889784574508667, + "num_tokens": 790225083.0, + "step": 20708 + }, + { + "epoch": 2.6343976593308738, + "ewc_loss": 0.008480342105031013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48034251248464e-05, + "grad_norm": 4.216691970825195, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8816787600517273, + "num_tokens": 790266822.0, + "step": 20709 + }, + { + "epoch": 2.6345248696094643, + "ewc_loss": 0.008465253747999668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.465253631584346e-05, + "grad_norm": 4.287678241729736, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8807868957519531, + "num_tokens": 790305948.0, + "step": 20710 + }, + { + "epoch": 2.634652079888055, + "ewc_loss": 0.008493966422975063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.493966015521437e-05, + "grad_norm": 4.2141804695129395, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8838615417480469, + "num_tokens": 790341639.0, + "step": 20711 + }, + { + "epoch": 2.6347792901666454, + "ewc_loss": 0.008439666591584682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439666999038309e-05, + "grad_norm": 4.248994827270508, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8911402821540833, + "num_tokens": 790377442.0, + "step": 20712 + }, + { + "epoch": 2.634906500445236, + "ewc_loss": 0.008489133790135384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489134052069858e-05, + "grad_norm": 4.257242202758789, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8846218585968018, + "num_tokens": 790414911.0, + "step": 20713 + }, + { + "epoch": 2.6350337107238264, + "ewc_loss": 0.008477102033793926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477101800963283e-05, + "grad_norm": 4.220903396606445, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8813096284866333, + "num_tokens": 790453968.0, + "step": 20714 + }, + { + "epoch": 2.635160921002417, + "ewc_loss": 0.008449137210845947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44913738546893e-05, + "grad_norm": 4.284092426300049, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8777838349342346, + "num_tokens": 790493216.0, + "step": 20715 + }, + { + "epoch": 2.6352881312810075, + "ewc_loss": 0.00851486437022686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514864020980895e-05, + "grad_norm": 4.272639751434326, + "learning_rate": 1e-06, + "loss": 0.3462, + 
"mean_token_accuracy": 0.8785436153411865, + "num_tokens": 790529380.0, + "step": 20716 + }, + { + "epoch": 2.635415341559598, + "ewc_loss": 0.008477631956338882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477632218273357e-05, + "grad_norm": 4.242126941680908, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8785202503204346, + "num_tokens": 790569568.0, + "step": 20717 + }, + { + "epoch": 2.6355425518381885, + "ewc_loss": 0.008457853458821774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45785325509496e-05, + "grad_norm": 4.242215156555176, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8901031613349915, + "num_tokens": 790602269.0, + "step": 20718 + }, + { + "epoch": 2.635669762116779, + "ewc_loss": 0.008480949327349663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480949327349663e-05, + "grad_norm": 4.278546333312988, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8727077841758728, + "num_tokens": 790639723.0, + "step": 20719 + }, + { + "epoch": 2.6357969723953696, + "ewc_loss": 0.00849114265292883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491142216371372e-05, + "grad_norm": 4.269344329833984, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8702461123466492, + "num_tokens": 790676103.0, + "step": 20720 + }, + { + "epoch": 2.63592418267396, + "ewc_loss": 0.008504367433488369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.504366996930912e-05, + "grad_norm": 4.252443313598633, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8843746185302734, + "num_tokens": 790712118.0, + "step": 20721 + }, + { + "epoch": 2.6360513929525506, + "ewc_loss": 0.008483029901981354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483029523631558e-05, + "grad_norm": 4.224557876586914, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8827418088912964, + "num_tokens": 790747079.0, + "step": 20722 + }, + { + "epoch": 
2.636178603231141, + "ewc_loss": 0.008523416705429554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523416909156367e-05, + "grad_norm": 4.298497200012207, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8739261627197266, + "num_tokens": 790781741.0, + "step": 20723 + }, + { + "epoch": 2.6363058135097317, + "ewc_loss": 0.00856589525938034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565895404899493e-05, + "grad_norm": 4.241052150726318, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8823545575141907, + "num_tokens": 790820243.0, + "step": 20724 + }, + { + "epoch": 2.6364330237883222, + "ewc_loss": 0.008525240235030651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525240264134482e-05, + "grad_norm": 4.243045806884766, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.89860600233078, + "num_tokens": 790851492.0, + "step": 20725 + }, + { + "epoch": 2.6365602340669128, + "ewc_loss": 0.008554576896131039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.554576925234869e-05, + "grad_norm": 4.291710376739502, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.886559247970581, + "num_tokens": 790882293.0, + "step": 20726 + }, + { + "epoch": 2.636687444345503, + "ewc_loss": 0.008598960004746914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598960266681388e-05, + "grad_norm": 4.230137348175049, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8787081241607666, + "num_tokens": 790922188.0, + "step": 20727 + }, + { + "epoch": 2.636814654624094, + "ewc_loss": 0.008551745675504208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55174585012719e-05, + "grad_norm": 4.224359035491943, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8932112455368042, + "num_tokens": 790960675.0, + "step": 20728 + }, + { + "epoch": 2.636941864902684, + "ewc_loss": 0.008591298945248127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.591298683313653e-05, + "grad_norm": 4.247401237487793, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8897164463996887, + "num_tokens": 790999352.0, + "step": 20729 + }, + { + "epoch": 2.637069075181275, + "ewc_loss": 0.008586813695728779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58681378304027e-05, + "grad_norm": 4.257798671722412, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8833563923835754, + "num_tokens": 791035898.0, + "step": 20730 + }, + { + "epoch": 2.637196285459865, + "ewc_loss": 0.008585618808865547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58561834320426e-05, + "grad_norm": 4.264618396759033, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8611910343170166, + "num_tokens": 791079731.0, + "step": 20731 + }, + { + "epoch": 2.637323495738456, + "ewc_loss": 0.008576350286602974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576350228395313e-05, + "grad_norm": 4.174197673797607, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8948941230773926, + "num_tokens": 791118565.0, + "step": 20732 + }, + { + "epoch": 2.637450706017046, + "ewc_loss": 0.008552312850952148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552312647225335e-05, + "grad_norm": 4.3087358474731445, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8837093710899353, + "num_tokens": 791154876.0, + "step": 20733 + }, + { + "epoch": 2.6375779162956365, + "ewc_loss": 0.008651942014694214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.651941607240587e-05, + "grad_norm": 4.205766201019287, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8828921318054199, + "num_tokens": 791196003.0, + "step": 20734 + }, + { + "epoch": 2.637705126574227, + "ewc_loss": 0.008542218245565891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542217983631417e-05, + "grad_norm": 4.245909214019775, + "learning_rate": 1e-06, + "loss": 0.3473, + 
"mean_token_accuracy": 0.8805809020996094, + "num_tokens": 791236462.0, + "step": 20735 + }, + { + "epoch": 2.6378323368528176, + "ewc_loss": 0.008604834787547588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60483487485908e-05, + "grad_norm": 4.256425380706787, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8707734942436218, + "num_tokens": 791271918.0, + "step": 20736 + }, + { + "epoch": 2.637959547131408, + "ewc_loss": 0.008591779507696629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59177962411195e-05, + "grad_norm": 4.236015319824219, + "learning_rate": 1e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8932903409004211, + "num_tokens": 791309813.0, + "step": 20737 + }, + { + "epoch": 2.6380867574099987, + "ewc_loss": 0.008568032644689083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.568032353650779e-05, + "grad_norm": 4.291777610778809, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8931060433387756, + "num_tokens": 791338854.0, + "step": 20738 + }, + { + "epoch": 2.638213967688589, + "ewc_loss": 0.008607841096818447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60784130054526e-05, + "grad_norm": 4.227633476257324, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8879985809326172, + "num_tokens": 791376580.0, + "step": 20739 + }, + { + "epoch": 2.6383411779671797, + "ewc_loss": 0.008553990162909031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553990483051166e-05, + "grad_norm": 4.258883953094482, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8793148994445801, + "num_tokens": 791415931.0, + "step": 20740 + }, + { + "epoch": 2.6384683882457702, + "ewc_loss": 0.008586459793150425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586460171500221e-05, + "grad_norm": 4.2648444175720215, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8798736333847046, + "num_tokens": 791448201.0, + "step": 20741 + }, + { + "epoch": 
2.6385955985243608, + "ewc_loss": 0.008581233210861683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581233123550192e-05, + "grad_norm": 4.240349292755127, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8766636252403259, + "num_tokens": 791489183.0, + "step": 20742 + }, + { + "epoch": 2.6387228088029513, + "ewc_loss": 0.008572451770305634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572451770305634e-05, + "grad_norm": 4.263132572174072, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.884384036064148, + "num_tokens": 791528414.0, + "step": 20743 + }, + { + "epoch": 2.638850019081542, + "ewc_loss": 0.008585273288190365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585273462813348e-05, + "grad_norm": 4.223024845123291, + "learning_rate": 1e-06, + "loss": 0.2816, + "mean_token_accuracy": 0.9016832113265991, + "num_tokens": 791566379.0, + "step": 20744 + }, + { + "epoch": 2.6389772293601323, + "ewc_loss": 0.008557319641113281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.557319233659655e-05, + "grad_norm": 4.290691375732422, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8864012360572815, + "num_tokens": 791599789.0, + "step": 20745 + }, + { + "epoch": 2.639104439638723, + "ewc_loss": 0.008604135364294052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604134927736595e-05, + "grad_norm": 4.2625813484191895, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.871812105178833, + "num_tokens": 791642333.0, + "step": 20746 + }, + { + "epoch": 2.6392316499173134, + "ewc_loss": 0.008552792482078075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55279213283211e-05, + "grad_norm": 4.210146903991699, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8823789358139038, + "num_tokens": 791687190.0, + "step": 20747 + }, + { + "epoch": 2.639358860195904, + "ewc_loss": 0.008550625294446945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.550625352654606e-05, + "grad_norm": 4.274507522583008, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8694530725479126, + "num_tokens": 791722806.0, + "step": 20748 + }, + { + "epoch": 2.6394860704744945, + "ewc_loss": 0.00860049482434988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600494766142219e-05, + "grad_norm": 4.31475305557251, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8684633374214172, + "num_tokens": 791757726.0, + "step": 20749 + }, + { + "epoch": 2.639613280753085, + "ewc_loss": 0.008588247932493687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.588247874286026e-05, + "grad_norm": 4.265347003936768, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8911811113357544, + "num_tokens": 791795160.0, + "step": 20750 + }, + { + "epoch": 2.6397404910316755, + "ewc_loss": 0.008556129410862923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.556129614589736e-05, + "grad_norm": 4.214213848114014, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8942868113517761, + "num_tokens": 791831682.0, + "step": 20751 + }, + { + "epoch": 2.6398677013102656, + "ewc_loss": 0.008525182493031025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525182056473568e-05, + "grad_norm": 4.269867897033691, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8924474716186523, + "num_tokens": 791863230.0, + "step": 20752 + }, + { + "epoch": 2.6399949115888566, + "ewc_loss": 0.008594777435064316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594777318648994e-05, + "grad_norm": 4.187192440032959, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8939854502677917, + "num_tokens": 791904323.0, + "step": 20753 + }, + { + "epoch": 2.6401221218674467, + "ewc_loss": 0.00854200217872858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542001887690276e-05, + "grad_norm": 4.1997456550598145, + "learning_rate": 1e-06, + "loss": 0.3546, + 
"mean_token_accuracy": 0.8765223026275635, + "num_tokens": 791951836.0, + "step": 20754 + }, + { + "epoch": 2.6402493321460376, + "ewc_loss": 0.008572355844080448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572355727665126e-05, + "grad_norm": 4.263633728027344, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8841421008110046, + "num_tokens": 791989782.0, + "step": 20755 + }, + { + "epoch": 2.6403765424246277, + "ewc_loss": 0.008598213084042072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598213025834411e-05, + "grad_norm": 4.2707953453063965, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8845019936561584, + "num_tokens": 792024038.0, + "step": 20756 + }, + { + "epoch": 2.6405037527032187, + "ewc_loss": 0.008568281307816505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.568281191401184e-05, + "grad_norm": 4.16908597946167, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8821948766708374, + "num_tokens": 792067893.0, + "step": 20757 + }, + { + "epoch": 2.6406309629818088, + "ewc_loss": 0.008523054420948029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523054566467181e-05, + "grad_norm": 4.218689918518066, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8927873969078064, + "num_tokens": 792108593.0, + "step": 20758 + }, + { + "epoch": 2.6407581732603993, + "ewc_loss": 0.008579830639064312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579831046517938e-05, + "grad_norm": 4.282171726226807, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8942473530769348, + "num_tokens": 792141201.0, + "step": 20759 + }, + { + "epoch": 2.64088538353899, + "ewc_loss": 0.008572706952691078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572707156417891e-05, + "grad_norm": 4.222092628479004, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.9016168117523193, + "num_tokens": 792182908.0, + "step": 20760 + }, + { + "epoch": 
2.6410125938175804, + "ewc_loss": 0.008521015755832195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521015843143687e-05, + "grad_norm": 4.1969099044799805, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8830826282501221, + "num_tokens": 792225223.0, + "step": 20761 + }, + { + "epoch": 2.641139804096171, + "ewc_loss": 0.008509352803230286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50935248308815e-05, + "grad_norm": 4.3255720138549805, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8662875890731812, + "num_tokens": 792258131.0, + "step": 20762 + }, + { + "epoch": 2.6412670143747614, + "ewc_loss": 0.008587976917624474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.587976481067017e-05, + "grad_norm": 4.2448530197143555, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8876149654388428, + "num_tokens": 792291429.0, + "step": 20763 + }, + { + "epoch": 2.641394224653352, + "ewc_loss": 0.008490690030157566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490690379403532e-05, + "grad_norm": 4.2565765380859375, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8807043433189392, + "num_tokens": 792327777.0, + "step": 20764 + }, + { + "epoch": 2.6415214349319425, + "ewc_loss": 0.008527696132659912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.527695899829268e-05, + "grad_norm": 4.1861891746521, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8848898410797119, + "num_tokens": 792367918.0, + "step": 20765 + }, + { + "epoch": 2.641648645210533, + "ewc_loss": 0.008489539846777916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489540050504729e-05, + "grad_norm": 4.2704010009765625, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8856119513511658, + "num_tokens": 792405550.0, + "step": 20766 + }, + { + "epoch": 2.6417758554891235, + "ewc_loss": 0.008556948974728584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.556948887417093e-05, + "grad_norm": 4.27122163772583, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.884315550327301, + "num_tokens": 792440948.0, + "step": 20767 + }, + { + "epoch": 2.641903065767714, + "ewc_loss": 0.00852302648127079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523026190232486e-05, + "grad_norm": 4.178278923034668, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8901516199111938, + "num_tokens": 792488065.0, + "step": 20768 + }, + { + "epoch": 2.6420302760463046, + "ewc_loss": 0.008451974019408226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4519742813427e-05, + "grad_norm": 4.278501510620117, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8897399306297302, + "num_tokens": 792522845.0, + "step": 20769 + }, + { + "epoch": 2.642157486324895, + "ewc_loss": 0.008545873686671257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545874152332544e-05, + "grad_norm": 4.2424211502075195, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8924587368965149, + "num_tokens": 792561118.0, + "step": 20770 + }, + { + "epoch": 2.6422846966034856, + "ewc_loss": 0.00846166629344225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.461665856884792e-05, + "grad_norm": 4.239009380340576, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8825669288635254, + "num_tokens": 792600125.0, + "step": 20771 + }, + { + "epoch": 2.642411906882076, + "ewc_loss": 0.008474445901811123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474446076434106e-05, + "grad_norm": 4.2729010581970215, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8887863159179688, + "num_tokens": 792633856.0, + "step": 20772 + }, + { + "epoch": 2.6425391171606667, + "ewc_loss": 0.008489400148391724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.489400352118537e-05, + "grad_norm": 4.275357246398926, + "learning_rate": 1e-06, + "loss": 0.3507, + 
"mean_token_accuracy": 0.8812936544418335, + "num_tokens": 792672427.0, + "step": 20773 + }, + { + "epoch": 2.6426663274392572, + "ewc_loss": 0.008493524044752121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.493523637298495e-05, + "grad_norm": 4.250326156616211, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8636828064918518, + "num_tokens": 792713856.0, + "step": 20774 + }, + { + "epoch": 2.6427935377178478, + "ewc_loss": 0.0084714749827981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471474575344473e-05, + "grad_norm": 4.23935604095459, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8759212493896484, + "num_tokens": 792757602.0, + "step": 20775 + }, + { + "epoch": 2.6429207479964383, + "ewc_loss": 0.008481666445732117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48166600917466e-05, + "grad_norm": 4.277344703674316, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8767021894454956, + "num_tokens": 792797067.0, + "step": 20776 + }, + { + "epoch": 2.6430479582750284, + "ewc_loss": 0.008511637337505817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511637133779004e-05, + "grad_norm": 4.2545905113220215, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8860810995101929, + "num_tokens": 792835748.0, + "step": 20777 + }, + { + "epoch": 2.6431751685536193, + "ewc_loss": 0.008474926464259624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474926289636642e-05, + "grad_norm": 4.250027656555176, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8842489719390869, + "num_tokens": 792875039.0, + "step": 20778 + }, + { + "epoch": 2.6433023788322094, + "ewc_loss": 0.008488913998007774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48891431814991e-05, + "grad_norm": 4.246988296508789, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8885284662246704, + "num_tokens": 792916580.0, + "step": 20779 + }, + { + "epoch": 
2.6434295891108004, + "ewc_loss": 0.00847834162414074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47834162414074e-05, + "grad_norm": 4.296130180358887, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8843984603881836, + "num_tokens": 792952507.0, + "step": 20780 + }, + { + "epoch": 2.6435567993893905, + "ewc_loss": 0.008495999500155449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495999645674601e-05, + "grad_norm": 4.2192864418029785, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8869622945785522, + "num_tokens": 792994046.0, + "step": 20781 + }, + { + "epoch": 2.6436840096679814, + "ewc_loss": 0.008442088030278683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.442088437732309e-05, + "grad_norm": 4.262801647186279, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.8997424840927124, + "num_tokens": 793033855.0, + "step": 20782 + }, + { + "epoch": 2.6438112199465715, + "ewc_loss": 0.008471934124827385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47193441586569e-05, + "grad_norm": 4.24399995803833, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8859792947769165, + "num_tokens": 793072226.0, + "step": 20783 + }, + { + "epoch": 2.643938430225162, + "ewc_loss": 0.0084489481523633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.448948210570961e-05, + "grad_norm": 4.298327922821045, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.88657546043396, + "num_tokens": 793110884.0, + "step": 20784 + }, + { + "epoch": 2.6440656405037526, + "ewc_loss": 0.008483344689011574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483344572596252e-05, + "grad_norm": 4.306430339813232, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8662458062171936, + "num_tokens": 793151885.0, + "step": 20785 + }, + { + "epoch": 2.644192850782343, + "ewc_loss": 0.008453473448753357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.453473856206983e-05, + "grad_norm": 4.3081207275390625, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8619292974472046, + "num_tokens": 793187253.0, + "step": 20786 + }, + { + "epoch": 2.6443200610609336, + "ewc_loss": 0.008455600589513779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455600618617609e-05, + "grad_norm": 4.1961469650268555, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8879666328430176, + "num_tokens": 793231249.0, + "step": 20787 + }, + { + "epoch": 2.644447271339524, + "ewc_loss": 0.008393163792788982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.393164171138778e-05, + "grad_norm": 4.305518627166748, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8949931263923645, + "num_tokens": 793269918.0, + "step": 20788 + }, + { + "epoch": 2.6445744816181147, + "ewc_loss": 0.008468996733427048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.468997111776844e-05, + "grad_norm": 4.225833415985107, + "learning_rate": 1e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8931679725646973, + "num_tokens": 793306856.0, + "step": 20789 + }, + { + "epoch": 2.6447016918967052, + "ewc_loss": 0.008394808508455753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.394808537559584e-05, + "grad_norm": 4.293330192565918, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8767313957214355, + "num_tokens": 793347866.0, + "step": 20790 + }, + { + "epoch": 2.6448289021752958, + "ewc_loss": 0.008454333059489727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.454333146801218e-05, + "grad_norm": 4.176918983459473, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8835431337356567, + "num_tokens": 793391037.0, + "step": 20791 + }, + { + "epoch": 2.6449561124538863, + "ewc_loss": 0.008373581804335117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.373581658815965e-05, + "grad_norm": 4.366805553436279, + "learning_rate": 1e-06, + "loss": 0.3708, + 
"mean_token_accuracy": 0.8702139258384705, + "num_tokens": 793423560.0, + "step": 20792 + }, + { + "epoch": 2.645083322732477, + "ewc_loss": 0.00853288359940052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532883657608181e-05, + "grad_norm": 4.254234313964844, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8911048173904419, + "num_tokens": 793460215.0, + "step": 20793 + }, + { + "epoch": 2.6452105330110673, + "ewc_loss": 0.008403249084949493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.403248648392037e-05, + "grad_norm": 4.309341907501221, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8729045987129211, + "num_tokens": 793492671.0, + "step": 20794 + }, + { + "epoch": 2.645337743289658, + "ewc_loss": 0.008478399366140366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.478399104205891e-05, + "grad_norm": 4.259486198425293, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8821498155593872, + "num_tokens": 793529526.0, + "step": 20795 + }, + { + "epoch": 2.6454649535682484, + "ewc_loss": 0.008449938148260117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.44993774080649e-05, + "grad_norm": 4.236167907714844, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8922501802444458, + "num_tokens": 793566748.0, + "step": 20796 + }, + { + "epoch": 2.645592163846839, + "ewc_loss": 0.008463052101433277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.463051926810294e-05, + "grad_norm": 4.223466396331787, + "learning_rate": 1e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8985279202461243, + "num_tokens": 793600703.0, + "step": 20797 + }, + { + "epoch": 2.6457193741254295, + "ewc_loss": 0.008471219800412655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.471219916827977e-05, + "grad_norm": 4.252668380737305, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8828467130661011, + "num_tokens": 793639226.0, + "step": 20798 + }, + { + "epoch": 
2.64584658440402, + "ewc_loss": 0.008495486341416836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495486690662801e-05, + "grad_norm": 4.245368957519531, + "learning_rate": 1e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8964073657989502, + "num_tokens": 793676147.0, + "step": 20799 + }, + { + "epoch": 2.6459737946826105, + "ewc_loss": 0.008481084369122982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481084660161287e-05, + "grad_norm": 4.219671726226807, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8919907808303833, + "num_tokens": 793715778.0, + "step": 20800 + }, + { + "epoch": 2.646101004961201, + "ewc_loss": 0.008505311794579029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.505312143824995e-05, + "grad_norm": 4.239696502685547, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8970905542373657, + "num_tokens": 793752096.0, + "step": 20801 + }, + { + "epoch": 2.646228215239791, + "ewc_loss": 0.00849463976919651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49463976919651e-05, + "grad_norm": 4.244029521942139, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8784998655319214, + "num_tokens": 793789809.0, + "step": 20802 + }, + { + "epoch": 2.646355425518382, + "ewc_loss": 0.008509818464517593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50981887197122e-05, + "grad_norm": 4.270441055297852, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8787406086921692, + "num_tokens": 793826505.0, + "step": 20803 + }, + { + "epoch": 2.646482635796972, + "ewc_loss": 0.008519722148776054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.519722177879885e-05, + "grad_norm": 4.210946559906006, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.883970320224762, + "num_tokens": 793868360.0, + "step": 20804 + }, + { + "epoch": 2.646609846075563, + "ewc_loss": 0.00849191565066576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49191565066576e-05, + 
"grad_norm": 4.256953239440918, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8856987357139587, + "num_tokens": 793906566.0, + "step": 20805 + }, + { + "epoch": 2.6467370563541532, + "ewc_loss": 0.008537562564015388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537562825949863e-05, + "grad_norm": 4.2570929527282715, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8820908069610596, + "num_tokens": 793947814.0, + "step": 20806 + }, + { + "epoch": 2.6468642666327438, + "ewc_loss": 0.0085106510668993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510651241522282e-05, + "grad_norm": 4.181023597717285, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8895284533500671, + "num_tokens": 793987391.0, + "step": 20807 + }, + { + "epoch": 2.6469914769113343, + "ewc_loss": 0.008486302569508553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486302249366418e-05, + "grad_norm": 4.246104717254639, + "learning_rate": 1e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8953614830970764, + "num_tokens": 794024045.0, + "step": 20808 + }, + { + "epoch": 2.647118687189925, + "ewc_loss": 0.008547551929950714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547551988158375e-05, + "grad_norm": 4.201357364654541, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8882398009300232, + "num_tokens": 794063405.0, + "step": 20809 + }, + { + "epoch": 2.6472458974685154, + "ewc_loss": 0.008500734344124794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50073411129415e-05, + "grad_norm": 4.364821910858154, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8625015020370483, + "num_tokens": 794099729.0, + "step": 20810 + }, + { + "epoch": 2.647373107747106, + "ewc_loss": 0.008594798855483532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594799146521837e-05, + "grad_norm": 4.224018573760986, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8770449757575989, 
+ "num_tokens": 794139669.0, + "step": 20811 + }, + { + "epoch": 2.6475003180256964, + "ewc_loss": 0.008455854840576649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.455854549538344e-05, + "grad_norm": 4.236892223358154, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8834128379821777, + "num_tokens": 794176906.0, + "step": 20812 + }, + { + "epoch": 2.647627528304287, + "ewc_loss": 0.008525297977030277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525297744199634e-05, + "grad_norm": 4.234658241271973, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.886648416519165, + "num_tokens": 794216594.0, + "step": 20813 + }, + { + "epoch": 2.6477547385828775, + "ewc_loss": 0.008512087166309357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512087515555322e-05, + "grad_norm": 4.301571846008301, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8855859041213989, + "num_tokens": 794248379.0, + "step": 20814 + }, + { + "epoch": 2.647881948861468, + "ewc_loss": 0.008555533364415169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555532986065373e-05, + "grad_norm": 4.232966423034668, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.862977147102356, + "num_tokens": 794293604.0, + "step": 20815 + }, + { + "epoch": 2.6480091591400585, + "ewc_loss": 0.008510105311870575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510105544701219e-05, + "grad_norm": 4.2445902824401855, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8753561973571777, + "num_tokens": 794332084.0, + "step": 20816 + }, + { + "epoch": 2.648136369418649, + "ewc_loss": 0.00854115653783083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541156421415508e-05, + "grad_norm": 4.198191165924072, + "learning_rate": 1e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.8976989984512329, + "num_tokens": 794369158.0, + "step": 20817 + }, + { + "epoch": 2.6482635796972396, + "ewc_loss": 0.00853536557406187, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535365486750379e-05, + "grad_norm": 4.245558261871338, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8870844841003418, + "num_tokens": 794410531.0, + "step": 20818 + }, + { + "epoch": 2.64839078997583, + "ewc_loss": 0.008556093089282513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.556093234801665e-05, + "grad_norm": 4.161342144012451, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8827914595603943, + "num_tokens": 794457556.0, + "step": 20819 + }, + { + "epoch": 2.6485180002544206, + "ewc_loss": 0.008503010496497154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.503010758431628e-05, + "grad_norm": 4.293910980224609, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8725017309188843, + "num_tokens": 794492737.0, + "step": 20820 + }, + { + "epoch": 2.648645210533011, + "ewc_loss": 0.008598005399107933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598004933446646e-05, + "grad_norm": 4.227661609649658, + "learning_rate": 1e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8931128978729248, + "num_tokens": 794532589.0, + "step": 20821 + }, + { + "epoch": 2.6487724208116017, + "ewc_loss": 0.008518719114363194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51871955092065e-05, + "grad_norm": 4.287116050720215, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8722913265228271, + "num_tokens": 794566982.0, + "step": 20822 + }, + { + "epoch": 2.648899631090192, + "ewc_loss": 0.008583182469010353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583182352595031e-05, + "grad_norm": 4.291110515594482, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.869125247001648, + "num_tokens": 794604278.0, + "step": 20823 + }, + { + "epoch": 2.6490268413687827, + "ewc_loss": 0.008559513837099075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559513662476093e-05, + "grad_norm": 4.200758457183838, + 
"learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8922654390335083, + "num_tokens": 794643785.0, + "step": 20824 + }, + { + "epoch": 2.649154051647373, + "ewc_loss": 0.00851887371391058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518873801222071e-05, + "grad_norm": 4.259250640869141, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8937737941741943, + "num_tokens": 794678302.0, + "step": 20825 + }, + { + "epoch": 2.649281261925964, + "ewc_loss": 0.008573440834879875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573440572945401e-05, + "grad_norm": 4.250864505767822, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.874018132686615, + "num_tokens": 794715732.0, + "step": 20826 + }, + { + "epoch": 2.649408472204554, + "ewc_loss": 0.008538496680557728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538497058907524e-05, + "grad_norm": 4.213867664337158, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8879294395446777, + "num_tokens": 794754359.0, + "step": 20827 + }, + { + "epoch": 2.649535682483145, + "ewc_loss": 0.008546936325728893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546936442144215e-05, + "grad_norm": 4.220287799835205, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8800767660140991, + "num_tokens": 794795770.0, + "step": 20828 + }, + { + "epoch": 2.649662892761735, + "ewc_loss": 0.008556226268410683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.556226384826005e-05, + "grad_norm": 4.249626636505127, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8782976865768433, + "num_tokens": 794835654.0, + "step": 20829 + }, + { + "epoch": 2.649790103040326, + "ewc_loss": 0.008559931069612503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559931302443147e-05, + "grad_norm": 4.2167558670043945, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8850053548812866, + "num_tokens": 794872527.0, + 
"step": 20830 + }, + { + "epoch": 2.649917313318916, + "ewc_loss": 0.008531471714377403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531471394235268e-05, + "grad_norm": 4.171726703643799, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8872724175453186, + "num_tokens": 794918577.0, + "step": 20831 + }, + { + "epoch": 2.6500445235975065, + "ewc_loss": 0.008525432087481022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525432349415496e-05, + "grad_norm": 4.314955711364746, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8927066326141357, + "num_tokens": 794953566.0, + "step": 20832 + }, + { + "epoch": 2.650171733876097, + "ewc_loss": 0.008615301921963692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615302067482844e-05, + "grad_norm": 4.284230709075928, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8903892040252686, + "num_tokens": 794990547.0, + "step": 20833 + }, + { + "epoch": 2.6502989441546876, + "ewc_loss": 0.008539318107068539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539318514522165e-05, + "grad_norm": 4.234340667724609, + "learning_rate": 1e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8987478613853455, + "num_tokens": 795028419.0, + "step": 20834 + }, + { + "epoch": 2.650426154433278, + "ewc_loss": 0.008513830602169037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51383083499968e-05, + "grad_norm": 4.267139911651611, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8988355398178101, + "num_tokens": 795064303.0, + "step": 20835 + }, + { + "epoch": 2.6505533647118686, + "ewc_loss": 0.008555509150028229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555508975405246e-05, + "grad_norm": 4.274509906768799, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8628047704696655, + "num_tokens": 795104280.0, + "step": 20836 + }, + { + "epoch": 2.650680574990459, + "ewc_loss": 0.008538136258721352, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.53813617140986e-05, + "grad_norm": 4.25739049911499, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8761741518974304, + "num_tokens": 795142547.0, + "step": 20837 + }, + { + "epoch": 2.6508077852690497, + "ewc_loss": 0.008517256937921047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5172570834402e-05, + "grad_norm": 4.203973293304443, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8816606402397156, + "num_tokens": 795186193.0, + "step": 20838 + }, + { + "epoch": 2.6509349955476402, + "ewc_loss": 0.00848038587719202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480385440634564e-05, + "grad_norm": 4.294844627380371, + "learning_rate": 1e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.8999929428100586, + "num_tokens": 795215453.0, + "step": 20839 + }, + { + "epoch": 2.6510622058262308, + "ewc_loss": 0.008549255318939686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549255289835855e-05, + "grad_norm": 4.236232757568359, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8794693946838379, + "num_tokens": 795252820.0, + "step": 20840 + }, + { + "epoch": 2.6511894161048213, + "ewc_loss": 0.00848404597491026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.484045974910259e-05, + "grad_norm": 4.248464107513428, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8812289237976074, + "num_tokens": 795290000.0, + "step": 20841 + }, + { + "epoch": 2.651316626383412, + "ewc_loss": 0.008527291938662529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.527292084181681e-05, + "grad_norm": 4.188473701477051, + "learning_rate": 1e-06, + "loss": 0.283, + "mean_token_accuracy": 0.900038480758667, + "num_tokens": 795330389.0, + "step": 20842 + }, + { + "epoch": 2.6514438366620023, + "ewc_loss": 0.00847602542489767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.476025686832145e-05, + "grad_norm": 4.270805358886719, + "learning_rate": 1e-06, + "loss": 0.3433, + 
"mean_token_accuracy": 0.881268322467804, + "num_tokens": 795366693.0, + "step": 20843 + }, + { + "epoch": 2.651571046940593, + "ewc_loss": 0.008550797589123249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.550797792850062e-05, + "grad_norm": 4.206006050109863, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8839872479438782, + "num_tokens": 795405748.0, + "step": 20844 + }, + { + "epoch": 2.6516982572191834, + "ewc_loss": 0.008487804792821407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.487804734613746e-05, + "grad_norm": 4.225613117218018, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.8952662348747253, + "num_tokens": 795446179.0, + "step": 20845 + }, + { + "epoch": 2.651825467497774, + "ewc_loss": 0.00851727370172739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517273818142712e-05, + "grad_norm": 4.2201128005981445, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8798058032989502, + "num_tokens": 795488234.0, + "step": 20846 + }, + { + "epoch": 2.6519526777763645, + "ewc_loss": 0.0084989657625556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498965325998142e-05, + "grad_norm": 4.259146213531494, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8813151121139526, + "num_tokens": 795526508.0, + "step": 20847 + }, + { + "epoch": 2.652079888054955, + "ewc_loss": 0.008526463992893696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.526464080205187e-05, + "grad_norm": 4.21402645111084, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8907479047775269, + "num_tokens": 795566299.0, + "step": 20848 + }, + { + "epoch": 2.6522070983335455, + "ewc_loss": 0.008474801667034626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.474801870761439e-05, + "grad_norm": 4.257811069488525, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8787434101104736, + "num_tokens": 795603892.0, + "step": 20849 + }, + { + "epoch": 
2.6523343086121356, + "ewc_loss": 0.008509824983775616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509824692737311e-05, + "grad_norm": 4.232917785644531, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8909057974815369, + "num_tokens": 795639622.0, + "step": 20850 + }, + { + "epoch": 2.6524615188907266, + "ewc_loss": 0.008490539155900478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490539039485157e-05, + "grad_norm": 4.226648330688477, + "learning_rate": 1e-06, + "loss": 0.2734, + "mean_token_accuracy": 0.9037808179855347, + "num_tokens": 795677299.0, + "step": 20851 + }, + { + "epoch": 2.6525887291693167, + "ewc_loss": 0.008480462245643139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.480462565785274e-05, + "grad_norm": 4.23258113861084, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8848146796226501, + "num_tokens": 795714304.0, + "step": 20852 + }, + { + "epoch": 2.6527159394479076, + "ewc_loss": 0.008492538705468178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492538472637534e-05, + "grad_norm": 4.273680686950684, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.880614697933197, + "num_tokens": 795752330.0, + "step": 20853 + }, + { + "epoch": 2.6528431497264977, + "ewc_loss": 0.00851170253008604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511702617397532e-05, + "grad_norm": 4.271764278411865, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8638140559196472, + "num_tokens": 795789503.0, + "step": 20854 + }, + { + "epoch": 2.6529703600050887, + "ewc_loss": 0.008499311283230782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499310933984816e-05, + "grad_norm": 4.221343040466309, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8837317228317261, + "num_tokens": 795828449.0, + "step": 20855 + }, + { + "epoch": 2.6530975702836788, + "ewc_loss": 0.008488158695399761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.488158346153796e-05, + "grad_norm": 4.230734825134277, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8827217817306519, + "num_tokens": 795868454.0, + "step": 20856 + }, + { + "epoch": 2.6532247805622693, + "ewc_loss": 0.008520841598510742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520841220160946e-05, + "grad_norm": 4.329869270324707, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8703092336654663, + "num_tokens": 795904629.0, + "step": 20857 + }, + { + "epoch": 2.65335199084086, + "ewc_loss": 0.008555661886930466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555661770515144e-05, + "grad_norm": 4.257246971130371, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.870612621307373, + "num_tokens": 795943197.0, + "step": 20858 + }, + { + "epoch": 2.6534792011194503, + "ewc_loss": 0.00848354585468769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.483546116622165e-05, + "grad_norm": 4.197225093841553, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.879213809967041, + "num_tokens": 795984672.0, + "step": 20859 + }, + { + "epoch": 2.653606411398041, + "ewc_loss": 0.008507446385920048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.507446182193235e-05, + "grad_norm": 4.293005466461182, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8667473793029785, + "num_tokens": 796022599.0, + "step": 20860 + }, + { + "epoch": 2.6537336216766314, + "ewc_loss": 0.008569227531552315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569227793486789e-05, + "grad_norm": 4.358948230743408, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8828791379928589, + "num_tokens": 796057798.0, + "step": 20861 + }, + { + "epoch": 2.653860831955222, + "ewc_loss": 0.008578750304877758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578750566812232e-05, + "grad_norm": 4.267514705657959, + "learning_rate": 1e-06, + "loss": 0.3307, + 
"mean_token_accuracy": 0.8824357986450195, + "num_tokens": 796094928.0, + "step": 20862 + }, + { + "epoch": 2.6539880422338125, + "ewc_loss": 0.008503743447363377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.503743447363377e-05, + "grad_norm": 4.196149826049805, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8862514495849609, + "num_tokens": 796132614.0, + "step": 20863 + }, + { + "epoch": 2.654115252512403, + "ewc_loss": 0.00853858795017004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538588008377701e-05, + "grad_norm": 4.235795021057129, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8819329142570496, + "num_tokens": 796171653.0, + "step": 20864 + }, + { + "epoch": 2.6542424627909935, + "ewc_loss": 0.008568545803427696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.56854603625834e-05, + "grad_norm": 4.221553325653076, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8781609535217285, + "num_tokens": 796215218.0, + "step": 20865 + }, + { + "epoch": 2.654369673069584, + "ewc_loss": 0.008544246666133404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544246520614251e-05, + "grad_norm": 4.193243026733398, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8943617343902588, + "num_tokens": 796249165.0, + "step": 20866 + }, + { + "epoch": 2.6544968833481746, + "ewc_loss": 0.008543428964912891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.543428702978417e-05, + "grad_norm": 4.269001007080078, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8798428773880005, + "num_tokens": 796283878.0, + "step": 20867 + }, + { + "epoch": 2.654624093626765, + "ewc_loss": 0.008598693646490574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598693239036947e-05, + "grad_norm": 4.172766208648682, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8712462782859802, + "num_tokens": 796330760.0, + "step": 20868 + }, + { + "epoch": 
2.6547513039053556, + "ewc_loss": 0.008518395945429802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518395770806819e-05, + "grad_norm": 4.221357345581055, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8737459182739258, + "num_tokens": 796371547.0, + "step": 20869 + }, + { + "epoch": 2.654878514183946, + "ewc_loss": 0.008590348064899445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590347715653479e-05, + "grad_norm": 4.244406223297119, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8707079887390137, + "num_tokens": 796409893.0, + "step": 20870 + }, + { + "epoch": 2.6550057244625367, + "ewc_loss": 0.008592109195888042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592109224991873e-05, + "grad_norm": 4.328032970428467, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8692386150360107, + "num_tokens": 796442493.0, + "step": 20871 + }, + { + "epoch": 2.655132934741127, + "ewc_loss": 0.008615504018962383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615504339104518e-05, + "grad_norm": 4.239326000213623, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8847923874855042, + "num_tokens": 796480853.0, + "step": 20872 + }, + { + "epoch": 2.6552601450197177, + "ewc_loss": 0.008545608259737492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545607852283865e-05, + "grad_norm": 4.2343034744262695, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8889090418815613, + "num_tokens": 796520057.0, + "step": 20873 + }, + { + "epoch": 2.6553873552983083, + "ewc_loss": 0.00857196282595396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57196282595396e-05, + "grad_norm": 4.290078639984131, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8872581720352173, + "num_tokens": 796549779.0, + "step": 20874 + }, + { + "epoch": 2.6555145655768984, + "ewc_loss": 0.008612184785306454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.612185047240928e-05, + "grad_norm": 4.214754104614258, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8864401578903198, + "num_tokens": 796590923.0, + "step": 20875 + }, + { + "epoch": 2.6556417758554893, + "ewc_loss": 0.008536244742572308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536244422430173e-05, + "grad_norm": 4.242721080780029, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8787612318992615, + "num_tokens": 796629455.0, + "step": 20876 + }, + { + "epoch": 2.6557689861340794, + "ewc_loss": 0.008587706834077835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58770654303953e-05, + "grad_norm": 4.272602558135986, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8796130418777466, + "num_tokens": 796669950.0, + "step": 20877 + }, + { + "epoch": 2.6558961964126704, + "ewc_loss": 0.00859412457793951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594124665251002e-05, + "grad_norm": 4.217494964599609, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8734418153762817, + "num_tokens": 796712263.0, + "step": 20878 + }, + { + "epoch": 2.6560234066912605, + "ewc_loss": 0.008541399613022804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541399438399822e-05, + "grad_norm": 4.244884014129639, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8818696737289429, + "num_tokens": 796753025.0, + "step": 20879 + }, + { + "epoch": 2.6561506169698514, + "ewc_loss": 0.008591732941567898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59173305798322e-05, + "grad_norm": 4.254824161529541, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8948758840560913, + "num_tokens": 796788810.0, + "step": 20880 + }, + { + "epoch": 2.6562778272484415, + "ewc_loss": 0.008560547605156898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560547576053068e-05, + "grad_norm": 4.248322486877441, + "learning_rate": 1e-06, + "loss": 0.3403, + 
"mean_token_accuracy": 0.8870096206665039, + "num_tokens": 796826275.0, + "step": 20881 + }, + { + "epoch": 2.656405037527032, + "ewc_loss": 0.008549273014068604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549273479729891e-05, + "grad_norm": 4.322105407714844, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8793916702270508, + "num_tokens": 796859587.0, + "step": 20882 + }, + { + "epoch": 2.6565322478056226, + "ewc_loss": 0.008597737178206444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597737178206444e-05, + "grad_norm": 4.197330474853516, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8893747329711914, + "num_tokens": 796899440.0, + "step": 20883 + }, + { + "epoch": 2.656659458084213, + "ewc_loss": 0.008482593111693859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482592966174707e-05, + "grad_norm": 4.247735023498535, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.873670220375061, + "num_tokens": 796938703.0, + "step": 20884 + }, + { + "epoch": 2.6567866683628036, + "ewc_loss": 0.008555661886930466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555661770515144e-05, + "grad_norm": 4.262103080749512, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8706002831459045, + "num_tokens": 796982287.0, + "step": 20885 + }, + { + "epoch": 2.656913878641394, + "ewc_loss": 0.00854590255767107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54590252856724e-05, + "grad_norm": 4.282001972198486, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8839112520217896, + "num_tokens": 797017640.0, + "step": 20886 + }, + { + "epoch": 2.6570410889199847, + "ewc_loss": 0.00855331216007471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553312363801524e-05, + "grad_norm": 4.258980751037598, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.868056058883667, + "num_tokens": 797054050.0, + "step": 20887 + }, + { + "epoch": 
2.6571682991985752, + "ewc_loss": 0.008541995659470558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541995339328423e-05, + "grad_norm": 4.22373104095459, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.887466549873352, + "num_tokens": 797092429.0, + "step": 20888 + }, + { + "epoch": 2.6572955094771658, + "ewc_loss": 0.008524775505065918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524775330442935e-05, + "grad_norm": 4.3378167152404785, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8802772164344788, + "num_tokens": 797125646.0, + "step": 20889 + }, + { + "epoch": 2.6574227197557563, + "ewc_loss": 0.00861444603651762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614445687271655e-05, + "grad_norm": 4.208348274230957, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8847545385360718, + "num_tokens": 797164560.0, + "step": 20890 + }, + { + "epoch": 2.657549930034347, + "ewc_loss": 0.008500492200255394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500491821905598e-05, + "grad_norm": 4.331282615661621, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8826653957366943, + "num_tokens": 797194168.0, + "step": 20891 + }, + { + "epoch": 2.6576771403129373, + "ewc_loss": 0.008625103160738945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62510350998491e-05, + "grad_norm": 4.26050329208374, + "learning_rate": 1e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.9018819332122803, + "num_tokens": 797227045.0, + "step": 20892 + }, + { + "epoch": 2.657804350591528, + "ewc_loss": 0.00855529960244894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555299427825958e-05, + "grad_norm": 4.218050479888916, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8959867358207703, + "num_tokens": 797267607.0, + "step": 20893 + }, + { + "epoch": 2.6579315608701184, + "ewc_loss": 0.008548518642783165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5485189629253e-05, 
+ "grad_norm": 4.246981143951416, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8716270923614502, + "num_tokens": 797309320.0, + "step": 20894 + }, + { + "epoch": 2.658058771148709, + "ewc_loss": 0.00860634632408619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.606346091255546e-05, + "grad_norm": 4.265119552612305, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8820779323577881, + "num_tokens": 797348258.0, + "step": 20895 + }, + { + "epoch": 2.6581859814272994, + "ewc_loss": 0.008580891415476799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580891153542325e-05, + "grad_norm": 4.237742900848389, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8829934597015381, + "num_tokens": 797391563.0, + "step": 20896 + }, + { + "epoch": 2.65831319170589, + "ewc_loss": 0.00855892151594162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558921399526298e-05, + "grad_norm": 4.227717399597168, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.891086220741272, + "num_tokens": 797428314.0, + "step": 20897 + }, + { + "epoch": 2.6584404019844805, + "ewc_loss": 0.008577508851885796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577508560847491e-05, + "grad_norm": 4.351676940917969, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8649662733078003, + "num_tokens": 797464480.0, + "step": 20898 + }, + { + "epoch": 2.658567612263071, + "ewc_loss": 0.008625296875834465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625297050457448e-05, + "grad_norm": 4.18609619140625, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8833356499671936, + "num_tokens": 797504135.0, + "step": 20899 + }, + { + "epoch": 2.658694822541661, + "ewc_loss": 0.008505314588546753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.505314326612279e-05, + "grad_norm": 4.262052536010742, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8858399391174316, + 
"num_tokens": 797544222.0, + "step": 20900 + }, + { + "epoch": 2.658822032820252, + "ewc_loss": 0.008597305975854397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597305713919923e-05, + "grad_norm": 4.201240539550781, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8890883922576904, + "num_tokens": 797583695.0, + "step": 20901 + }, + { + "epoch": 2.658949243098842, + "ewc_loss": 0.008536096662282944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536096720490605e-05, + "grad_norm": 4.238203525543213, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8707431554794312, + "num_tokens": 797625660.0, + "step": 20902 + }, + { + "epoch": 2.659076453377433, + "ewc_loss": 0.00856892578303814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5689258412458e-05, + "grad_norm": 4.235552787780762, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8832612037658691, + "num_tokens": 797665807.0, + "step": 20903 + }, + { + "epoch": 2.6592036636560232, + "ewc_loss": 0.008545666933059692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54566678754054e-05, + "grad_norm": 4.277544975280762, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8694602251052856, + "num_tokens": 797707935.0, + "step": 20904 + }, + { + "epoch": 2.6593308739346138, + "ewc_loss": 0.008559372276067734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559372508898377e-05, + "grad_norm": 4.278026103973389, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8766650557518005, + "num_tokens": 797745502.0, + "step": 20905 + }, + { + "epoch": 2.6594580842132043, + "ewc_loss": 0.008548558689653873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548558980692178e-05, + "grad_norm": 4.336953639984131, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8809394240379333, + "num_tokens": 797779412.0, + "step": 20906 + }, + { + "epoch": 2.659585294491795, + "ewc_loss": 0.008556599728763103, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.556599641451612e-05, + "grad_norm": 4.232273578643799, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8850668668746948, + "num_tokens": 797816677.0, + "step": 20907 + }, + { + "epoch": 2.6597125047703853, + "ewc_loss": 0.008494134061038494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.494134090142325e-05, + "grad_norm": 4.303544998168945, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8665433526039124, + "num_tokens": 797854260.0, + "step": 20908 + }, + { + "epoch": 2.659839715048976, + "ewc_loss": 0.008582435548305511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582435839343816e-05, + "grad_norm": 4.245759963989258, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8689357042312622, + "num_tokens": 797892349.0, + "step": 20909 + }, + { + "epoch": 2.6599669253275664, + "ewc_loss": 0.008505959995090961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.505960431648418e-05, + "grad_norm": 4.240267753601074, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8872413039207458, + "num_tokens": 797928028.0, + "step": 20910 + }, + { + "epoch": 2.660094135606157, + "ewc_loss": 0.008533844724297523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533844811609015e-05, + "grad_norm": 4.347003936767578, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8686434030532837, + "num_tokens": 797963885.0, + "step": 20911 + }, + { + "epoch": 2.6602213458847475, + "ewc_loss": 0.0086008096113801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600809815106913e-05, + "grad_norm": 4.223112106323242, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8709304332733154, + "num_tokens": 798009842.0, + "step": 20912 + }, + { + "epoch": 2.660348556163338, + "ewc_loss": 0.008502187207341194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502187120029703e-05, + "grad_norm": 4.228889465332031, + "learning_rate": 
1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8948793411254883, + "num_tokens": 798043207.0, + "step": 20913 + }, + { + "epoch": 2.6604757664419285, + "ewc_loss": 0.008533810265362263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533809887012467e-05, + "grad_norm": 4.217143535614014, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8980403542518616, + "num_tokens": 798080887.0, + "step": 20914 + }, + { + "epoch": 2.660602976720519, + "ewc_loss": 0.008546330034732819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546330354874954e-05, + "grad_norm": 4.234862804412842, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.8912493586540222, + "num_tokens": 798126393.0, + "step": 20915 + }, + { + "epoch": 2.6607301869991096, + "ewc_loss": 0.008548488840460777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548489131499082e-05, + "grad_norm": 4.205902576446533, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8973848819732666, + "num_tokens": 798166099.0, + "step": 20916 + }, + { + "epoch": 2.6608573972777, + "ewc_loss": 0.008533237501978874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53323726914823e-05, + "grad_norm": 4.283824443817139, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8774633407592773, + "num_tokens": 798202285.0, + "step": 20917 + }, + { + "epoch": 2.6609846075562906, + "ewc_loss": 0.008577696979045868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577697008149698e-05, + "grad_norm": 4.242303848266602, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8884649276733398, + "num_tokens": 798243463.0, + "step": 20918 + }, + { + "epoch": 2.661111817834881, + "ewc_loss": 0.008517920039594173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517919923178852e-05, + "grad_norm": 4.306338787078857, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8840867280960083, + "num_tokens": 798275507.0, + "step": 20919 + }, + 
{ + "epoch": 2.6612390281134717, + "ewc_loss": 0.00858272984623909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582729788031429e-05, + "grad_norm": 4.250824928283691, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8983596563339233, + "num_tokens": 798310536.0, + "step": 20920 + }, + { + "epoch": 2.661366238392062, + "ewc_loss": 0.008506076410412788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.506076846970245e-05, + "grad_norm": 4.288150787353516, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8792218565940857, + "num_tokens": 798347874.0, + "step": 20921 + }, + { + "epoch": 2.6614934486706527, + "ewc_loss": 0.008576860651373863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576860273024067e-05, + "grad_norm": 4.260034561157227, + "learning_rate": 1e-06, + "loss": 0.2695, + "mean_token_accuracy": 0.9045723676681519, + "num_tokens": 798381865.0, + "step": 20922 + }, + { + "epoch": 2.661620658949243, + "ewc_loss": 0.008534222841262817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.534223161404952e-05, + "grad_norm": 4.215202808380127, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8816031813621521, + "num_tokens": 798423378.0, + "step": 20923 + }, + { + "epoch": 2.661747869227834, + "ewc_loss": 0.008533240295946598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533240179531276e-05, + "grad_norm": 4.303932189941406, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8901465535163879, + "num_tokens": 798460278.0, + "step": 20924 + }, + { + "epoch": 2.661875079506424, + "ewc_loss": 0.008583100512623787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583100861869752e-05, + "grad_norm": 4.275213241577148, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8842850923538208, + "num_tokens": 798496622.0, + "step": 20925 + }, + { + "epoch": 2.662002289785015, + "ewc_loss": 0.008531698025763035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.531698404112831e-05, + "grad_norm": 4.281185150146484, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8807485103607178, + "num_tokens": 798533061.0, + "step": 20926 + }, + { + "epoch": 2.662129500063605, + "ewc_loss": 0.008548375219106674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548374898964539e-05, + "grad_norm": 4.2298903465271, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8743352293968201, + "num_tokens": 798573689.0, + "step": 20927 + }, + { + "epoch": 2.662256710342196, + "ewc_loss": 0.008513401262462139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513401553500444e-05, + "grad_norm": 4.203515529632568, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8911987543106079, + "num_tokens": 798614058.0, + "step": 20928 + }, + { + "epoch": 2.662383920620786, + "ewc_loss": 0.008522621355950832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522621646989137e-05, + "grad_norm": 4.247095584869385, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8718266487121582, + "num_tokens": 798657769.0, + "step": 20929 + }, + { + "epoch": 2.6625111308993765, + "ewc_loss": 0.008565905503928661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565905591240153e-05, + "grad_norm": 4.2011590003967285, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8855524063110352, + "num_tokens": 798701084.0, + "step": 20930 + }, + { + "epoch": 2.662638341177967, + "ewc_loss": 0.008508370257914066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.508370228810236e-05, + "grad_norm": 4.302728176116943, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.890144407749176, + "num_tokens": 798738029.0, + "step": 20931 + }, + { + "epoch": 2.6627655514565576, + "ewc_loss": 0.00859950203448534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.599502325523645e-05, + "grad_norm": 4.260627269744873, + "learning_rate": 1e-06, + "loss": 0.3493, + 
"mean_token_accuracy": 0.877964198589325, + "num_tokens": 798773537.0, + "step": 20932 + }, + { + "epoch": 2.662892761735148, + "ewc_loss": 0.008528931066393852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528930629836395e-05, + "grad_norm": 4.255476474761963, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8806265592575073, + "num_tokens": 798810397.0, + "step": 20933 + }, + { + "epoch": 2.6630199720137386, + "ewc_loss": 0.00857329647988081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57329650898464e-05, + "grad_norm": 4.2645440101623535, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8812156915664673, + "num_tokens": 798850796.0, + "step": 20934 + }, + { + "epoch": 2.663147182292329, + "ewc_loss": 0.008555416949093342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555416570743546e-05, + "grad_norm": 4.218273162841797, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.882180392742157, + "num_tokens": 798892379.0, + "step": 20935 + }, + { + "epoch": 2.6632743925709197, + "ewc_loss": 0.008520595729351044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520595292793587e-05, + "grad_norm": 4.31002140045166, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8720262050628662, + "num_tokens": 798929444.0, + "step": 20936 + }, + { + "epoch": 2.66340160284951, + "ewc_loss": 0.008586282841861248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586282638134435e-05, + "grad_norm": 4.243748664855957, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8881511688232422, + "num_tokens": 798964086.0, + "step": 20937 + }, + { + "epoch": 2.6635288131281007, + "ewc_loss": 0.00851319171488285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513192005921155e-05, + "grad_norm": 4.171333312988281, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8909649848937988, + "num_tokens": 799007408.0, + "step": 20938 + }, + { + "epoch": 
2.6636560234066913, + "ewc_loss": 0.008532333187758923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532333595212549e-05, + "grad_norm": 4.251172065734863, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8864120244979858, + "num_tokens": 799045930.0, + "step": 20939 + }, + { + "epoch": 2.663783233685282, + "ewc_loss": 0.008563206531107426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.56320621096529e-05, + "grad_norm": 4.246931552886963, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8918555378913879, + "num_tokens": 799083568.0, + "step": 20940 + }, + { + "epoch": 2.6639104439638723, + "ewc_loss": 0.008548560552299023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548560435883701e-05, + "grad_norm": 4.227133274078369, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8802409172058105, + "num_tokens": 799123681.0, + "step": 20941 + }, + { + "epoch": 2.664037654242463, + "ewc_loss": 0.008530515246093273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530515333404765e-05, + "grad_norm": 4.227356433868408, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8794231414794922, + "num_tokens": 799164623.0, + "step": 20942 + }, + { + "epoch": 2.6641648645210534, + "ewc_loss": 0.00853762961924076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537629764759913e-05, + "grad_norm": 4.26963996887207, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8821972012519836, + "num_tokens": 799200408.0, + "step": 20943 + }, + { + "epoch": 2.664292074799644, + "ewc_loss": 0.008564743213355541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.564742893213406e-05, + "grad_norm": 4.248944282531738, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8717166185379028, + "num_tokens": 799245106.0, + "step": 20944 + }, + { + "epoch": 2.6644192850782344, + "ewc_loss": 0.00852601695805788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.526017336407676e-05, + "grad_norm": 4.179677486419678, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8900899887084961, + "num_tokens": 799290429.0, + "step": 20945 + }, + { + "epoch": 2.664546495356825, + "ewc_loss": 0.008478859439492226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.478859672322869e-05, + "grad_norm": 4.300090789794922, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.881854772567749, + "num_tokens": 799322951.0, + "step": 20946 + }, + { + "epoch": 2.6646737056354155, + "ewc_loss": 0.008597592823207378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597593114245683e-05, + "grad_norm": 4.195233345031738, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8802244663238525, + "num_tokens": 799365498.0, + "step": 20947 + }, + { + "epoch": 2.6648009159140056, + "ewc_loss": 0.008462761528789997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.462761616101488e-05, + "grad_norm": 4.216808319091797, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8787633776664734, + "num_tokens": 799405894.0, + "step": 20948 + }, + { + "epoch": 2.6649281261925966, + "ewc_loss": 0.008541394025087357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54139361763373e-05, + "grad_norm": 4.289179801940918, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8821627497673035, + "num_tokens": 799439750.0, + "step": 20949 + }, + { + "epoch": 2.6650553364711866, + "ewc_loss": 0.008547527715563774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547527249902487e-05, + "grad_norm": 4.330667018890381, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8748799562454224, + "num_tokens": 799475170.0, + "step": 20950 + }, + { + "epoch": 2.6651825467497776, + "ewc_loss": 0.008564446121454239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.564446034142748e-05, + "grad_norm": 4.295963764190674, + "learning_rate": 1e-06, + "loss": 0.3795, + 
"mean_token_accuracy": 0.8713393807411194, + "num_tokens": 799514231.0, + "step": 20951 + }, + { + "epoch": 2.6653097570283677, + "ewc_loss": 0.008535273373126984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53527380968444e-05, + "grad_norm": 4.273801326751709, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8842347860336304, + "num_tokens": 799549512.0, + "step": 20952 + }, + { + "epoch": 2.6654369673069587, + "ewc_loss": 0.00854472815990448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54472818900831e-05, + "grad_norm": 4.185281276702881, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8782802224159241, + "num_tokens": 799592144.0, + "step": 20953 + }, + { + "epoch": 2.6655641775855488, + "ewc_loss": 0.008507881313562393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.507881284458563e-05, + "grad_norm": 4.189981937408447, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8696001172065735, + "num_tokens": 799634779.0, + "step": 20954 + }, + { + "epoch": 2.6656913878641393, + "ewc_loss": 0.0085524283349514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552428334951401e-05, + "grad_norm": 4.207072734832764, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8931480646133423, + "num_tokens": 799677548.0, + "step": 20955 + }, + { + "epoch": 2.66581859814273, + "ewc_loss": 0.008551482111215591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551481732865795e-05, + "grad_norm": 4.234795093536377, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8661706447601318, + "num_tokens": 799715679.0, + "step": 20956 + }, + { + "epoch": 2.6659458084213203, + "ewc_loss": 0.008566576056182384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.56657643453218e-05, + "grad_norm": 4.201066017150879, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8752676248550415, + "num_tokens": 799755701.0, + "step": 20957 + }, + { + "epoch": 
2.666073018699911, + "ewc_loss": 0.008562622591853142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562622679164633e-05, + "grad_norm": 4.265359878540039, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.878616988658905, + "num_tokens": 799795151.0, + "step": 20958 + }, + { + "epoch": 2.6662002289785014, + "ewc_loss": 0.008610022254288197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610021905042231e-05, + "grad_norm": 4.210638046264648, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8897888660430908, + "num_tokens": 799832249.0, + "step": 20959 + }, + { + "epoch": 2.666327439257092, + "ewc_loss": 0.0085524283349514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552428334951401e-05, + "grad_norm": 4.309356212615967, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8645050525665283, + "num_tokens": 799867762.0, + "step": 20960 + }, + { + "epoch": 2.6664546495356825, + "ewc_loss": 0.008637608960270882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.637609425932169e-05, + "grad_norm": 4.183662414550781, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.879461407661438, + "num_tokens": 799909012.0, + "step": 20961 + }, + { + "epoch": 2.666581859814273, + "ewc_loss": 0.008536068722605705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53606834425591e-05, + "grad_norm": 4.261086463928223, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8795785903930664, + "num_tokens": 799950316.0, + "step": 20962 + }, + { + "epoch": 2.6667090700928635, + "ewc_loss": 0.008629431948065758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.629431977169588e-05, + "grad_norm": 4.172479152679443, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8961467146873474, + "num_tokens": 799990037.0, + "step": 20963 + }, + { + "epoch": 2.666836280371454, + "ewc_loss": 0.00854526087641716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545260789105669e-05, + 
"grad_norm": 4.284313678741455, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8912798762321472, + "num_tokens": 800023935.0, + "step": 20964 + }, + { + "epoch": 2.6669634906500446, + "ewc_loss": 0.0086339320987463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63393215695396e-05, + "grad_norm": 4.198252201080322, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8865411281585693, + "num_tokens": 800066357.0, + "step": 20965 + }, + { + "epoch": 2.667090700928635, + "ewc_loss": 0.008533916436135769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533916115993634e-05, + "grad_norm": 4.295060634613037, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8691209554672241, + "num_tokens": 800100667.0, + "step": 20966 + }, + { + "epoch": 2.6672179112072256, + "ewc_loss": 0.008621716871857643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62171727931127e-05, + "grad_norm": 4.26693868637085, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8861139416694641, + "num_tokens": 800135101.0, + "step": 20967 + }, + { + "epoch": 2.667345121485816, + "ewc_loss": 0.008572476916015148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572476508561522e-05, + "grad_norm": 4.20040225982666, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8965505361557007, + "num_tokens": 800176308.0, + "step": 20968 + }, + { + "epoch": 2.6674723317644067, + "ewc_loss": 0.008547068573534489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547068864572793e-05, + "grad_norm": 4.301052570343018, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8928902745246887, + "num_tokens": 800205585.0, + "step": 20969 + }, + { + "epoch": 2.667599542042997, + "ewc_loss": 0.00862459558993578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62459564814344e-05, + "grad_norm": 4.245845317840576, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8834564685821533, + 
"num_tokens": 800241400.0, + "step": 20970 + }, + { + "epoch": 2.6677267523215877, + "ewc_loss": 0.008548631332814693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548631012672558e-05, + "grad_norm": 4.29410457611084, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8918565511703491, + "num_tokens": 800279060.0, + "step": 20971 + }, + { + "epoch": 2.6678539626001783, + "ewc_loss": 0.0086082573980093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.608257485320792e-05, + "grad_norm": 4.2300801277160645, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8913856148719788, + "num_tokens": 800318606.0, + "step": 20972 + }, + { + "epoch": 2.6679811728787683, + "ewc_loss": 0.008551159873604774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551159407943487e-05, + "grad_norm": 4.243196964263916, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8809252977371216, + "num_tokens": 800356795.0, + "step": 20973 + }, + { + "epoch": 2.6681083831573593, + "ewc_loss": 0.008564404211938381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.564404561184347e-05, + "grad_norm": 4.2378387451171875, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8917942047119141, + "num_tokens": 800393786.0, + "step": 20974 + }, + { + "epoch": 2.6682355934359494, + "ewc_loss": 0.00855973269790411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55973266880028e-05, + "grad_norm": 4.176143169403076, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8804810643196106, + "num_tokens": 800440153.0, + "step": 20975 + }, + { + "epoch": 2.6683628037145404, + "ewc_loss": 0.008524945005774498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524944860255346e-05, + "grad_norm": 4.2791876792907715, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.866454005241394, + "num_tokens": 800476343.0, + "step": 20976 + }, + { + "epoch": 2.6684900139931305, + "ewc_loss": 0.008619879372417927, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619879372417927e-05, + "grad_norm": 4.211291313171387, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8939616680145264, + "num_tokens": 800514227.0, + "step": 20977 + }, + { + "epoch": 2.668617224271721, + "ewc_loss": 0.008531873114407063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531873027095571e-05, + "grad_norm": 4.2619242668151855, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8805310130119324, + "num_tokens": 800552357.0, + "step": 20978 + }, + { + "epoch": 2.6687444345503115, + "ewc_loss": 0.008593359962105751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.593359962105751e-05, + "grad_norm": 4.252869129180908, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8883367776870728, + "num_tokens": 800587624.0, + "step": 20979 + }, + { + "epoch": 2.668871644828902, + "ewc_loss": 0.008567184209823608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.567183976992965e-05, + "grad_norm": 4.239717960357666, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.875609278678894, + "num_tokens": 800631183.0, + "step": 20980 + }, + { + "epoch": 2.6689988551074926, + "ewc_loss": 0.008561216294765472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.561216236557811e-05, + "grad_norm": 4.243710517883301, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8868802189826965, + "num_tokens": 800670334.0, + "step": 20981 + }, + { + "epoch": 2.669126065386083, + "ewc_loss": 0.008547469042241573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547469042241573e-05, + "grad_norm": 4.329921245574951, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8804531693458557, + "num_tokens": 800703500.0, + "step": 20982 + }, + { + "epoch": 2.6692532756646736, + "ewc_loss": 0.008593861013650894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.593860547989607e-05, + "grad_norm": 4.219484329223633, + 
"learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.876899003982544, + "num_tokens": 800742904.0, + "step": 20983 + }, + { + "epoch": 2.669380485943264, + "ewc_loss": 0.00849235337227583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492352935718372e-05, + "grad_norm": 4.234319686889648, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8816478252410889, + "num_tokens": 800777556.0, + "step": 20984 + }, + { + "epoch": 2.6695076962218547, + "ewc_loss": 0.00857450533658266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574505773140118e-05, + "grad_norm": 4.242680549621582, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8759132027626038, + "num_tokens": 800817532.0, + "step": 20985 + }, + { + "epoch": 2.669634906500445, + "ewc_loss": 0.008553711697459221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553711813874543e-05, + "grad_norm": 4.200638294219971, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8918241262435913, + "num_tokens": 800856567.0, + "step": 20986 + }, + { + "epoch": 2.6697621167790357, + "ewc_loss": 0.008533736690878868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533736399840564e-05, + "grad_norm": 4.273364543914795, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8619874119758606, + "num_tokens": 800901234.0, + "step": 20987 + }, + { + "epoch": 2.6698893270576263, + "ewc_loss": 0.008595981635153294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595981489634141e-05, + "grad_norm": 4.194749355316162, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8799763321876526, + "num_tokens": 800942647.0, + "step": 20988 + }, + { + "epoch": 2.670016537336217, + "ewc_loss": 0.00851143803447485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511437772540376e-05, + "grad_norm": 4.261960983276367, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8893532752990723, + "num_tokens": 800980194.0, + 
"step": 20989 + }, + { + "epoch": 2.6701437476148073, + "ewc_loss": 0.008561596274375916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.561596041545272e-05, + "grad_norm": 4.243143558502197, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8865634799003601, + "num_tokens": 801016978.0, + "step": 20990 + }, + { + "epoch": 2.670270957893398, + "ewc_loss": 0.008543361909687519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.543361764168367e-05, + "grad_norm": 4.272832870483398, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8709568381309509, + "num_tokens": 801051921.0, + "step": 20991 + }, + { + "epoch": 2.6703981681719884, + "ewc_loss": 0.00856216624379158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562166476622224e-05, + "grad_norm": 4.287058353424072, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8593763113021851, + "num_tokens": 801087189.0, + "step": 20992 + }, + { + "epoch": 2.670525378450579, + "ewc_loss": 0.008580099791288376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580099529353902e-05, + "grad_norm": 4.275207042694092, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8872310519218445, + "num_tokens": 801121574.0, + "step": 20993 + }, + { + "epoch": 2.6706525887291694, + "ewc_loss": 0.008551385253667831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551384962629527e-05, + "grad_norm": 4.200307846069336, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.886663019657135, + "num_tokens": 801160497.0, + "step": 20994 + }, + { + "epoch": 2.67077979900776, + "ewc_loss": 0.008518047630786896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5180472524371e-05, + "grad_norm": 4.224859714508057, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8842573165893555, + "num_tokens": 801198556.0, + "step": 20995 + }, + { + "epoch": 2.6709070092863505, + "ewc_loss": 0.00858501996845007, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.585020259488374e-05, + "grad_norm": 4.236496925354004, + "learning_rate": 1e-06, + "loss": 0.2654, + "mean_token_accuracy": 0.9057495594024658, + "num_tokens": 801230616.0, + "step": 20996 + }, + { + "epoch": 2.671034219564941, + "ewc_loss": 0.008578469976782799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578469714848325e-05, + "grad_norm": 4.248762607574463, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8801780939102173, + "num_tokens": 801266582.0, + "step": 20997 + }, + { + "epoch": 2.671161429843531, + "ewc_loss": 0.008570561185479164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570561476517469e-05, + "grad_norm": 4.170154094696045, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8913154602050781, + "num_tokens": 801305127.0, + "step": 20998 + }, + { + "epoch": 2.671288640122122, + "ewc_loss": 0.008544467389583588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544467709725723e-05, + "grad_norm": 4.212998867034912, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8871110677719116, + "num_tokens": 801348284.0, + "step": 20999 + }, + { + "epoch": 2.671415850400712, + "ewc_loss": 0.008605808950960636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.605809125583619e-05, + "grad_norm": 4.237715244293213, + "learning_rate": 1e-06, + "loss": 0.2805, + "mean_token_accuracy": 0.9004801511764526, + "num_tokens": 801385298.0, + "step": 21000 + }, + { + "epoch": 2.671543060679303, + "ewc_loss": 0.008579050190746784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579050336265936e-05, + "grad_norm": 4.299051284790039, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8735193014144897, + "num_tokens": 801417020.0, + "step": 21001 + }, + { + "epoch": 2.6716702709578932, + "ewc_loss": 0.008619551546871662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619551954325289e-05, + "grad_norm": 4.265768527984619, + "learning_rate": 1e-06, + "loss": 
0.3548, + "mean_token_accuracy": 0.8761720657348633, + "num_tokens": 801455328.0, + "step": 21002 + }, + { + "epoch": 2.6717974812364838, + "ewc_loss": 0.008566448464989662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566448377678171e-05, + "grad_norm": 4.202815532684326, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8926498293876648, + "num_tokens": 801494342.0, + "step": 21003 + }, + { + "epoch": 2.6719246915150743, + "ewc_loss": 0.008538386784493923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538386464351788e-05, + "grad_norm": 4.254331588745117, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.883887767791748, + "num_tokens": 801528182.0, + "step": 21004 + }, + { + "epoch": 2.672051901793665, + "ewc_loss": 0.008587416261434555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.587416232330725e-05, + "grad_norm": 4.248963832855225, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8834980726242065, + "num_tokens": 801569424.0, + "step": 21005 + }, + { + "epoch": 2.6721791120722553, + "ewc_loss": 0.008553965017199516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553965017199516e-05, + "grad_norm": 4.227897644042969, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8794487118721008, + "num_tokens": 801608809.0, + "step": 21006 + }, + { + "epoch": 2.672306322350846, + "ewc_loss": 0.008540749549865723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540749695384875e-05, + "grad_norm": 4.223984718322754, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8746453523635864, + "num_tokens": 801651910.0, + "step": 21007 + }, + { + "epoch": 2.6724335326294364, + "ewc_loss": 0.008557340130209923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.557340333936736e-05, + "grad_norm": 4.22898006439209, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.881195068359375, + "num_tokens": 801692961.0, + "step": 21008 + }, + { + "epoch": 
2.672560742908027, + "ewc_loss": 0.008552363142371178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552363578928635e-05, + "grad_norm": 4.2528181076049805, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8707724213600159, + "num_tokens": 801731278.0, + "step": 21009 + }, + { + "epoch": 2.6726879531866174, + "ewc_loss": 0.008559606969356537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559606794733554e-05, + "grad_norm": 4.2622599601745605, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8796142935752869, + "num_tokens": 801769562.0, + "step": 21010 + }, + { + "epoch": 2.672815163465208, + "ewc_loss": 0.00855441577732563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.554415398975834e-05, + "grad_norm": 4.251600742340088, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.876288652420044, + "num_tokens": 801811046.0, + "step": 21011 + }, + { + "epoch": 2.6729423737437985, + "ewc_loss": 0.008535020984709263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535020606359467e-05, + "grad_norm": 4.293593883514404, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8607000112533569, + "num_tokens": 801847833.0, + "step": 21012 + }, + { + "epoch": 2.673069584022389, + "ewc_loss": 0.008567328564822674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.567328768549487e-05, + "grad_norm": 4.24842643737793, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8758508563041687, + "num_tokens": 801887819.0, + "step": 21013 + }, + { + "epoch": 2.6731967943009796, + "ewc_loss": 0.008530130609869957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530130435246974e-05, + "grad_norm": 4.2335968017578125, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.893059492111206, + "num_tokens": 801922535.0, + "step": 21014 + }, + { + "epoch": 2.67332400457957, + "ewc_loss": 0.008544283919036388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.544283627998084e-05, + "grad_norm": 4.258083343505859, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.887648344039917, + "num_tokens": 801958025.0, + "step": 21015 + }, + { + "epoch": 2.6734512148581606, + "ewc_loss": 0.00856499932706356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.564999734517187e-05, + "grad_norm": 4.244424343109131, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8874254822731018, + "num_tokens": 801991291.0, + "step": 21016 + }, + { + "epoch": 2.673578425136751, + "ewc_loss": 0.008539315313100815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53931560413912e-05, + "grad_norm": 4.26706600189209, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8844287395477295, + "num_tokens": 802024952.0, + "step": 21017 + }, + { + "epoch": 2.6737056354153417, + "ewc_loss": 0.008564976043999195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.564976451452821e-05, + "grad_norm": 4.239516258239746, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.881916880607605, + "num_tokens": 802065741.0, + "step": 21018 + }, + { + "epoch": 2.673832845693932, + "ewc_loss": 0.008540064096450806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540064300177619e-05, + "grad_norm": 4.197012901306152, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8830681443214417, + "num_tokens": 802107153.0, + "step": 21019 + }, + { + "epoch": 2.6739600559725227, + "ewc_loss": 0.008542494848370552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542494470020756e-05, + "grad_norm": 4.220426559448242, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8884660005569458, + "num_tokens": 802148177.0, + "step": 21020 + }, + { + "epoch": 2.674087266251113, + "ewc_loss": 0.008557772263884544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.557771798223257e-05, + "grad_norm": 4.271142482757568, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 
0.8829421997070312, + "num_tokens": 802186144.0, + "step": 21021 + }, + { + "epoch": 2.674214476529704, + "ewc_loss": 0.00856226496398449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562264702050015e-05, + "grad_norm": 4.229015827178955, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8955875635147095, + "num_tokens": 802224054.0, + "step": 21022 + }, + { + "epoch": 2.674341686808294, + "ewc_loss": 0.008524860255420208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524860459147021e-05, + "grad_norm": 4.246622085571289, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8664098978042603, + "num_tokens": 802264483.0, + "step": 21023 + }, + { + "epoch": 2.674468897086885, + "ewc_loss": 0.00854905042797327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549050835426897e-05, + "grad_norm": 4.221794128417969, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8896234631538391, + "num_tokens": 802302933.0, + "step": 21024 + }, + { + "epoch": 2.674596107365475, + "ewc_loss": 0.008526266552507877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.526266901753843e-05, + "grad_norm": 4.239202976226807, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8755167722702026, + "num_tokens": 802347354.0, + "step": 21025 + }, + { + "epoch": 2.674723317644066, + "ewc_loss": 0.008555279113352299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555279055144638e-05, + "grad_norm": 4.181198596954346, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8913812041282654, + "num_tokens": 802392159.0, + "step": 21026 + }, + { + "epoch": 2.674850527922656, + "ewc_loss": 0.008491630665957928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491630433127284e-05, + "grad_norm": 4.2230424880981445, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8867791891098022, + "num_tokens": 802432629.0, + "step": 21027 + }, + { + "epoch": 2.6749777382012465, + "ewc_loss": 
0.008529610000550747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52961020427756e-05, + "grad_norm": 4.20211935043335, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8896546363830566, + "num_tokens": 802471776.0, + "step": 21028 + }, + { + "epoch": 2.675104948479837, + "ewc_loss": 0.008500967174768448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500966941937804e-05, + "grad_norm": 4.2778143882751465, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8772988319396973, + "num_tokens": 802507314.0, + "step": 21029 + }, + { + "epoch": 2.6752321587584276, + "ewc_loss": 0.008549613878130913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549613994546235e-05, + "grad_norm": 4.224376678466797, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.891009509563446, + "num_tokens": 802550587.0, + "step": 21030 + }, + { + "epoch": 2.675359369037018, + "ewc_loss": 0.008479091338813305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479091047775e-05, + "grad_norm": 4.218383312225342, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8848364949226379, + "num_tokens": 802591544.0, + "step": 21031 + }, + { + "epoch": 2.6754865793156086, + "ewc_loss": 0.008478130213916302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.478129893774167e-05, + "grad_norm": 4.261305332183838, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8893033862113953, + "num_tokens": 802627336.0, + "step": 21032 + }, + { + "epoch": 2.675613789594199, + "ewc_loss": 0.008512266911566257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512266504112631e-05, + "grad_norm": 4.218747138977051, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8812798261642456, + "num_tokens": 802668358.0, + "step": 21033 + }, + { + "epoch": 2.6757409998727897, + "ewc_loss": 0.008467325009405613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467325096717104e-05, + "grad_norm": 4.22725772857666, 
+ "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8892420530319214, + "num_tokens": 802708023.0, + "step": 21034 + }, + { + "epoch": 2.67586821015138, + "ewc_loss": 0.008491488173604012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491488551953807e-05, + "grad_norm": 4.2271037101745605, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.9010825157165527, + "num_tokens": 802745856.0, + "step": 21035 + }, + { + "epoch": 2.6759954204299707, + "ewc_loss": 0.008489248342812061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.4892482846044e-05, + "grad_norm": 4.280148506164551, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8938370943069458, + "num_tokens": 802779032.0, + "step": 21036 + }, + { + "epoch": 2.6761226307085613, + "ewc_loss": 0.008501025848090649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501025877194479e-05, + "grad_norm": 4.232200622558594, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.883784830570221, + "num_tokens": 802815736.0, + "step": 21037 + }, + { + "epoch": 2.676249840987152, + "ewc_loss": 0.008458157069981098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.458156662527472e-05, + "grad_norm": 4.255059719085693, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8792587518692017, + "num_tokens": 802854133.0, + "step": 21038 + }, + { + "epoch": 2.6763770512657423, + "ewc_loss": 0.008503780700266361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.503780554747209e-05, + "grad_norm": 4.229949951171875, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.890454113483429, + "num_tokens": 802892969.0, + "step": 21039 + }, + { + "epoch": 2.676504261544333, + "ewc_loss": 0.00847869087010622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47869087010622e-05, + "grad_norm": 4.164239406585693, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8963345289230347, + "num_tokens": 802936998.0, + 
"step": 21040 + }, + { + "epoch": 2.6766314718229234, + "ewc_loss": 0.008447115309536457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.447115396847948e-05, + "grad_norm": 4.210261821746826, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8910607695579529, + "num_tokens": 802979040.0, + "step": 21041 + }, + { + "epoch": 2.676758682101514, + "ewc_loss": 0.008501267060637474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501266711391509e-05, + "grad_norm": 4.299341201782227, + "learning_rate": 1e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8933736085891724, + "num_tokens": 803016516.0, + "step": 21042 + }, + { + "epoch": 2.6768858923801044, + "ewc_loss": 0.008507450111210346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.507449820172042e-05, + "grad_norm": 4.226335048675537, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8807621002197266, + "num_tokens": 803055566.0, + "step": 21043 + }, + { + "epoch": 2.677013102658695, + "ewc_loss": 0.008434325456619263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.434324990957975e-05, + "grad_norm": 4.256791591644287, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8850653767585754, + "num_tokens": 803093683.0, + "step": 21044 + }, + { + "epoch": 2.6771403129372855, + "ewc_loss": 0.008491598069667816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49159769131802e-05, + "grad_norm": 4.204803943634033, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8916604518890381, + "num_tokens": 803133691.0, + "step": 21045 + }, + { + "epoch": 2.6772675232158756, + "ewc_loss": 0.008434368297457695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.43436864670366e-05, + "grad_norm": 4.245298385620117, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8898491859436035, + "num_tokens": 803169870.0, + "step": 21046 + }, + { + "epoch": 2.6773947334944665, + "ewc_loss": 0.008485427126288414, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.485426951665431e-05, + "grad_norm": 4.305443286895752, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8767796158790588, + "num_tokens": 803209038.0, + "step": 21047 + }, + { + "epoch": 2.6775219437730566, + "ewc_loss": 0.00849930103868246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499300747644156e-05, + "grad_norm": 4.3715715408325195, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8706246614456177, + "num_tokens": 803239725.0, + "step": 21048 + }, + { + "epoch": 2.6776491540516476, + "ewc_loss": 0.008517570793628693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517570677213371e-05, + "grad_norm": 4.264937877655029, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8828085064888, + "num_tokens": 803273580.0, + "step": 21049 + }, + { + "epoch": 2.6777763643302377, + "ewc_loss": 0.008451885543763638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.451885514659807e-05, + "grad_norm": 4.209978103637695, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8880438208580017, + "num_tokens": 803313583.0, + "step": 21050 + }, + { + "epoch": 2.6779035746088287, + "ewc_loss": 0.008443479426205158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.443479600828141e-05, + "grad_norm": 4.187215805053711, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8922094106674194, + "num_tokens": 803347517.0, + "step": 21051 + }, + { + "epoch": 2.6780307848874187, + "ewc_loss": 0.00850093737244606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500937838107347e-05, + "grad_norm": 4.2452192306518555, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8692673444747925, + "num_tokens": 803388183.0, + "step": 21052 + }, + { + "epoch": 2.6781579951660093, + "ewc_loss": 0.008511795662343502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511795749654993e-05, + "grad_norm": 4.224776268005371, + "learning_rate": 1e-06, + "loss": 
0.3264, + "mean_token_accuracy": 0.8858563303947449, + "num_tokens": 803428377.0, + "step": 21053 + }, + { + "epoch": 2.6782852054446, + "ewc_loss": 0.008500554598867893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50055439514108e-05, + "grad_norm": 4.303099155426025, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8911532759666443, + "num_tokens": 803460930.0, + "step": 21054 + }, + { + "epoch": 2.6784124157231903, + "ewc_loss": 0.008545031771063805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545031596440822e-05, + "grad_norm": 4.205657482147217, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8917891979217529, + "num_tokens": 803497971.0, + "step": 21055 + }, + { + "epoch": 2.678539626001781, + "ewc_loss": 0.00849506538361311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49506541271694e-05, + "grad_norm": 4.280418395996094, + "learning_rate": 1e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8988202810287476, + "num_tokens": 803535514.0, + "step": 21056 + }, + { + "epoch": 2.6786668362803714, + "ewc_loss": 0.008559966459870338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559966227039695e-05, + "grad_norm": 4.217026710510254, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8815295696258545, + "num_tokens": 803580767.0, + "step": 21057 + }, + { + "epoch": 2.678794046558962, + "ewc_loss": 0.008481756784021854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.481756958644837e-05, + "grad_norm": 4.23845100402832, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8877320289611816, + "num_tokens": 803619168.0, + "step": 21058 + }, + { + "epoch": 2.6789212568375524, + "ewc_loss": 0.00853981077671051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539811096852645e-05, + "grad_norm": 4.277125835418701, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8715975284576416, + "num_tokens": 803654785.0, + "step": 21059 + }, + { + "epoch": 
2.679048467116143, + "ewc_loss": 0.008546097204089165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546096796635538e-05, + "grad_norm": 4.231698513031006, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.891218900680542, + "num_tokens": 803692505.0, + "step": 21060 + }, + { + "epoch": 2.6791756773947335, + "ewc_loss": 0.008518511429429054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518511458532885e-05, + "grad_norm": 4.254767417907715, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8914483785629272, + "num_tokens": 803732592.0, + "step": 21061 + }, + { + "epoch": 2.679302887673324, + "ewc_loss": 0.008522111922502518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522112329956144e-05, + "grad_norm": 4.282347679138184, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8737989664077759, + "num_tokens": 803767601.0, + "step": 21062 + }, + { + "epoch": 2.6794300979519146, + "ewc_loss": 0.008540814742445946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540815179003403e-05, + "grad_norm": 4.289055347442627, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8785210847854614, + "num_tokens": 803804329.0, + "step": 21063 + }, + { + "epoch": 2.679557308230505, + "ewc_loss": 0.0085439532995224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5439532995224e-05, + "grad_norm": 4.262825965881348, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8787875175476074, + "num_tokens": 803846182.0, + "step": 21064 + }, + { + "epoch": 2.6796845185090956, + "ewc_loss": 0.008523482829332352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523482392774895e-05, + "grad_norm": 4.301806449890137, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8777897357940674, + "num_tokens": 803883561.0, + "step": 21065 + }, + { + "epoch": 2.679811728787686, + "ewc_loss": 0.008550550788640976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.550551137886941e-05, 
+ "grad_norm": 4.26747465133667, + "learning_rate": 1e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.9009981155395508, + "num_tokens": 803916860.0, + "step": 21066 + }, + { + "epoch": 2.6799389390662767, + "ewc_loss": 0.008515516296029091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515515946783125e-05, + "grad_norm": 4.284388542175293, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8851903676986694, + "num_tokens": 803951949.0, + "step": 21067 + }, + { + "epoch": 2.680066149344867, + "ewc_loss": 0.008549313060939312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549312769901007e-05, + "grad_norm": 4.207490921020508, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8797181844711304, + "num_tokens": 803992094.0, + "step": 21068 + }, + { + "epoch": 2.6801933596234577, + "ewc_loss": 0.008516144938766956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516144589520991e-05, + "grad_norm": 4.222980499267578, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8829281330108643, + "num_tokens": 804034058.0, + "step": 21069 + }, + { + "epoch": 2.6803205699020483, + "ewc_loss": 0.008556059561669827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55605976539664e-05, + "grad_norm": 4.23441743850708, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8763911724090576, + "num_tokens": 804072722.0, + "step": 21070 + }, + { + "epoch": 2.6804477801806383, + "ewc_loss": 0.008554140105843544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.554140367778018e-05, + "grad_norm": 4.203652858734131, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8879992961883545, + "num_tokens": 804114253.0, + "step": 21071 + }, + { + "epoch": 2.6805749904592293, + "ewc_loss": 0.008534329012036324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53432939038612e-05, + "grad_norm": 4.228623390197754, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 
0.8854497671127319, + "num_tokens": 804154877.0, + "step": 21072 + }, + { + "epoch": 2.6807022007378194, + "ewc_loss": 0.008562451228499413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562450966564938e-05, + "grad_norm": 4.312496662139893, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8781028985977173, + "num_tokens": 804191060.0, + "step": 21073 + }, + { + "epoch": 2.6808294110164104, + "ewc_loss": 0.008597536012530327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597535634180531e-05, + "grad_norm": 4.23287296295166, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8773322105407715, + "num_tokens": 804231574.0, + "step": 21074 + }, + { + "epoch": 2.6809566212950005, + "ewc_loss": 0.008517776615917683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517776586813852e-05, + "grad_norm": 4.313049793243408, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8622763156890869, + "num_tokens": 804264027.0, + "step": 21075 + }, + { + "epoch": 2.681083831573591, + "ewc_loss": 0.008622972294688225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622972381999716e-05, + "grad_norm": 4.273977756500244, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8857192993164062, + "num_tokens": 804302906.0, + "step": 21076 + }, + { + "epoch": 2.6812110418521815, + "ewc_loss": 0.00855336431413889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553364750696346e-05, + "grad_norm": 4.2717485427856445, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8775781989097595, + "num_tokens": 804339107.0, + "step": 21077 + }, + { + "epoch": 2.681338252130772, + "ewc_loss": 0.008586280047893524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58628045534715e-05, + "grad_norm": 4.259993076324463, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8885223865509033, + "num_tokens": 804374292.0, + "step": 21078 + }, + { + "epoch": 2.6814654624093626, + "ewc_loss": 
0.008590912446379662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59091232996434e-05, + "grad_norm": 4.210363864898682, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8840858936309814, + "num_tokens": 804420287.0, + "step": 21079 + }, + { + "epoch": 2.681592672687953, + "ewc_loss": 0.008575981482863426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575981337344274e-05, + "grad_norm": 4.267387390136719, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8894039988517761, + "num_tokens": 804453977.0, + "step": 21080 + }, + { + "epoch": 2.6817198829665436, + "ewc_loss": 0.008613553829491138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613553654868156e-05, + "grad_norm": 4.263335227966309, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8703575134277344, + "num_tokens": 804494958.0, + "step": 21081 + }, + { + "epoch": 2.681847093245134, + "ewc_loss": 0.008588016033172607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.588016498833895e-05, + "grad_norm": 4.25636625289917, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8641840219497681, + "num_tokens": 804532209.0, + "step": 21082 + }, + { + "epoch": 2.6819743035237247, + "ewc_loss": 0.008607308380305767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607308700447902e-05, + "grad_norm": 4.2786173820495605, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.884115993976593, + "num_tokens": 804572937.0, + "step": 21083 + }, + { + "epoch": 2.682101513802315, + "ewc_loss": 0.0086109209805727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610921213403344e-05, + "grad_norm": 4.26023530960083, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8818405270576477, + "num_tokens": 804610263.0, + "step": 21084 + }, + { + "epoch": 2.6822287240809057, + "ewc_loss": 0.008579402230679989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579402492614463e-05, + "grad_norm": 
4.255553722381592, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8866622447967529, + "num_tokens": 804645516.0, + "step": 21085 + }, + { + "epoch": 2.6823559343594963, + "ewc_loss": 0.008581899106502533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581898873671889e-05, + "grad_norm": 4.210177898406982, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8940072655677795, + "num_tokens": 804684540.0, + "step": 21086 + }, + { + "epoch": 2.682483144638087, + "ewc_loss": 0.00855349749326706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553497173124924e-05, + "grad_norm": 4.292057991027832, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8835378289222717, + "num_tokens": 804723030.0, + "step": 21087 + }, + { + "epoch": 2.6826103549166773, + "ewc_loss": 0.008598127402365208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598127169534564e-05, + "grad_norm": 4.248974800109863, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8906435966491699, + "num_tokens": 804759202.0, + "step": 21088 + }, + { + "epoch": 2.682737565195268, + "ewc_loss": 0.008522524498403072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522524876752868e-05, + "grad_norm": 4.221344470977783, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8808505535125732, + "num_tokens": 804803199.0, + "step": 21089 + }, + { + "epoch": 2.6828647754738584, + "ewc_loss": 0.00852942280471325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529422484571114e-05, + "grad_norm": 4.334117889404297, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8812236785888672, + "num_tokens": 804835675.0, + "step": 21090 + }, + { + "epoch": 2.682991985752449, + "ewc_loss": 0.008596418425440788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.596418047090992e-05, + "grad_norm": 4.264669895172119, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8723850250244141, + 
"num_tokens": 804876583.0, + "step": 21091 + }, + { + "epoch": 2.6831191960310394, + "ewc_loss": 0.008515110239386559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515110675944015e-05, + "grad_norm": 4.356095790863037, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8766154050827026, + "num_tokens": 804906110.0, + "step": 21092 + }, + { + "epoch": 2.68324640630963, + "ewc_loss": 0.008594109676778316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594109385740012e-05, + "grad_norm": 4.2699408531188965, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8754062652587891, + "num_tokens": 804942196.0, + "step": 21093 + }, + { + "epoch": 2.6833736165882205, + "ewc_loss": 0.008528156206011772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528156467946246e-05, + "grad_norm": 4.269037246704102, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8812247514724731, + "num_tokens": 804978760.0, + "step": 21094 + }, + { + "epoch": 2.683500826866811, + "ewc_loss": 0.00857242289930582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572422666475177e-05, + "grad_norm": 4.323013782501221, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8769130110740662, + "num_tokens": 805014212.0, + "step": 21095 + }, + { + "epoch": 2.683628037145401, + "ewc_loss": 0.00861101970076561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611020166426897e-05, + "grad_norm": 4.184560775756836, + "learning_rate": 1e-06, + "loss": 0.2703, + "mean_token_accuracy": 0.9053846597671509, + "num_tokens": 805050528.0, + "step": 21096 + }, + { + "epoch": 2.683755247423992, + "ewc_loss": 0.008526825346052647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.526824967702851e-05, + "grad_norm": 4.279883861541748, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8827126026153564, + "num_tokens": 805087355.0, + "step": 21097 + }, + { + "epoch": 2.683882457702582, + "ewc_loss": 0.008620250038802624, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62025044625625e-05, + "grad_norm": 4.366602420806885, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8694012761116028, + "num_tokens": 805117837.0, + "step": 21098 + }, + { + "epoch": 2.684009667981173, + "ewc_loss": 0.008665009401738644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.665009227115661e-05, + "grad_norm": 4.212015151977539, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8834784030914307, + "num_tokens": 805157734.0, + "step": 21099 + }, + { + "epoch": 2.684136878259763, + "ewc_loss": 0.00857994519174099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579945279052481e-05, + "grad_norm": 4.289107322692871, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8891263008117676, + "num_tokens": 805190657.0, + "step": 21100 + }, + { + "epoch": 2.6842640885383537, + "ewc_loss": 0.00867652427405119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.676524157635868e-05, + "grad_norm": 4.258035182952881, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8768351078033447, + "num_tokens": 805229654.0, + "step": 21101 + }, + { + "epoch": 2.6843912988169443, + "ewc_loss": 0.008608363568782806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.608363714301959e-05, + "grad_norm": 4.2314653396606445, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8827787041664124, + "num_tokens": 805269199.0, + "step": 21102 + }, + { + "epoch": 2.684518509095535, + "ewc_loss": 0.008619536645710468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619536674814299e-05, + "grad_norm": 4.216650009155273, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8783888220787048, + "num_tokens": 805308918.0, + "step": 21103 + }, + { + "epoch": 2.6846457193741253, + "ewc_loss": 0.00861684512346983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616845298092812e-05, + "grad_norm": 4.280412673950195, + "learning_rate": 
1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8808698654174805, + "num_tokens": 805345014.0, + "step": 21104 + }, + { + "epoch": 2.684772929652716, + "ewc_loss": 0.008656737394630909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.656737190904096e-05, + "grad_norm": 4.199102401733398, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8965195417404175, + "num_tokens": 805383674.0, + "step": 21105 + }, + { + "epoch": 2.6849001399313064, + "ewc_loss": 0.008573448285460472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573448576498777e-05, + "grad_norm": 4.2628397941589355, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8873708844184875, + "num_tokens": 805419765.0, + "step": 21106 + }, + { + "epoch": 2.685027350209897, + "ewc_loss": 0.008636299520730972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636299753561616e-05, + "grad_norm": 4.241879940032959, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8837413787841797, + "num_tokens": 805461346.0, + "step": 21107 + }, + { + "epoch": 2.6851545604884874, + "ewc_loss": 0.008618870750069618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618870924692601e-05, + "grad_norm": 4.295548439025879, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8902958035469055, + "num_tokens": 805494519.0, + "step": 21108 + }, + { + "epoch": 2.685281770767078, + "ewc_loss": 0.00862735603004694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627356146462262e-05, + "grad_norm": 4.286236763000488, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8729257583618164, + "num_tokens": 805530542.0, + "step": 21109 + }, + { + "epoch": 2.6854089810456685, + "ewc_loss": 0.00861172191798687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611722296336666e-05, + "grad_norm": 4.293559551239014, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8814181089401245, + "num_tokens": 805564531.0, + "step": 21110 + }, 
+ { + "epoch": 2.685536191324259, + "ewc_loss": 0.008618420921266079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618421270512044e-05, + "grad_norm": 4.22836971282959, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8809301853179932, + "num_tokens": 805606011.0, + "step": 21111 + }, + { + "epoch": 2.6856634016028496, + "ewc_loss": 0.00858935248106718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589352364651859e-05, + "grad_norm": 4.282100677490234, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8769183158874512, + "num_tokens": 805643078.0, + "step": 21112 + }, + { + "epoch": 2.68579061188144, + "ewc_loss": 0.008644572459161282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.644572517368942e-05, + "grad_norm": 4.233399391174316, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8872946500778198, + "num_tokens": 805678554.0, + "step": 21113 + }, + { + "epoch": 2.6859178221600306, + "ewc_loss": 0.008582960814237595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582960435887799e-05, + "grad_norm": 4.227489948272705, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8937234282493591, + "num_tokens": 805713736.0, + "step": 21114 + }, + { + "epoch": 2.686045032438621, + "ewc_loss": 0.008618365973234177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618365973234177e-05, + "grad_norm": 4.252930641174316, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8761565685272217, + "num_tokens": 805751798.0, + "step": 21115 + }, + { + "epoch": 2.6861722427172117, + "ewc_loss": 0.008632167242467403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63216700963676e-05, + "grad_norm": 4.237651824951172, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.889687180519104, + "num_tokens": 805789205.0, + "step": 21116 + }, + { + "epoch": 2.686299452995802, + "ewc_loss": 0.008616317994892597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.616317791165784e-05, + "grad_norm": 4.254583835601807, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.884997546672821, + "num_tokens": 805824910.0, + "step": 21117 + }, + { + "epoch": 2.6864266632743927, + "ewc_loss": 0.008626693859696388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.626694034319371e-05, + "grad_norm": 4.253706455230713, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8817054033279419, + "num_tokens": 805861335.0, + "step": 21118 + }, + { + "epoch": 2.686553873552983, + "ewc_loss": 0.008627397939562798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627398347016424e-05, + "grad_norm": 4.289905548095703, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8799370527267456, + "num_tokens": 805901488.0, + "step": 21119 + }, + { + "epoch": 2.686681083831574, + "ewc_loss": 0.00864527840167284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.645278285257518e-05, + "grad_norm": 4.3090949058532715, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8486818075180054, + "num_tokens": 805941336.0, + "step": 21120 + }, + { + "epoch": 2.686808294110164, + "ewc_loss": 0.008636937476694584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636937127448618e-05, + "grad_norm": 4.268424987792969, + "learning_rate": 1e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.8910461664199829, + "num_tokens": 805974222.0, + "step": 21121 + }, + { + "epoch": 2.686935504388755, + "ewc_loss": 0.008606793358922005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.606793562648818e-05, + "grad_norm": 4.245278358459473, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.879865825176239, + "num_tokens": 806014942.0, + "step": 21122 + }, + { + "epoch": 2.687062714667345, + "ewc_loss": 0.008605225943028927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.605226321378723e-05, + "grad_norm": 4.258993625640869, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 
0.8913352489471436, + "num_tokens": 806048894.0, + "step": 21123 + }, + { + "epoch": 2.687189924945936, + "ewc_loss": 0.008631754666566849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631754462840036e-05, + "grad_norm": 4.242709636688232, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8798366785049438, + "num_tokens": 806089714.0, + "step": 21124 + }, + { + "epoch": 2.687317135224526, + "ewc_loss": 0.008583307266235352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583307499065995e-05, + "grad_norm": 4.255549907684326, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.873022198677063, + "num_tokens": 806128974.0, + "step": 21125 + }, + { + "epoch": 2.6874443455031165, + "ewc_loss": 0.008606678806245327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.606678602518514e-05, + "grad_norm": 4.254362106323242, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8820430636405945, + "num_tokens": 806169857.0, + "step": 21126 + }, + { + "epoch": 2.687571555781707, + "ewc_loss": 0.008602287620306015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602287562098354e-05, + "grad_norm": 4.291860103607178, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.884320855140686, + "num_tokens": 806202826.0, + "step": 21127 + }, + { + "epoch": 2.6876987660602976, + "ewc_loss": 0.008614382706582546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614382386440411e-05, + "grad_norm": 4.236351490020752, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8802535533905029, + "num_tokens": 806242850.0, + "step": 21128 + }, + { + "epoch": 2.687825976338888, + "ewc_loss": 0.00859054084867239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590541256126016e-05, + "grad_norm": 4.31123685836792, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.884255588054657, + "num_tokens": 806279024.0, + "step": 21129 + }, + { + "epoch": 2.6879531866174786, + "ewc_loss": 
0.008645879104733467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64587927935645e-05, + "grad_norm": 4.318307399749756, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8689640164375305, + "num_tokens": 806313669.0, + "step": 21130 + }, + { + "epoch": 2.688080396896069, + "ewc_loss": 0.008613957092165947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613956742919981e-05, + "grad_norm": 4.2694172859191895, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8867509365081787, + "num_tokens": 806353128.0, + "step": 21131 + }, + { + "epoch": 2.6882076071746597, + "ewc_loss": 0.008577442727982998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577442349633202e-05, + "grad_norm": 4.31793212890625, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.88226318359375, + "num_tokens": 806385044.0, + "step": 21132 + }, + { + "epoch": 2.68833481745325, + "ewc_loss": 0.008632357232272625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63235691213049e-05, + "grad_norm": 4.32703161239624, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8744593858718872, + "num_tokens": 806422157.0, + "step": 21133 + }, + { + "epoch": 2.6884620277318407, + "ewc_loss": 0.008612081408500671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612081001047045e-05, + "grad_norm": 4.216396808624268, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8814368844032288, + "num_tokens": 806463180.0, + "step": 21134 + }, + { + "epoch": 2.6885892380104313, + "ewc_loss": 0.008547582663595676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547582547180355e-05, + "grad_norm": 4.2637128829956055, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8781259655952454, + "num_tokens": 806502031.0, + "step": 21135 + }, + { + "epoch": 2.688716448289022, + "ewc_loss": 0.008623928762972355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62392844283022e-05, + "grad_norm": 
4.247129440307617, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8838275671005249, + "num_tokens": 806539585.0, + "step": 21136 + }, + { + "epoch": 2.6888436585676123, + "ewc_loss": 0.008578373119235039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578372944612056e-05, + "grad_norm": 4.239259243011475, + "learning_rate": 1e-06, + "loss": 0.2828, + "mean_token_accuracy": 0.9006698131561279, + "num_tokens": 806573147.0, + "step": 21137 + }, + { + "epoch": 2.688970868846203, + "ewc_loss": 0.008578049018979073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578049164498225e-05, + "grad_norm": 4.251298904418945, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8935684561729431, + "num_tokens": 806609785.0, + "step": 21138 + }, + { + "epoch": 2.6890980791247934, + "ewc_loss": 0.008581564761698246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581564907217398e-05, + "grad_norm": 4.267609596252441, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8909657001495361, + "num_tokens": 806647709.0, + "step": 21139 + }, + { + "epoch": 2.689225289403384, + "ewc_loss": 0.008571564219892025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.571564103476703e-05, + "grad_norm": 4.25346565246582, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8861982822418213, + "num_tokens": 806684448.0, + "step": 21140 + }, + { + "epoch": 2.6893524996819744, + "ewc_loss": 0.008560489863157272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560490095987916e-05, + "grad_norm": 4.221109390258789, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8866955041885376, + "num_tokens": 806725326.0, + "step": 21141 + }, + { + "epoch": 2.689479709960565, + "ewc_loss": 0.008560115471482277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560115384170786e-05, + "grad_norm": 4.268198013305664, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8840828537940979, + 
"num_tokens": 806759797.0, + "step": 21142 + }, + { + "epoch": 2.6896069202391555, + "ewc_loss": 0.008594410493969917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594410610385239e-05, + "grad_norm": 4.248691558837891, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8891406655311584, + "num_tokens": 806799986.0, + "step": 21143 + }, + { + "epoch": 2.6897341305177456, + "ewc_loss": 0.008552267216145992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552267536288127e-05, + "grad_norm": 4.249808311462402, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8772436380386353, + "num_tokens": 806843055.0, + "step": 21144 + }, + { + "epoch": 2.6898613407963365, + "ewc_loss": 0.00856116134673357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.561160939279944e-05, + "grad_norm": 4.217362403869629, + "learning_rate": 1e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.8953667283058167, + "num_tokens": 806875897.0, + "step": 21145 + }, + { + "epoch": 2.6899885510749266, + "ewc_loss": 0.008525773882865906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525773591827601e-05, + "grad_norm": 4.262034893035889, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8914854526519775, + "num_tokens": 806915583.0, + "step": 21146 + }, + { + "epoch": 2.6901157613535176, + "ewc_loss": 0.008566376753151417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566377073293552e-05, + "grad_norm": 4.26310920715332, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.886499285697937, + "num_tokens": 806953851.0, + "step": 21147 + }, + { + "epoch": 2.6902429716321077, + "ewc_loss": 0.008528710342943668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528710168320686e-05, + "grad_norm": 4.262979030609131, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8786036372184753, + "num_tokens": 806988519.0, + "step": 21148 + }, + { + "epoch": 2.6903701819106987, + "ewc_loss": 0.00855430867522955, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.554308442398906e-05, + "grad_norm": 4.224869251251221, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.886479377746582, + "num_tokens": 807029612.0, + "step": 21149 + }, + { + "epoch": 2.6904973921892887, + "ewc_loss": 0.008491684682667255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49168500280939e-05, + "grad_norm": 4.299703121185303, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8684249520301819, + "num_tokens": 807064009.0, + "step": 21150 + }, + { + "epoch": 2.6906246024678793, + "ewc_loss": 0.008594312705099583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594313112553209e-05, + "grad_norm": 4.2235493659973145, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8804018497467041, + "num_tokens": 807104694.0, + "step": 21151 + }, + { + "epoch": 2.69075181274647, + "ewc_loss": 0.008506243117153645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50624346639961e-05, + "grad_norm": 4.291578769683838, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8644355535507202, + "num_tokens": 807142500.0, + "step": 21152 + }, + { + "epoch": 2.6908790230250603, + "ewc_loss": 0.008565283380448818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.56528349686414e-05, + "grad_norm": 4.335120677947998, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8814389109611511, + "num_tokens": 807172123.0, + "step": 21153 + }, + { + "epoch": 2.691006233303651, + "ewc_loss": 0.008586172945797443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58617277117446e-05, + "grad_norm": 4.336386203765869, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8852848410606384, + "num_tokens": 807206422.0, + "step": 21154 + }, + { + "epoch": 2.6911334435822414, + "ewc_loss": 0.00858309492468834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583095041103661e-05, + "grad_norm": 4.298539161682129, + "learning_rate": 
1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8694533109664917, + "num_tokens": 807240867.0, + "step": 21155 + }, + { + "epoch": 2.691260653860832, + "ewc_loss": 0.008560952730476856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560952846892178e-05, + "grad_norm": 4.263844013214111, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.892729640007019, + "num_tokens": 807275838.0, + "step": 21156 + }, + { + "epoch": 2.6913878641394224, + "ewc_loss": 0.008587688207626343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.587688353145495e-05, + "grad_norm": 4.190556526184082, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8908593654632568, + "num_tokens": 807315164.0, + "step": 21157 + }, + { + "epoch": 2.691515074418013, + "ewc_loss": 0.008553757332265377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55375692481175e-05, + "grad_norm": 4.223801612854004, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8750003576278687, + "num_tokens": 807358016.0, + "step": 21158 + }, + { + "epoch": 2.6916422846966035, + "ewc_loss": 0.008603151887655258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603151945862919e-05, + "grad_norm": 4.229024887084961, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8811746835708618, + "num_tokens": 807402751.0, + "step": 21159 + }, + { + "epoch": 2.691769494975194, + "ewc_loss": 0.008593888953328133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.593888924224302e-05, + "grad_norm": 4.2651848793029785, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8857194781303406, + "num_tokens": 807442761.0, + "step": 21160 + }, + { + "epoch": 2.6918967052537845, + "ewc_loss": 0.008607540279626846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607540075900033e-05, + "grad_norm": 4.250840187072754, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8891953825950623, + "num_tokens": 807478550.0, + "step": 21161 + }, 
+ { + "epoch": 2.692023915532375, + "ewc_loss": 0.008603246882557869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603246533311903e-05, + "grad_norm": 4.207193374633789, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8896455764770508, + "num_tokens": 807522756.0, + "step": 21162 + }, + { + "epoch": 2.6921511258109656, + "ewc_loss": 0.008543533273041248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.543533476768062e-05, + "grad_norm": 4.226943016052246, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8873574733734131, + "num_tokens": 807561701.0, + "step": 21163 + }, + { + "epoch": 2.692278336089556, + "ewc_loss": 0.008595610037446022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595609688200057e-05, + "grad_norm": 4.247779369354248, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8830091953277588, + "num_tokens": 807604058.0, + "step": 21164 + }, + { + "epoch": 2.6924055463681467, + "ewc_loss": 0.008583126589655876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583126327721402e-05, + "grad_norm": 4.286872386932373, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8882271647453308, + "num_tokens": 807637344.0, + "step": 21165 + }, + { + "epoch": 2.692532756646737, + "ewc_loss": 0.008585232309997082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585231989854947e-05, + "grad_norm": 4.248523235321045, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.87657630443573, + "num_tokens": 807676867.0, + "step": 21166 + }, + { + "epoch": 2.6926599669253277, + "ewc_loss": 0.008553443476557732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55344333103858e-05, + "grad_norm": 4.2949113845825195, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8843637704849243, + "num_tokens": 807707183.0, + "step": 21167 + }, + { + "epoch": 2.6927871772039182, + "ewc_loss": 0.008595767430961132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.595767576480284e-05, + "grad_norm": 4.3310136795043945, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8748458623886108, + "num_tokens": 807737617.0, + "step": 21168 + }, + { + "epoch": 2.6929143874825083, + "ewc_loss": 0.00861769262701273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.617692219559103e-05, + "grad_norm": 4.236965656280518, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8759564161300659, + "num_tokens": 807774141.0, + "step": 21169 + }, + { + "epoch": 2.6930415977610993, + "ewc_loss": 0.008555995300412178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555995009373873e-05, + "grad_norm": 4.198004245758057, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8872718214988708, + "num_tokens": 807813375.0, + "step": 21170 + }, + { + "epoch": 2.6931688080396894, + "ewc_loss": 0.008593513630330563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59351348481141e-05, + "grad_norm": 4.26271390914917, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.870824933052063, + "num_tokens": 807852317.0, + "step": 21171 + }, + { + "epoch": 2.6932960183182804, + "ewc_loss": 0.008623662404716015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623662142781541e-05, + "grad_norm": 4.218455791473389, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8833926320075989, + "num_tokens": 807895266.0, + "step": 21172 + }, + { + "epoch": 2.6934232285968704, + "ewc_loss": 0.00858613383024931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586133481003344e-05, + "grad_norm": 4.194490909576416, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8843293786048889, + "num_tokens": 807938600.0, + "step": 21173 + }, + { + "epoch": 2.693550438875461, + "ewc_loss": 0.00858977623283863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589776552980766e-05, + "grad_norm": 4.266671657562256, + "learning_rate": 1e-06, + "loss": 0.2804, + 
"mean_token_accuracy": 0.9016410112380981, + "num_tokens": 807974329.0, + "step": 21174 + }, + { + "epoch": 2.6936776491540515, + "ewc_loss": 0.00863610953092575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636109123472124e-05, + "grad_norm": 4.210588455200195, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8946409821510315, + "num_tokens": 808015666.0, + "step": 21175 + }, + { + "epoch": 2.693804859432642, + "ewc_loss": 0.008570849895477295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57084960443899e-05, + "grad_norm": 4.267789840698242, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8926934599876404, + "num_tokens": 808050176.0, + "step": 21176 + }, + { + "epoch": 2.6939320697112326, + "ewc_loss": 0.008611598052084446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611597877461463e-05, + "grad_norm": 4.217744827270508, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.877237856388092, + "num_tokens": 808091910.0, + "step": 21177 + }, + { + "epoch": 2.694059279989823, + "ewc_loss": 0.008560729213058949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560729474993423e-05, + "grad_norm": 4.252220153808594, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8782726526260376, + "num_tokens": 808128646.0, + "step": 21178 + }, + { + "epoch": 2.6941864902684136, + "ewc_loss": 0.008604761213064194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604761387687176e-05, + "grad_norm": 4.2705254554748535, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8864931464195251, + "num_tokens": 808166544.0, + "step": 21179 + }, + { + "epoch": 2.694313700547004, + "ewc_loss": 0.008573370054364204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573369996156543e-05, + "grad_norm": 4.319772243499756, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.873458743095398, + "num_tokens": 808200051.0, + "step": 21180 + }, + { + "epoch": 
2.6944409108255947, + "ewc_loss": 0.008602309972047806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602310117566958e-05, + "grad_norm": 4.227628707885742, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8855313062667847, + "num_tokens": 808239251.0, + "step": 21181 + }, + { + "epoch": 2.694568121104185, + "ewc_loss": 0.008533909916877747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533909567631781e-05, + "grad_norm": 4.273562431335449, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.877983808517456, + "num_tokens": 808280639.0, + "step": 21182 + }, + { + "epoch": 2.6946953313827757, + "ewc_loss": 0.008582726120948792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582726150052622e-05, + "grad_norm": 4.231621742248535, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8893266916275024, + "num_tokens": 808316141.0, + "step": 21183 + }, + { + "epoch": 2.6948225416613663, + "ewc_loss": 0.008553193882107735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553193765692413e-05, + "grad_norm": 4.234687805175781, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8823894262313843, + "num_tokens": 808359172.0, + "step": 21184 + }, + { + "epoch": 2.694949751939957, + "ewc_loss": 0.00855404045432806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.554040687158704e-05, + "grad_norm": 4.282656669616699, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8560360670089722, + "num_tokens": 808397823.0, + "step": 21185 + }, + { + "epoch": 2.6950769622185473, + "ewc_loss": 0.008584669791162014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58466955833137e-05, + "grad_norm": 4.239681243896484, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8921724557876587, + "num_tokens": 808435355.0, + "step": 21186 + }, + { + "epoch": 2.695204172497138, + "ewc_loss": 0.008536040782928467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.536040695616975e-05, + "grad_norm": 4.2456183433532715, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8897444009780884, + "num_tokens": 808470792.0, + "step": 21187 + }, + { + "epoch": 2.6953313827757284, + "ewc_loss": 0.008565935306251049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565935422666371e-05, + "grad_norm": 4.319254398345947, + "learning_rate": 1e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.8979465961456299, + "num_tokens": 808504591.0, + "step": 21188 + }, + { + "epoch": 2.695458593054319, + "ewc_loss": 0.008590254001319408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590253855800256e-05, + "grad_norm": 4.269552707672119, + "learning_rate": 1e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8943787813186646, + "num_tokens": 808537167.0, + "step": 21189 + }, + { + "epoch": 2.6955858033329094, + "ewc_loss": 0.008540106005966663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54010577313602e-05, + "grad_norm": 4.3259100914001465, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8980002999305725, + "num_tokens": 808572055.0, + "step": 21190 + }, + { + "epoch": 2.6957130136115, + "ewc_loss": 0.0085803447291255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5803447291255e-05, + "grad_norm": 4.244421005249023, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8946334719657898, + "num_tokens": 808605239.0, + "step": 21191 + }, + { + "epoch": 2.6958402238900905, + "ewc_loss": 0.008520622737705708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520622941432521e-05, + "grad_norm": 4.302676677703857, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8606115579605103, + "num_tokens": 808643227.0, + "step": 21192 + }, + { + "epoch": 2.695967434168681, + "ewc_loss": 0.008579212240874767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579212590120733e-05, + "grad_norm": 4.256284713745117, + "learning_rate": 1e-06, + "loss": 0.3511, + 
"mean_token_accuracy": 0.8794703483581543, + "num_tokens": 808683855.0, + "step": 21193 + }, + { + "epoch": 2.696094644447271, + "ewc_loss": 0.008537095040082932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53709498187527e-05, + "grad_norm": 4.19297981262207, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8886751532554626, + "num_tokens": 808723467.0, + "step": 21194 + }, + { + "epoch": 2.696221854725862, + "ewc_loss": 0.00850036833435297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500368130626157e-05, + "grad_norm": 4.268399238586426, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8738258481025696, + "num_tokens": 808759805.0, + "step": 21195 + }, + { + "epoch": 2.696349065004452, + "ewc_loss": 0.008588675409555435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58867570059374e-05, + "grad_norm": 4.250757694244385, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8683532476425171, + "num_tokens": 808802024.0, + "step": 21196 + }, + { + "epoch": 2.696476275283043, + "ewc_loss": 0.00852917693555355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529176557203755e-05, + "grad_norm": 4.264739990234375, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8774263858795166, + "num_tokens": 808837086.0, + "step": 21197 + }, + { + "epoch": 2.696603485561633, + "ewc_loss": 0.008581261150538921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581261499784887e-05, + "grad_norm": 4.237432479858398, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8822784423828125, + "num_tokens": 808875261.0, + "step": 21198 + }, + { + "epoch": 2.6967306958402237, + "ewc_loss": 0.008542872034013271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542872092220932e-05, + "grad_norm": 4.204498767852783, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8899009227752686, + "num_tokens": 808913044.0, + "step": 21199 + }, + { + "epoch": 
2.6968579061188143, + "ewc_loss": 0.008560530841350555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560530841350555e-05, + "grad_norm": 4.222181797027588, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8907788991928101, + "num_tokens": 808952374.0, + "step": 21200 + }, + { + "epoch": 2.696985116397405, + "ewc_loss": 0.008580800145864487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580800204072148e-05, + "grad_norm": 4.242860794067383, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8848192691802979, + "num_tokens": 808994397.0, + "step": 21201 + }, + { + "epoch": 2.6971123266759953, + "ewc_loss": 0.008578309789299965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578309643780813e-05, + "grad_norm": 4.21621561050415, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8846458196640015, + "num_tokens": 809036401.0, + "step": 21202 + }, + { + "epoch": 2.697239536954586, + "ewc_loss": 0.008546381257474422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546381286578253e-05, + "grad_norm": 4.283357620239258, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8886013627052307, + "num_tokens": 809073073.0, + "step": 21203 + }, + { + "epoch": 2.6973667472331764, + "ewc_loss": 0.0085949981585145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594997780164704e-05, + "grad_norm": 4.200849533081055, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8823356628417969, + "num_tokens": 809117348.0, + "step": 21204 + }, + { + "epoch": 2.697493957511767, + "ewc_loss": 0.008528105914592743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528105536242947e-05, + "grad_norm": 4.293431282043457, + "learning_rate": 1e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8955803513526917, + "num_tokens": 809151357.0, + "step": 21205 + }, + { + "epoch": 2.6976211677903574, + "ewc_loss": 0.00860282126814127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.602821617387235e-05, + "grad_norm": 4.318263053894043, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8770674467086792, + "num_tokens": 809189000.0, + "step": 21206 + }, + { + "epoch": 2.697748378068948, + "ewc_loss": 0.008596737869083881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.596737461630255e-05, + "grad_norm": 4.270622253417969, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8846533894538879, + "num_tokens": 809227380.0, + "step": 21207 + }, + { + "epoch": 2.6978755883475385, + "ewc_loss": 0.008531954139471054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531953790225089e-05, + "grad_norm": 4.283073902130127, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8837091326713562, + "num_tokens": 809265131.0, + "step": 21208 + }, + { + "epoch": 2.698002798626129, + "ewc_loss": 0.008561152033507824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.561152208130807e-05, + "grad_norm": 4.337722301483154, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8809307813644409, + "num_tokens": 809297190.0, + "step": 21209 + }, + { + "epoch": 2.6981300089047195, + "ewc_loss": 0.0085843401029706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.584339957451448e-05, + "grad_norm": 4.259688854217529, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.890943706035614, + "num_tokens": 809336774.0, + "step": 21210 + }, + { + "epoch": 2.69825721918331, + "ewc_loss": 0.008509897626936436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509897452313453e-05, + "grad_norm": 4.269545078277588, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8681963682174683, + "num_tokens": 809376612.0, + "step": 21211 + }, + { + "epoch": 2.6983844294619006, + "ewc_loss": 0.00854292418807745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542923751519993e-05, + "grad_norm": 4.188538074493408, + "learning_rate": 1e-06, + "loss": 0.2665, + "mean_token_accuracy": 
0.9042327404022217, + "num_tokens": 809416058.0, + "step": 21212 + }, + { + "epoch": 2.698511639740491, + "ewc_loss": 0.008516557514667511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516557863913476e-05, + "grad_norm": 4.289571285247803, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8925601243972778, + "num_tokens": 809449580.0, + "step": 21213 + }, + { + "epoch": 2.6986388500190817, + "ewc_loss": 0.00858013704419136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580137364333495e-05, + "grad_norm": 4.276820182800293, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8858429789543152, + "num_tokens": 809484983.0, + "step": 21214 + }, + { + "epoch": 2.698766060297672, + "ewc_loss": 0.00852513499557972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525134762749076e-05, + "grad_norm": 4.2210373878479, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8853292465209961, + "num_tokens": 809521086.0, + "step": 21215 + }, + { + "epoch": 2.6988932705762627, + "ewc_loss": 0.008516163565218449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516163507010788e-05, + "grad_norm": 4.249691486358643, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8806095123291016, + "num_tokens": 809562475.0, + "step": 21216 + }, + { + "epoch": 2.699020480854853, + "ewc_loss": 0.008553064428269863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55306425364688e-05, + "grad_norm": 4.291378498077393, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8742432594299316, + "num_tokens": 809596686.0, + "step": 21217 + }, + { + "epoch": 2.6991476911334438, + "ewc_loss": 0.008568390272557735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.568390330765396e-05, + "grad_norm": 4.232875347137451, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8851830959320068, + "num_tokens": 809636614.0, + "step": 21218 + }, + { + "epoch": 2.699274901412034, + "ewc_loss": 
0.008522138930857182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522139250999317e-05, + "grad_norm": 4.319087028503418, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8730286359786987, + "num_tokens": 809674542.0, + "step": 21219 + }, + { + "epoch": 2.699402111690625, + "ewc_loss": 0.008597434498369694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597434498369694e-05, + "grad_norm": 4.192018985748291, + "learning_rate": 1e-06, + "loss": 0.294, + "mean_token_accuracy": 0.8990026712417603, + "num_tokens": 809716726.0, + "step": 21220 + }, + { + "epoch": 2.699529321969215, + "ewc_loss": 0.008486340753734112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486340811941773e-05, + "grad_norm": 4.290852069854736, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8920773267745972, + "num_tokens": 809756790.0, + "step": 21221 + }, + { + "epoch": 2.699656532247806, + "ewc_loss": 0.008577643893659115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577643893659115e-05, + "grad_norm": 4.268449306488037, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8799242973327637, + "num_tokens": 809794616.0, + "step": 21222 + }, + { + "epoch": 2.699783742526396, + "ewc_loss": 0.008537269197404385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53726887726225e-05, + "grad_norm": 4.223330974578857, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8891555070877075, + "num_tokens": 809836662.0, + "step": 21223 + }, + { + "epoch": 2.6999109528049865, + "ewc_loss": 0.008501395583152771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50139549584128e-05, + "grad_norm": 4.270314693450928, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8797707557678223, + "num_tokens": 809873812.0, + "step": 21224 + }, + { + "epoch": 2.700038163083577, + "ewc_loss": 0.008565318770706654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.56531914905645e-05, + "grad_norm": 4.250054836273193, 
+ "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8840185403823853, + "num_tokens": 809913139.0, + "step": 21225 + }, + { + "epoch": 2.7001653733621676, + "ewc_loss": 0.008533425629138947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533425716450438e-05, + "grad_norm": 4.311285972595215, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8845927715301514, + "num_tokens": 809947741.0, + "step": 21226 + }, + { + "epoch": 2.700292583640758, + "ewc_loss": 0.008554598316550255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55459802551195e-05, + "grad_norm": 4.253060340881348, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8757773637771606, + "num_tokens": 809987120.0, + "step": 21227 + }, + { + "epoch": 2.7004197939193486, + "ewc_loss": 0.008515074849128723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515075023751706e-05, + "grad_norm": 4.277601718902588, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8784234523773193, + "num_tokens": 810026972.0, + "step": 21228 + }, + { + "epoch": 2.700547004197939, + "ewc_loss": 0.008542260155081749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54226018418558e-05, + "grad_norm": 4.270705223083496, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8838834762573242, + "num_tokens": 810062767.0, + "step": 21229 + }, + { + "epoch": 2.7006742144765297, + "ewc_loss": 0.008516816422343254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.516816888004541e-05, + "grad_norm": 4.235029220581055, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8907960653305054, + "num_tokens": 810096930.0, + "step": 21230 + }, + { + "epoch": 2.70080142475512, + "ewc_loss": 0.008527102880179882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.527102909283713e-05, + "grad_norm": 4.298711776733398, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8706059455871582, + "num_tokens": 810134600.0, + 
"step": 21231 + }, + { + "epoch": 2.7009286350337107, + "ewc_loss": 0.008558272384107113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558272384107113e-05, + "grad_norm": 4.247063159942627, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8852149248123169, + "num_tokens": 810169535.0, + "step": 21232 + }, + { + "epoch": 2.7010558453123013, + "ewc_loss": 0.00851786695420742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517866808688268e-05, + "grad_norm": 4.286133766174316, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.875015139579773, + "num_tokens": 810205288.0, + "step": 21233 + }, + { + "epoch": 2.701183055590892, + "ewc_loss": 0.008573590777814388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573590457672253e-05, + "grad_norm": 4.246694087982178, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8813294172286987, + "num_tokens": 810242868.0, + "step": 21234 + }, + { + "epoch": 2.7013102658694823, + "ewc_loss": 0.008539971895515919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539971895515919e-05, + "grad_norm": 4.256157875061035, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8715249300003052, + "num_tokens": 810286539.0, + "step": 21235 + }, + { + "epoch": 2.701437476148073, + "ewc_loss": 0.008546054363250732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546054596081376e-05, + "grad_norm": 4.197389125823975, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8874500393867493, + "num_tokens": 810331598.0, + "step": 21236 + }, + { + "epoch": 2.7015646864266634, + "ewc_loss": 0.008515548892319202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515548688592389e-05, + "grad_norm": 4.2599382400512695, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8914332389831543, + "num_tokens": 810369501.0, + "step": 21237 + }, + { + "epoch": 2.701691896705254, + "ewc_loss": 0.008559467270970345, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.559467096347362e-05, + "grad_norm": 4.232444763183594, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8798815011978149, + "num_tokens": 810410727.0, + "step": 21238 + }, + { + "epoch": 2.7018191069838444, + "ewc_loss": 0.00852729007601738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.527289901394397e-05, + "grad_norm": 4.31350564956665, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8752827048301697, + "num_tokens": 810450315.0, + "step": 21239 + }, + { + "epoch": 2.701946317262435, + "ewc_loss": 0.008570045232772827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570045611122623e-05, + "grad_norm": 4.230891704559326, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.877374529838562, + "num_tokens": 810493972.0, + "step": 21240 + }, + { + "epoch": 2.7020735275410255, + "ewc_loss": 0.008485215716063976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.48521594889462e-05, + "grad_norm": 4.2700347900390625, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8937740921974182, + "num_tokens": 810526038.0, + "step": 21241 + }, + { + "epoch": 2.7022007378196156, + "ewc_loss": 0.008545258082449436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545257878722623e-05, + "grad_norm": 4.291735649108887, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8868298530578613, + "num_tokens": 810560772.0, + "step": 21242 + }, + { + "epoch": 2.7023279480982065, + "ewc_loss": 0.00851394236087799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513942157151178e-05, + "grad_norm": 4.200796604156494, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8906384706497192, + "num_tokens": 810598322.0, + "step": 21243 + }, + { + "epoch": 2.7024551583767966, + "ewc_loss": 0.008459838107228279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.45983813633211e-05, + "grad_norm": 4.1901726722717285, + "learning_rate": 1e-06, + "loss": 0.285, + 
"mean_token_accuracy": 0.900560736656189, + "num_tokens": 810638652.0, + "step": 21244 + }, + { + "epoch": 2.7025823686553876, + "ewc_loss": 0.008512649685144424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512649219483137e-05, + "grad_norm": 4.232429504394531, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8817326426506042, + "num_tokens": 810680318.0, + "step": 21245 + }, + { + "epoch": 2.7027095789339777, + "ewc_loss": 0.008509363979101181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509364124620333e-05, + "grad_norm": 4.273576736450195, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8896250128746033, + "num_tokens": 810715001.0, + "step": 21246 + }, + { + "epoch": 2.7028367892125686, + "ewc_loss": 0.008528894744813442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528894977644086e-05, + "grad_norm": 4.3312249183654785, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8771063089370728, + "num_tokens": 810748897.0, + "step": 21247 + }, + { + "epoch": 2.7029639994911587, + "ewc_loss": 0.008553189225494862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553189400117844e-05, + "grad_norm": 4.268115997314453, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8783323168754578, + "num_tokens": 810788954.0, + "step": 21248 + }, + { + "epoch": 2.7030912097697493, + "ewc_loss": 0.008492023684084415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492024062434211e-05, + "grad_norm": 4.252914905548096, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8858517408370972, + "num_tokens": 810828897.0, + "step": 21249 + }, + { + "epoch": 2.70321842004834, + "ewc_loss": 0.008528271690011024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52827142807655e-05, + "grad_norm": 4.226810932159424, + "learning_rate": 1e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8985100984573364, + "num_tokens": 810866698.0, + "step": 21250 + }, + { + "epoch": 
2.7033456303269303, + "ewc_loss": 0.008510186336934566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510186307830736e-05, + "grad_norm": 4.259334087371826, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8780640363693237, + "num_tokens": 810903140.0, + "step": 21251 + }, + { + "epoch": 2.703472840605521, + "ewc_loss": 0.008536052890121937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53605306474492e-05, + "grad_norm": 4.246845245361328, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8865292072296143, + "num_tokens": 810940664.0, + "step": 21252 + }, + { + "epoch": 2.7036000508841114, + "ewc_loss": 0.008533039130270481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533039363101125e-05, + "grad_norm": 4.214652061462402, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8806780576705933, + "num_tokens": 810981818.0, + "step": 21253 + }, + { + "epoch": 2.703727261162702, + "ewc_loss": 0.008525634184479713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525633893441409e-05, + "grad_norm": 4.241625785827637, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8829779028892517, + "num_tokens": 811019897.0, + "step": 21254 + }, + { + "epoch": 2.7038544714412924, + "ewc_loss": 0.008546309545636177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546309982193634e-05, + "grad_norm": 4.2928619384765625, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8882958889007568, + "num_tokens": 811056310.0, + "step": 21255 + }, + { + "epoch": 2.703981681719883, + "ewc_loss": 0.008582940325140953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582940063206479e-05, + "grad_norm": 4.324691295623779, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8738577365875244, + "num_tokens": 811091505.0, + "step": 21256 + }, + { + "epoch": 2.7041088919984735, + "ewc_loss": 0.008557340130209923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.557340333936736e-05, + "grad_norm": 4.194814205169678, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8935033679008484, + "num_tokens": 811134861.0, + "step": 21257 + }, + { + "epoch": 2.704236102277064, + "ewc_loss": 0.008504892699420452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.504893048666418e-05, + "grad_norm": 4.316425323486328, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.875892162322998, + "num_tokens": 811170001.0, + "step": 21258 + }, + { + "epoch": 2.7043633125556545, + "ewc_loss": 0.008614898659288883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614898979431018e-05, + "grad_norm": 4.2978949546813965, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8863863945007324, + "num_tokens": 811204203.0, + "step": 21259 + }, + { + "epoch": 2.704490522834245, + "ewc_loss": 0.008565222844481468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565222378820181e-05, + "grad_norm": 4.234065055847168, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8874650597572327, + "num_tokens": 811244598.0, + "step": 21260 + }, + { + "epoch": 2.7046177331128356, + "ewc_loss": 0.00852497573941946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524975419277325e-05, + "grad_norm": 4.238065242767334, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8927667737007141, + "num_tokens": 811283352.0, + "step": 21261 + }, + { + "epoch": 2.704744943391426, + "ewc_loss": 0.0085675660520792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.56756596476771e-05, + "grad_norm": 4.323940753936768, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8730329275131226, + "num_tokens": 811317605.0, + "step": 21262 + }, + { + "epoch": 2.7048721536700167, + "ewc_loss": 0.008596831001341343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.596830593887717e-05, + "grad_norm": 4.249916076660156, + "learning_rate": 1e-06, + "loss": 0.2923, + 
"mean_token_accuracy": 0.8979136347770691, + "num_tokens": 811352117.0, + "step": 21263 + }, + { + "epoch": 2.704999363948607, + "ewc_loss": 0.008538097143173218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538097608834505e-05, + "grad_norm": 4.284496784210205, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8765336275100708, + "num_tokens": 811386118.0, + "step": 21264 + }, + { + "epoch": 2.7051265742271977, + "ewc_loss": 0.008595388382673264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595388499088585e-05, + "grad_norm": 4.284752368927002, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8936449289321899, + "num_tokens": 811424479.0, + "step": 21265 + }, + { + "epoch": 2.7052537845057882, + "ewc_loss": 0.008579031564295292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57903141877614e-05, + "grad_norm": 4.248891353607178, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8716096878051758, + "num_tokens": 811464683.0, + "step": 21266 + }, + { + "epoch": 2.7053809947843783, + "ewc_loss": 0.008557352237403393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.557351975468919e-05, + "grad_norm": 4.220310211181641, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8969463109970093, + "num_tokens": 811504801.0, + "step": 21267 + }, + { + "epoch": 2.7055082050629693, + "ewc_loss": 0.00854839850217104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548398182028905e-05, + "grad_norm": 4.252768516540527, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8759816884994507, + "num_tokens": 811546278.0, + "step": 21268 + }, + { + "epoch": 2.7056354153415594, + "ewc_loss": 0.008584793657064438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.584793977206573e-05, + "grad_norm": 4.280436992645264, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8713368773460388, + "num_tokens": 811587019.0, + "step": 21269 + }, + { + "epoch": 
2.7057626256201504, + "ewc_loss": 0.008548148907721043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548148616682738e-05, + "grad_norm": 4.370284080505371, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8622903823852539, + "num_tokens": 811619843.0, + "step": 21270 + }, + { + "epoch": 2.7058898358987404, + "ewc_loss": 0.00859152153134346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.591521327616647e-05, + "grad_norm": 4.235583305358887, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8878985047340393, + "num_tokens": 811655797.0, + "step": 21271 + }, + { + "epoch": 2.706017046177331, + "ewc_loss": 0.008502181619405746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.502182026859373e-05, + "grad_norm": 4.265044689178467, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8755275011062622, + "num_tokens": 811691626.0, + "step": 21272 + }, + { + "epoch": 2.7061442564559215, + "ewc_loss": 0.00856551993638277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565519965486601e-05, + "grad_norm": 4.281464099884033, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8847783803939819, + "num_tokens": 811726046.0, + "step": 21273 + }, + { + "epoch": 2.706271466734512, + "ewc_loss": 0.008573489263653755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573489321861416e-05, + "grad_norm": 4.263198375701904, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8737627267837524, + "num_tokens": 811765686.0, + "step": 21274 + }, + { + "epoch": 2.7063986770131025, + "ewc_loss": 0.00855653453618288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.556534885428846e-05, + "grad_norm": 4.2570271492004395, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8909033536911011, + "num_tokens": 811801143.0, + "step": 21275 + }, + { + "epoch": 2.706525887291693, + "ewc_loss": 0.008585629984736443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.585629984736443e-05, + "grad_norm": 4.283204078674316, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8889603018760681, + "num_tokens": 811834807.0, + "step": 21276 + }, + { + "epoch": 2.7066530975702836, + "ewc_loss": 0.008590089157223701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590089419158176e-05, + "grad_norm": 4.286167621612549, + "learning_rate": 1e-06, + "loss": 0.2888, + "mean_token_accuracy": 0.9004485607147217, + "num_tokens": 811865676.0, + "step": 21277 + }, + { + "epoch": 2.706780307848874, + "ewc_loss": 0.008593661710619926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59366191434674e-05, + "grad_norm": 4.29691219329834, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8807775974273682, + "num_tokens": 811901259.0, + "step": 21278 + }, + { + "epoch": 2.7069075181274647, + "ewc_loss": 0.00861030537635088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610305667389184e-05, + "grad_norm": 4.265157699584961, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8707625269889832, + "num_tokens": 811937768.0, + "step": 21279 + }, + { + "epoch": 2.707034728406055, + "ewc_loss": 0.00858315359801054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583153248764575e-05, + "grad_norm": 4.289921760559082, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8822392225265503, + "num_tokens": 811967805.0, + "step": 21280 + }, + { + "epoch": 2.7071619386846457, + "ewc_loss": 0.008623442612588406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623442408861592e-05, + "grad_norm": 4.2643866539001465, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8786501884460449, + "num_tokens": 812007468.0, + "step": 21281 + }, + { + "epoch": 2.7072891489632362, + "ewc_loss": 0.008591154590249062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.591154619352892e-05, + "grad_norm": 4.215187072753906, + "learning_rate": 1e-06, + "loss": 0.3279, + 
"mean_token_accuracy": 0.8841342329978943, + "num_tokens": 812048127.0, + "step": 21282 + }, + { + "epoch": 2.7074163592418268, + "ewc_loss": 0.008595493622124195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59549327287823e-05, + "grad_norm": 4.292387962341309, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8821265697479248, + "num_tokens": 812083904.0, + "step": 21283 + }, + { + "epoch": 2.7075435695204173, + "ewc_loss": 0.008642459288239479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.642459579277784e-05, + "grad_norm": 4.247473239898682, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8908819556236267, + "num_tokens": 812119643.0, + "step": 21284 + }, + { + "epoch": 2.707670779799008, + "ewc_loss": 0.008609217591583729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609217184130102e-05, + "grad_norm": 4.275249004364014, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8817682862281799, + "num_tokens": 812155338.0, + "step": 21285 + }, + { + "epoch": 2.7077979900775984, + "ewc_loss": 0.008627815172076225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627815259387717e-05, + "grad_norm": 4.227495193481445, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8827089667320251, + "num_tokens": 812193075.0, + "step": 21286 + }, + { + "epoch": 2.707925200356189, + "ewc_loss": 0.008618827909231186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618827996542677e-05, + "grad_norm": 4.293034076690674, + "learning_rate": 1e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8913906812667847, + "num_tokens": 812228163.0, + "step": 21287 + }, + { + "epoch": 2.7080524106347794, + "ewc_loss": 0.008666249923408031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.666249777888879e-05, + "grad_norm": 4.277486801147461, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8790897727012634, + "num_tokens": 812268565.0, + "step": 21288 + }, + { + "epoch": 
2.70817962091337, + "ewc_loss": 0.008626214228570461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.626214548712596e-05, + "grad_norm": 4.2691802978515625, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8892194628715515, + "num_tokens": 812307292.0, + "step": 21289 + }, + { + "epoch": 2.7083068311919605, + "ewc_loss": 0.008608720265328884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.608720236225054e-05, + "grad_norm": 4.269506931304932, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8942784070968628, + "num_tokens": 812346180.0, + "step": 21290 + }, + { + "epoch": 2.708434041470551, + "ewc_loss": 0.008621401153504848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.621400775155053e-05, + "grad_norm": 4.33026123046875, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8814566135406494, + "num_tokens": 812376033.0, + "step": 21291 + }, + { + "epoch": 2.708561251749141, + "ewc_loss": 0.00864015705883503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.640157466288656e-05, + "grad_norm": 4.273861885070801, + "learning_rate": 1e-06, + "loss": 0.2909, + "mean_token_accuracy": 0.8976081609725952, + "num_tokens": 812407925.0, + "step": 21292 + }, + { + "epoch": 2.708688462027732, + "ewc_loss": 0.008579757995903492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579757559346035e-05, + "grad_norm": 4.2055206298828125, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8877140283584595, + "num_tokens": 812452354.0, + "step": 21293 + }, + { + "epoch": 2.708815672306322, + "ewc_loss": 0.008567318320274353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.567318582208827e-05, + "grad_norm": 4.254101753234863, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8843280076980591, + "num_tokens": 812490015.0, + "step": 21294 + }, + { + "epoch": 2.708942882584913, + "ewc_loss": 0.008631565608084202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.631566015537828e-05, + "grad_norm": 4.248176574707031, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8798633217811584, + "num_tokens": 812529919.0, + "step": 21295 + }, + { + "epoch": 2.709070092863503, + "ewc_loss": 0.008586988784372807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586989133618772e-05, + "grad_norm": 4.230453014373779, + "learning_rate": 1e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8947533369064331, + "num_tokens": 812568541.0, + "step": 21296 + }, + { + "epoch": 2.7091973031420937, + "ewc_loss": 0.008591797202825546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.591797086410224e-05, + "grad_norm": 4.2135748863220215, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8747820854187012, + "num_tokens": 812612098.0, + "step": 21297 + }, + { + "epoch": 2.7093245134206843, + "ewc_loss": 0.00857319962233305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573199738748372e-05, + "grad_norm": 4.305475234985352, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8790048360824585, + "num_tokens": 812648843.0, + "step": 21298 + }, + { + "epoch": 2.709451723699275, + "ewc_loss": 0.008631751872599125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631752280052751e-05, + "grad_norm": 4.1875505447387695, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8832125663757324, + "num_tokens": 812692080.0, + "step": 21299 + }, + { + "epoch": 2.7095789339778653, + "ewc_loss": 0.008514367043972015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514367073075846e-05, + "grad_norm": 4.222580909729004, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8960000872612, + "num_tokens": 812729441.0, + "step": 21300 + }, + { + "epoch": 2.709706144256456, + "ewc_loss": 0.008589639328420162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589639764977619e-05, + "grad_norm": 4.2671613693237305, + "learning_rate": 1e-06, + "loss": 0.3603, + 
"mean_token_accuracy": 0.8763453960418701, + "num_tokens": 812768908.0, + "step": 21301 + }, + { + "epoch": 2.7098333545350464, + "ewc_loss": 0.008576207794249058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576207619626075e-05, + "grad_norm": 4.225614070892334, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8834991455078125, + "num_tokens": 812808384.0, + "step": 21302 + }, + { + "epoch": 2.709960564813637, + "ewc_loss": 0.008541017770767212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541017450625077e-05, + "grad_norm": 4.2604594230651855, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8750075101852417, + "num_tokens": 812847658.0, + "step": 21303 + }, + { + "epoch": 2.7100877750922274, + "ewc_loss": 0.008564692921936512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.564692689105868e-05, + "grad_norm": 4.224545478820801, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8850798010826111, + "num_tokens": 812889394.0, + "step": 21304 + }, + { + "epoch": 2.710214985370818, + "ewc_loss": 0.008536892011761665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536891982657835e-05, + "grad_norm": 4.312659740447998, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8740260601043701, + "num_tokens": 812924202.0, + "step": 21305 + }, + { + "epoch": 2.7103421956494085, + "ewc_loss": 0.008609466254711151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609466021880507e-05, + "grad_norm": 4.299856185913086, + "learning_rate": 1e-06, + "loss": 0.2773, + "mean_token_accuracy": 0.9013973474502563, + "num_tokens": 812954391.0, + "step": 21306 + }, + { + "epoch": 2.710469405927999, + "ewc_loss": 0.008558612316846848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558612171327695e-05, + "grad_norm": 4.264986038208008, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8768783211708069, + "num_tokens": 812989498.0, + "step": 21307 + }, + { + "epoch": 
2.7105966162065895, + "ewc_loss": 0.008569275960326195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569275814807042e-05, + "grad_norm": 4.216976165771484, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8901798725128174, + "num_tokens": 813030968.0, + "step": 21308 + }, + { + "epoch": 2.71072382648518, + "ewc_loss": 0.008569869212806225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569868805352598e-05, + "grad_norm": 4.244571208953857, + "learning_rate": 1e-06, + "loss": 0.2729, + "mean_token_accuracy": 0.9037985801696777, + "num_tokens": 813065843.0, + "step": 21309 + }, + { + "epoch": 2.7108510367637706, + "ewc_loss": 0.008573737926781178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573738159611821e-05, + "grad_norm": 4.211154460906982, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8901842832565308, + "num_tokens": 813107410.0, + "step": 21310 + }, + { + "epoch": 2.710978247042361, + "ewc_loss": 0.008569487370550632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569487545173615e-05, + "grad_norm": 4.230024337768555, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8879365921020508, + "num_tokens": 813155959.0, + "step": 21311 + }, + { + "epoch": 2.7111054573209517, + "ewc_loss": 0.008566935546696186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566935866838321e-05, + "grad_norm": 4.243382930755615, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8908971548080444, + "num_tokens": 813194133.0, + "step": 21312 + }, + { + "epoch": 2.711232667599542, + "ewc_loss": 0.008554758504033089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.554758096579462e-05, + "grad_norm": 4.275990009307861, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8709498643875122, + "num_tokens": 813231015.0, + "step": 21313 + }, + { + "epoch": 2.7113598778781327, + "ewc_loss": 0.008582675829529762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.582675945945084e-05, + "grad_norm": 4.191357612609863, + "learning_rate": 1e-06, + "loss": 0.285, + "mean_token_accuracy": 0.8984683156013489, + "num_tokens": 813271497.0, + "step": 21314 + }, + { + "epoch": 2.711487088156723, + "ewc_loss": 0.008501091040670872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501091360813007e-05, + "grad_norm": 4.297311305999756, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.885332465171814, + "num_tokens": 813305758.0, + "step": 21315 + }, + { + "epoch": 2.7116142984353138, + "ewc_loss": 0.008620203472673893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62020388012752e-05, + "grad_norm": 4.239823818206787, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8820597529411316, + "num_tokens": 813345958.0, + "step": 21316 + }, + { + "epoch": 2.711741508713904, + "ewc_loss": 0.008518695831298828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518695540260524e-05, + "grad_norm": 4.27952766418457, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8882326483726501, + "num_tokens": 813381630.0, + "step": 21317 + }, + { + "epoch": 2.711868718992495, + "ewc_loss": 0.008561803959310055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.561804133933038e-05, + "grad_norm": 4.273905277252197, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8851551413536072, + "num_tokens": 813421320.0, + "step": 21318 + }, + { + "epoch": 2.711995929271085, + "ewc_loss": 0.008552261628210545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552261715522036e-05, + "grad_norm": 4.291127681732178, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8767421841621399, + "num_tokens": 813458323.0, + "step": 21319 + }, + { + "epoch": 2.712123139549676, + "ewc_loss": 0.008550358936190605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.550359052605927e-05, + "grad_norm": 4.254769325256348, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 
0.8653701543807983, + "num_tokens": 813501436.0, + "step": 21320 + }, + { + "epoch": 2.712250349828266, + "ewc_loss": 0.008509410545229912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509410690749064e-05, + "grad_norm": 4.217325210571289, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8854067325592041, + "num_tokens": 813545942.0, + "step": 21321 + }, + { + "epoch": 2.7123775601068565, + "ewc_loss": 0.008511832915246487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511832857038826e-05, + "grad_norm": 4.242048263549805, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8796347379684448, + "num_tokens": 813582202.0, + "step": 21322 + }, + { + "epoch": 2.712504770385447, + "ewc_loss": 0.008538179099559784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538179099559784e-05, + "grad_norm": 4.265472888946533, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8697331547737122, + "num_tokens": 813621711.0, + "step": 21323 + }, + { + "epoch": 2.7126319806640375, + "ewc_loss": 0.008526584133505821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.526584133505821e-05, + "grad_norm": 4.209094047546387, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8786559700965881, + "num_tokens": 813665404.0, + "step": 21324 + }, + { + "epoch": 2.712759190942628, + "ewc_loss": 0.008498969487845898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498969691572711e-05, + "grad_norm": 4.3038506507873535, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8899027109146118, + "num_tokens": 813697470.0, + "step": 21325 + }, + { + "epoch": 2.7128864012212186, + "ewc_loss": 0.008573959581553936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573959348723292e-05, + "grad_norm": 4.2261576652526855, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.889430046081543, + "num_tokens": 813733308.0, + "step": 21326 + }, + { + "epoch": 2.713013611499809, + "ewc_loss": 
0.008500260300934315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500260446453467e-05, + "grad_norm": 4.289723873138428, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8701868057250977, + "num_tokens": 813770881.0, + "step": 21327 + }, + { + "epoch": 2.7131408217783997, + "ewc_loss": 0.008571377955377102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57137783896178e-05, + "grad_norm": 4.273199558258057, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8900178670883179, + "num_tokens": 813806495.0, + "step": 21328 + }, + { + "epoch": 2.71326803205699, + "ewc_loss": 0.008552493527531624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552493090974167e-05, + "grad_norm": 4.2578816413879395, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8880845308303833, + "num_tokens": 813842408.0, + "step": 21329 + }, + { + "epoch": 2.7133952423355807, + "ewc_loss": 0.008559993468225002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559993875678629e-05, + "grad_norm": 4.234921932220459, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8794271945953369, + "num_tokens": 813883041.0, + "step": 21330 + }, + { + "epoch": 2.7135224526141712, + "ewc_loss": 0.008560745045542717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560745482100174e-05, + "grad_norm": 4.231071949005127, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.881098210811615, + "num_tokens": 813922901.0, + "step": 21331 + }, + { + "epoch": 2.7136496628927618, + "ewc_loss": 0.00857680756598711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576807886129245e-05, + "grad_norm": 4.267004013061523, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8865336179733276, + "num_tokens": 813961948.0, + "step": 21332 + }, + { + "epoch": 2.7137768731713523, + "ewc_loss": 0.008605172857642174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.605173206888139e-05, + "grad_norm": 
4.277350425720215, + "learning_rate": 1e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.8969053626060486, + "num_tokens": 813999194.0, + "step": 21333 + }, + { + "epoch": 2.713904083449943, + "ewc_loss": 0.008574510924518108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574510866310447e-05, + "grad_norm": 4.266571044921875, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8739088773727417, + "num_tokens": 814038986.0, + "step": 21334 + }, + { + "epoch": 2.7140312937285334, + "ewc_loss": 0.008567395620048046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.567395707359537e-05, + "grad_norm": 4.238490104675293, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8961679339408875, + "num_tokens": 814074351.0, + "step": 21335 + }, + { + "epoch": 2.714158504007124, + "ewc_loss": 0.008581157773733139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581158181186765e-05, + "grad_norm": 4.314651966094971, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8840776681900024, + "num_tokens": 814109255.0, + "step": 21336 + }, + { + "epoch": 2.7142857142857144, + "ewc_loss": 0.008614888414740562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614888793090358e-05, + "grad_norm": 4.280800819396973, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.877894937992096, + "num_tokens": 814146601.0, + "step": 21337 + }, + { + "epoch": 2.714412924564305, + "ewc_loss": 0.008555255830287933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555255772080272e-05, + "grad_norm": 4.291240692138672, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8890243172645569, + "num_tokens": 814181299.0, + "step": 21338 + }, + { + "epoch": 2.7145401348428955, + "ewc_loss": 0.008589536882936954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589537173975259e-05, + "grad_norm": 4.212835311889648, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8913637399673462, + 
"num_tokens": 814218178.0, + "step": 21339 + }, + { + "epoch": 2.7146673451214856, + "ewc_loss": 0.008533387444913387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533387881470844e-05, + "grad_norm": 4.252851963043213, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8865671157836914, + "num_tokens": 814254161.0, + "step": 21340 + }, + { + "epoch": 2.7147945554000765, + "ewc_loss": 0.008591636084020138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.591636287746951e-05, + "grad_norm": 4.26613712310791, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8835768699645996, + "num_tokens": 814290164.0, + "step": 21341 + }, + { + "epoch": 2.7149217656786666, + "ewc_loss": 0.008588405326008797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.588405034970492e-05, + "grad_norm": 4.252427577972412, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8647893667221069, + "num_tokens": 814334459.0, + "step": 21342 + }, + { + "epoch": 2.7150489759572576, + "ewc_loss": 0.008565349504351616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565349708078429e-05, + "grad_norm": 4.246942043304443, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.875584602355957, + "num_tokens": 814374291.0, + "step": 21343 + }, + { + "epoch": 2.7151761862358477, + "ewc_loss": 0.008586985059082508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586984768044204e-05, + "grad_norm": 4.299404144287109, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8789182901382446, + "num_tokens": 814413196.0, + "step": 21344 + }, + { + "epoch": 2.7153033965144386, + "ewc_loss": 0.008604947477579117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6049476522021e-05, + "grad_norm": 4.2410712242126465, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8816406726837158, + "num_tokens": 814455312.0, + "step": 21345 + }, + { + "epoch": 2.7154306067930287, + "ewc_loss": 0.008546491153538227, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546491153538227e-05, + "grad_norm": 4.245968341827393, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8795926570892334, + "num_tokens": 814496586.0, + "step": 21346 + }, + { + "epoch": 2.7155578170716193, + "ewc_loss": 0.008559889160096645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559889101888984e-05, + "grad_norm": 4.250493049621582, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8899586200714111, + "num_tokens": 814531566.0, + "step": 21347 + }, + { + "epoch": 2.71568502735021, + "ewc_loss": 0.008565673604607582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.56567348819226e-05, + "grad_norm": 4.295546531677246, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8897528052330017, + "num_tokens": 814566168.0, + "step": 21348 + }, + { + "epoch": 2.7158122376288003, + "ewc_loss": 0.008580711670219898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580711437389255e-05, + "grad_norm": 4.218647003173828, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8844864368438721, + "num_tokens": 814604927.0, + "step": 21349 + }, + { + "epoch": 2.715939447907391, + "ewc_loss": 0.008513723500072956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513723878422752e-05, + "grad_norm": 4.254113674163818, + "learning_rate": 1e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8962501883506775, + "num_tokens": 814641401.0, + "step": 21350 + }, + { + "epoch": 2.7160666581859814, + "ewc_loss": 0.008579453453421593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579453424317762e-05, + "grad_norm": 4.2765913009643555, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8890116214752197, + "num_tokens": 814674252.0, + "step": 21351 + }, + { + "epoch": 2.716193868464572, + "ewc_loss": 0.008582868613302708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58286875882186e-05, + "grad_norm": 4.335236072540283, + "learning_rate": 
1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8836181163787842, + "num_tokens": 814704965.0, + "step": 21352 + }, + { + "epoch": 2.7163210787431624, + "ewc_loss": 0.00859916117042303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59916108311154e-05, + "grad_norm": 4.263288974761963, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.856823742389679, + "num_tokens": 814743740.0, + "step": 21353 + }, + { + "epoch": 2.716448289021753, + "ewc_loss": 0.008571431040763855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.571430953452364e-05, + "grad_norm": 4.254615783691406, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8896591663360596, + "num_tokens": 814784598.0, + "step": 21354 + }, + { + "epoch": 2.7165754993003435, + "ewc_loss": 0.008587255142629147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58725470607169e-05, + "grad_norm": 4.232057571411133, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8774334192276001, + "num_tokens": 814821437.0, + "step": 21355 + }, + { + "epoch": 2.716702709578934, + "ewc_loss": 0.0085804658010602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580465510021895e-05, + "grad_norm": 4.224132537841797, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8848841190338135, + "num_tokens": 814861638.0, + "step": 21356 + }, + { + "epoch": 2.7168299198575245, + "ewc_loss": 0.008599415421485901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.599415741628036e-05, + "grad_norm": 4.302324295043945, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8836084008216858, + "num_tokens": 814893799.0, + "step": 21357 + }, + { + "epoch": 2.716957130136115, + "ewc_loss": 0.00865885242819786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.658852311782539e-05, + "grad_norm": 4.21323299407959, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8912485837936401, + "num_tokens": 814930726.0, + "step": 21358 + }, + { + 
"epoch": 2.7170843404147056, + "ewc_loss": 0.00855928473174572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559285197407007e-05, + "grad_norm": 4.198409080505371, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8820360898971558, + "num_tokens": 814970041.0, + "step": 21359 + }, + { + "epoch": 2.717211550693296, + "ewc_loss": 0.008596556261181831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.596556290285662e-05, + "grad_norm": 4.239715099334717, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8868417143821716, + "num_tokens": 815008587.0, + "step": 21360 + }, + { + "epoch": 2.7173387609718866, + "ewc_loss": 0.008613122627139091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613122190581635e-05, + "grad_norm": 4.247732162475586, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8717716932296753, + "num_tokens": 815047659.0, + "step": 21361 + }, + { + "epoch": 2.717465971250477, + "ewc_loss": 0.008592613972723484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592614176450297e-05, + "grad_norm": 4.271729946136475, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8843430280685425, + "num_tokens": 815082631.0, + "step": 21362 + }, + { + "epoch": 2.7175931815290677, + "ewc_loss": 0.008619691245257854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61969092511572e-05, + "grad_norm": 4.262876510620117, + "learning_rate": 1e-06, + "loss": 0.2797, + "mean_token_accuracy": 0.9020639657974243, + "num_tokens": 815117568.0, + "step": 21363 + }, + { + "epoch": 2.7177203918076582, + "ewc_loss": 0.008577272295951843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57727209222503e-05, + "grad_norm": 4.2500762939453125, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8851502537727356, + "num_tokens": 815155019.0, + "step": 21364 + }, + { + "epoch": 2.7178476020862483, + "ewc_loss": 0.008569257333874702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.569257624913007e-05, + "grad_norm": 4.2454352378845215, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.884495735168457, + "num_tokens": 815195386.0, + "step": 21365 + }, + { + "epoch": 2.7179748123648393, + "ewc_loss": 0.008565518073737621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565517782699317e-05, + "grad_norm": 4.323328495025635, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8890838623046875, + "num_tokens": 815228304.0, + "step": 21366 + }, + { + "epoch": 2.7181020226434294, + "ewc_loss": 0.008605689741671085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.605689799878746e-05, + "grad_norm": 4.281480312347412, + "learning_rate": 1e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.9025741219520569, + "num_tokens": 815260111.0, + "step": 21367 + }, + { + "epoch": 2.7182292329220203, + "ewc_loss": 0.008562763221561909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562763105146587e-05, + "grad_norm": 4.25541877746582, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8837487101554871, + "num_tokens": 815297272.0, + "step": 21368 + }, + { + "epoch": 2.7183564432006104, + "ewc_loss": 0.008541620336472988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541620627511293e-05, + "grad_norm": 4.286703109741211, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.882483720779419, + "num_tokens": 815336016.0, + "step": 21369 + }, + { + "epoch": 2.718483653479201, + "ewc_loss": 0.008581273257732391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58127314131707e-05, + "grad_norm": 4.363279342651367, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.85923171043396, + "num_tokens": 815373940.0, + "step": 21370 + }, + { + "epoch": 2.7186108637577915, + "ewc_loss": 0.00861036404967308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610363875050098e-05, + "grad_norm": 4.280084133148193, + "learning_rate": 1e-06, + "loss": 0.3626, + 
"mean_token_accuracy": 0.8743125200271606, + "num_tokens": 815410836.0, + "step": 21371 + }, + { + "epoch": 2.718738074036382, + "ewc_loss": 0.008541596122086048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541595889255404e-05, + "grad_norm": 4.244543075561523, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8882606029510498, + "num_tokens": 815449577.0, + "step": 21372 + }, + { + "epoch": 2.7188652843149725, + "ewc_loss": 0.008564788848161697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.564788731746376e-05, + "grad_norm": 4.320013046264648, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8849880695343018, + "num_tokens": 815485869.0, + "step": 21373 + }, + { + "epoch": 2.718992494593563, + "ewc_loss": 0.008624473586678505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624473412055522e-05, + "grad_norm": 4.284811973571777, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8659683465957642, + "num_tokens": 815523614.0, + "step": 21374 + }, + { + "epoch": 2.7191197048721536, + "ewc_loss": 0.008564116433262825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.564116433262825e-05, + "grad_norm": 4.2861127853393555, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8652155995368958, + "num_tokens": 815561835.0, + "step": 21375 + }, + { + "epoch": 2.719246915150744, + "ewc_loss": 0.008592387661337852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592387894168496e-05, + "grad_norm": 4.2135491371154785, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8689898252487183, + "num_tokens": 815605596.0, + "step": 21376 + }, + { + "epoch": 2.7193741254293347, + "ewc_loss": 0.008538953959941864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538953989045694e-05, + "grad_norm": 4.286755084991455, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8792848587036133, + "num_tokens": 815638720.0, + "step": 21377 + }, + { + "epoch": 
2.719501335707925, + "ewc_loss": 0.008623004890978336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62300512380898e-05, + "grad_norm": 4.232123374938965, + "learning_rate": 1e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.8974000215530396, + "num_tokens": 815673996.0, + "step": 21378 + }, + { + "epoch": 2.7196285459865157, + "ewc_loss": 0.00857473909854889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574739331379533e-05, + "grad_norm": 4.272241592407227, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8723338842391968, + "num_tokens": 815716571.0, + "step": 21379 + }, + { + "epoch": 2.7197557562651062, + "ewc_loss": 0.008609971031546593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609970973338932e-05, + "grad_norm": 4.256814956665039, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8903555274009705, + "num_tokens": 815748973.0, + "step": 21380 + }, + { + "epoch": 2.7198829665436968, + "ewc_loss": 0.008600212633609772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600212458986789e-05, + "grad_norm": 4.212100505828857, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8870251178741455, + "num_tokens": 815794754.0, + "step": 21381 + }, + { + "epoch": 2.7200101768222873, + "ewc_loss": 0.00857740268111229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577403059462085e-05, + "grad_norm": 4.243828296661377, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.88763427734375, + "num_tokens": 815831051.0, + "step": 21382 + }, + { + "epoch": 2.720137387100878, + "ewc_loss": 0.00862276740372181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622767199994996e-05, + "grad_norm": 4.214948654174805, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8851722478866577, + "num_tokens": 815874291.0, + "step": 21383 + }, + { + "epoch": 2.7202645973794684, + "ewc_loss": 0.008564014919102192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.564015297451988e-05, + "grad_norm": 4.283790111541748, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8818837404251099, + "num_tokens": 815912484.0, + "step": 21384 + }, + { + "epoch": 2.720391807658059, + "ewc_loss": 0.008628115057945251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.628115028841421e-05, + "grad_norm": 4.297923564910889, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.867462694644928, + "num_tokens": 815948626.0, + "step": 21385 + }, + { + "epoch": 2.7205190179366494, + "ewc_loss": 0.008583509363234043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583509043091908e-05, + "grad_norm": 4.247609615325928, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8919631838798523, + "num_tokens": 815983154.0, + "step": 21386 + }, + { + "epoch": 2.72064622821524, + "ewc_loss": 0.008575722575187683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57572304084897e-05, + "grad_norm": 4.3164472579956055, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8885904550552368, + "num_tokens": 816017524.0, + "step": 21387 + }, + { + "epoch": 2.7207734384938305, + "ewc_loss": 0.008625555783510208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625555346952751e-05, + "grad_norm": 4.2799577713012695, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.879907488822937, + "num_tokens": 816055204.0, + "step": 21388 + }, + { + "epoch": 2.720900648772421, + "ewc_loss": 0.008551406674087048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55140679050237e-05, + "grad_norm": 4.297109603881836, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8896686434745789, + "num_tokens": 816086671.0, + "step": 21389 + }, + { + "epoch": 2.721027859051011, + "ewc_loss": 0.008602302521467209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602302841609344e-05, + "grad_norm": 4.474571704864502, + "learning_rate": 1e-06, + "loss": 0.4065, + 
"mean_token_accuracy": 0.8644962310791016, + "num_tokens": 816114173.0, + "step": 21390 + }, + { + "epoch": 2.721155069329602, + "ewc_loss": 0.008698675781488419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.698675810592249e-05, + "grad_norm": 4.236824035644531, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8943194150924683, + "num_tokens": 816156434.0, + "step": 21391 + }, + { + "epoch": 2.721282279608192, + "ewc_loss": 0.008525165729224682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.525166049366817e-05, + "grad_norm": 4.301397800445557, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8790454864501953, + "num_tokens": 816191037.0, + "step": 21392 + }, + { + "epoch": 2.721409489886783, + "ewc_loss": 0.008651954121887684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.651953976368532e-05, + "grad_norm": 4.220149517059326, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8955591917037964, + "num_tokens": 816227031.0, + "step": 21393 + }, + { + "epoch": 2.721536700165373, + "ewc_loss": 0.008575218729674816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575218816986308e-05, + "grad_norm": 4.224844932556152, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8861995339393616, + "num_tokens": 816264308.0, + "step": 21394 + }, + { + "epoch": 2.7216639104439637, + "ewc_loss": 0.008622352033853531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622351742815226e-05, + "grad_norm": 4.303266525268555, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8840956687927246, + "num_tokens": 816300955.0, + "step": 21395 + }, + { + "epoch": 2.7217911207225542, + "ewc_loss": 0.00866122916340828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.661229367135093e-05, + "grad_norm": 4.324239253997803, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8865447640419006, + "num_tokens": 816332881.0, + "step": 21396 + }, + { + "epoch": 
2.7219183310011448, + "ewc_loss": 0.008654509671032429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.654509292682633e-05, + "grad_norm": 4.235044002532959, + "learning_rate": 1e-06, + "loss": 0.2831, + "mean_token_accuracy": 0.9012526273727417, + "num_tokens": 816367446.0, + "step": 21397 + }, + { + "epoch": 2.7220455412797353, + "ewc_loss": 0.008609005250036716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60900545376353e-05, + "grad_norm": 4.290927410125732, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8679023385047913, + "num_tokens": 816407286.0, + "step": 21398 + }, + { + "epoch": 2.722172751558326, + "ewc_loss": 0.008671754971146584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67175476741977e-05, + "grad_norm": 4.221549987792969, + "learning_rate": 1e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8919281363487244, + "num_tokens": 816445277.0, + "step": 21399 + }, + { + "epoch": 2.7222999618369164, + "ewc_loss": 0.008612855337560177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612855162937194e-05, + "grad_norm": 4.290165424346924, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8878533840179443, + "num_tokens": 816481638.0, + "step": 21400 + }, + { + "epoch": 2.722427172115507, + "ewc_loss": 0.008692736737430096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.692737173987553e-05, + "grad_norm": 4.281883716583252, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8838667869567871, + "num_tokens": 816521386.0, + "step": 21401 + }, + { + "epoch": 2.7225543823940974, + "ewc_loss": 0.008655577898025513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.655578130856156e-05, + "grad_norm": 4.300116539001465, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8944648504257202, + "num_tokens": 816554760.0, + "step": 21402 + }, + { + "epoch": 2.722681592672688, + "ewc_loss": 0.008643289096653461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.6432890384458e-05, + "grad_norm": 4.237396717071533, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8911962509155273, + "num_tokens": 816595195.0, + "step": 21403 + }, + { + "epoch": 2.7228088029512785, + "ewc_loss": 0.008599923923611641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.599924331065267e-05, + "grad_norm": 4.2463860511779785, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8865044713020325, + "num_tokens": 816633046.0, + "step": 21404 + }, + { + "epoch": 2.722936013229869, + "ewc_loss": 0.00862732995301485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62732995301485e-05, + "grad_norm": 4.202407360076904, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8979129791259766, + "num_tokens": 816675283.0, + "step": 21405 + }, + { + "epoch": 2.7230632235084595, + "ewc_loss": 0.008599674329161644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59967403812334e-05, + "grad_norm": 4.258794784545898, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8757892847061157, + "num_tokens": 816716499.0, + "step": 21406 + }, + { + "epoch": 2.72319043378705, + "ewc_loss": 0.008624798618257046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624798647360876e-05, + "grad_norm": 4.214517116546631, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8777132034301758, + "num_tokens": 816761825.0, + "step": 21407 + }, + { + "epoch": 2.7233176440656406, + "ewc_loss": 0.008574073202908039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574072853662074e-05, + "grad_norm": 4.347954750061035, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8889092206954956, + "num_tokens": 816792080.0, + "step": 21408 + }, + { + "epoch": 2.723444854344231, + "ewc_loss": 0.0086400480940938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.640048326924443e-05, + "grad_norm": 4.255453109741211, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 
0.8734862804412842, + "num_tokens": 816832121.0, + "step": 21409 + }, + { + "epoch": 2.7235720646228216, + "ewc_loss": 0.008558500558137894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558500121580437e-05, + "grad_norm": 4.286430358886719, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8780804872512817, + "num_tokens": 816874032.0, + "step": 21410 + }, + { + "epoch": 2.723699274901412, + "ewc_loss": 0.008576294407248497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576294203521684e-05, + "grad_norm": 4.2016730308532715, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8666759133338928, + "num_tokens": 816920471.0, + "step": 21411 + }, + { + "epoch": 2.7238264851800027, + "ewc_loss": 0.008528768084943295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528768375981599e-05, + "grad_norm": 4.279804229736328, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8851616382598877, + "num_tokens": 816958042.0, + "step": 21412 + }, + { + "epoch": 2.723953695458593, + "ewc_loss": 0.008592446334660053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59244610182941e-05, + "grad_norm": 4.344334602355957, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8756814002990723, + "num_tokens": 816992158.0, + "step": 21413 + }, + { + "epoch": 2.7240809057371838, + "ewc_loss": 0.008606985211372375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.606985647929832e-05, + "grad_norm": 4.216848373413086, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8727153539657593, + "num_tokens": 817032552.0, + "step": 21414 + }, + { + "epoch": 2.724208116015774, + "ewc_loss": 0.008511796593666077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511796477250755e-05, + "grad_norm": 4.249761581420898, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8812285661697388, + "num_tokens": 817072240.0, + "step": 21415 + }, + { + "epoch": 2.724335326294365, + "ewc_loss": 
0.008578377775847912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578378037782386e-05, + "grad_norm": 4.304058074951172, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8744049668312073, + "num_tokens": 817104741.0, + "step": 21416 + }, + { + "epoch": 2.724462536572955, + "ewc_loss": 0.008603411726653576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603411697549745e-05, + "grad_norm": 4.2448039054870605, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8708351850509644, + "num_tokens": 817143876.0, + "step": 21417 + }, + { + "epoch": 2.724589746851546, + "ewc_loss": 0.008558058179914951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558057743357494e-05, + "grad_norm": 4.1944684982299805, + "learning_rate": 1e-06, + "loss": 0.2692, + "mean_token_accuracy": 0.9049310684204102, + "num_tokens": 817183647.0, + "step": 21418 + }, + { + "epoch": 2.724716957130136, + "ewc_loss": 0.008566772565245628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566772157792002e-05, + "grad_norm": 4.3434882164001465, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8782286047935486, + "num_tokens": 817214520.0, + "step": 21419 + }, + { + "epoch": 2.7248441674087265, + "ewc_loss": 0.008667761459946632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667760994285345e-05, + "grad_norm": 4.248063087463379, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8742897510528564, + "num_tokens": 817256936.0, + "step": 21420 + }, + { + "epoch": 2.724971377687317, + "ewc_loss": 0.008548976853489876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548976620659232e-05, + "grad_norm": 4.234889507293701, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8847773671150208, + "num_tokens": 817292840.0, + "step": 21421 + }, + { + "epoch": 2.7250985879659075, + "ewc_loss": 0.008595108985900879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595109102316201e-05, + "grad_norm": 
4.272015571594238, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8801953792572021, + "num_tokens": 817332872.0, + "step": 21422 + }, + { + "epoch": 2.725225798244498, + "ewc_loss": 0.00858171284198761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581712609156966e-05, + "grad_norm": 4.256583213806152, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8917750120162964, + "num_tokens": 817367668.0, + "step": 21423 + }, + { + "epoch": 2.7253530085230886, + "ewc_loss": 0.008577710017561913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577710104873404e-05, + "grad_norm": 4.206233024597168, + "learning_rate": 1e-06, + "loss": 0.2846, + "mean_token_accuracy": 0.901029109954834, + "num_tokens": 817407726.0, + "step": 21424 + }, + { + "epoch": 2.725480218801679, + "ewc_loss": 0.008565641939640045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565641473978758e-05, + "grad_norm": 4.2709832191467285, + "learning_rate": 1e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.893244743347168, + "num_tokens": 817441003.0, + "step": 21425 + }, + { + "epoch": 2.7256074290802697, + "ewc_loss": 0.008595352061092854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595352119300514e-05, + "grad_norm": 4.214193820953369, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8892543911933899, + "num_tokens": 817482587.0, + "step": 21426 + }, + { + "epoch": 2.72573463935886, + "ewc_loss": 0.008570239879190922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570239879190922e-05, + "grad_norm": 4.271327495574951, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8886749744415283, + "num_tokens": 817519727.0, + "step": 21427 + }, + { + "epoch": 2.7258618496374507, + "ewc_loss": 0.008593596518039703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.593596430728212e-05, + "grad_norm": 4.297756195068359, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8821473121643066, + 
"num_tokens": 817553809.0, + "step": 21428 + }, + { + "epoch": 2.7259890599160412, + "ewc_loss": 0.008589518256485462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589518256485462e-05, + "grad_norm": 4.317958354949951, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.869051992893219, + "num_tokens": 817588893.0, + "step": 21429 + }, + { + "epoch": 2.7261162701946318, + "ewc_loss": 0.008597539737820625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5975399997551e-05, + "grad_norm": 4.225400924682617, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8896732330322266, + "num_tokens": 817627789.0, + "step": 21430 + }, + { + "epoch": 2.7262434804732223, + "ewc_loss": 0.008529885672032833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529885235475376e-05, + "grad_norm": 4.27637243270874, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8746350407600403, + "num_tokens": 817670048.0, + "step": 21431 + }, + { + "epoch": 2.726370690751813, + "ewc_loss": 0.00859754253178835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597542910138145e-05, + "grad_norm": 4.2585129737854, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8888946175575256, + "num_tokens": 817707108.0, + "step": 21432 + }, + { + "epoch": 2.7264979010304033, + "ewc_loss": 0.008562876842916012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562876610085368e-05, + "grad_norm": 4.224599838256836, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8829863667488098, + "num_tokens": 817748126.0, + "step": 21433 + }, + { + "epoch": 2.726625111308994, + "ewc_loss": 0.00854895357042551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548953337594867e-05, + "grad_norm": 4.277565002441406, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8860907554626465, + "num_tokens": 817781578.0, + "step": 21434 + }, + { + "epoch": 2.7267523215875844, + "ewc_loss": 0.008605711162090302, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.605711627751589e-05, + "grad_norm": 4.362999439239502, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8906539678573608, + "num_tokens": 817822143.0, + "step": 21435 + }, + { + "epoch": 2.726879531866175, + "ewc_loss": 0.008621317334473133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62131710164249e-05, + "grad_norm": 4.246590614318848, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8935763239860535, + "num_tokens": 817858456.0, + "step": 21436 + }, + { + "epoch": 2.7270067421447655, + "ewc_loss": 0.008527799509465694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52779921842739e-05, + "grad_norm": 4.260488033294678, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8905190825462341, + "num_tokens": 817892863.0, + "step": 21437 + }, + { + "epoch": 2.7271339524233555, + "ewc_loss": 0.008582305163145065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582304872106761e-05, + "grad_norm": 4.247992515563965, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.886388897895813, + "num_tokens": 817931120.0, + "step": 21438 + }, + { + "epoch": 2.7272611627019465, + "ewc_loss": 0.008562819100916386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562819130020216e-05, + "grad_norm": 4.29583740234375, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8867586851119995, + "num_tokens": 817963076.0, + "step": 21439 + }, + { + "epoch": 2.7273883729805366, + "ewc_loss": 0.008617367595434189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61736771184951e-05, + "grad_norm": 4.230975151062012, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8841898441314697, + "num_tokens": 818003844.0, + "step": 21440 + }, + { + "epoch": 2.7275155832591276, + "ewc_loss": 0.008578196167945862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578196138842031e-05, + "grad_norm": 4.200222015380859, + "learning_rate": 
1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8873000741004944, + "num_tokens": 818048533.0, + "step": 21441 + }, + { + "epoch": 2.7276427935377177, + "ewc_loss": 0.008576917462050915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576917025493458e-05, + "grad_norm": 4.301496982574463, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8800754547119141, + "num_tokens": 818083290.0, + "step": 21442 + }, + { + "epoch": 2.7277700038163086, + "ewc_loss": 0.008644847199320793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.644847548566759e-05, + "grad_norm": 4.246994495391846, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8913888931274414, + "num_tokens": 818120037.0, + "step": 21443 + }, + { + "epoch": 2.7278972140948987, + "ewc_loss": 0.00856181513518095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.561815047869459e-05, + "grad_norm": 4.364532947540283, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8850042819976807, + "num_tokens": 818164122.0, + "step": 21444 + }, + { + "epoch": 2.7280244243734892, + "ewc_loss": 0.008646705187857151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64670510054566e-05, + "grad_norm": 4.294857978820801, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8859986662864685, + "num_tokens": 818198065.0, + "step": 21445 + }, + { + "epoch": 2.7281516346520798, + "ewc_loss": 0.008588964119553566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.588964556111023e-05, + "grad_norm": 4.2232561111450195, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.885379433631897, + "num_tokens": 818236843.0, + "step": 21446 + }, + { + "epoch": 2.7282788449306703, + "ewc_loss": 0.008578198030591011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578198321629316e-05, + "grad_norm": 4.2941155433654785, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8834340572357178, + "num_tokens": 818271300.0, + "step": 21447 + 
}, + { + "epoch": 2.728406055209261, + "ewc_loss": 0.008630216121673584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.630216325400397e-05, + "grad_norm": 4.244049549102783, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8711265325546265, + "num_tokens": 818311799.0, + "step": 21448 + }, + { + "epoch": 2.7285332654878514, + "ewc_loss": 0.008586293086409569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586292824475095e-05, + "grad_norm": 4.279496192932129, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8888179063796997, + "num_tokens": 818346662.0, + "step": 21449 + }, + { + "epoch": 2.728660475766442, + "ewc_loss": 0.008632472716271877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.632472599856555e-05, + "grad_norm": 4.314304351806641, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8866021633148193, + "num_tokens": 818384465.0, + "step": 21450 + }, + { + "epoch": 2.7287876860450324, + "ewc_loss": 0.008648941293358803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64894172991626e-05, + "grad_norm": 4.265819072723389, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8816752433776855, + "num_tokens": 818425625.0, + "step": 21451 + }, + { + "epoch": 2.728914896323623, + "ewc_loss": 0.008592311292886734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592311496613547e-05, + "grad_norm": 4.2772440910339355, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8888657093048096, + "num_tokens": 818458544.0, + "step": 21452 + }, + { + "epoch": 2.7290421066022135, + "ewc_loss": 0.0086255157366395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625515329185873e-05, + "grad_norm": 4.240328788757324, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.878964364528656, + "num_tokens": 818503507.0, + "step": 21453 + }, + { + "epoch": 2.729169316880804, + "ewc_loss": 0.00859495997428894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.59495994518511e-05, + "grad_norm": 4.24457311630249, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8909264206886292, + "num_tokens": 818545411.0, + "step": 21454 + }, + { + "epoch": 2.7292965271593945, + "ewc_loss": 0.008589095436036587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589095523348078e-05, + "grad_norm": 4.303122520446777, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8810555934906006, + "num_tokens": 818582100.0, + "step": 21455 + }, + { + "epoch": 2.729423737437985, + "ewc_loss": 0.008619477041065693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619477011961862e-05, + "grad_norm": 4.313664436340332, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.881137490272522, + "num_tokens": 818615549.0, + "step": 21456 + }, + { + "epoch": 2.7295509477165756, + "ewc_loss": 0.008616266772150993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616266859462485e-05, + "grad_norm": 4.247060298919678, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8888546228408813, + "num_tokens": 818652476.0, + "step": 21457 + }, + { + "epoch": 2.729678157995166, + "ewc_loss": 0.008553331717848778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553332008887082e-05, + "grad_norm": 4.24113130569458, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8875385522842407, + "num_tokens": 818692435.0, + "step": 21458 + }, + { + "epoch": 2.7298053682737566, + "ewc_loss": 0.008572890423238277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572890510549769e-05, + "grad_norm": 4.2497429847717285, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8842360973358154, + "num_tokens": 818733206.0, + "step": 21459 + }, + { + "epoch": 2.729932578552347, + "ewc_loss": 0.00857252161949873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57252161949873e-05, + "grad_norm": 4.264836311340332, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 
0.8856117129325867, + "num_tokens": 818771717.0, + "step": 21460 + }, + { + "epoch": 2.7300597888309377, + "ewc_loss": 0.008575169369578362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575169340474531e-05, + "grad_norm": 4.259491443634033, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8826438188552856, + "num_tokens": 818806999.0, + "step": 21461 + }, + { + "epoch": 2.7301869991095282, + "ewc_loss": 0.00855657272040844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55657272040844e-05, + "grad_norm": 4.237345218658447, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8894062042236328, + "num_tokens": 818846226.0, + "step": 21462 + }, + { + "epoch": 2.7303142093881183, + "ewc_loss": 0.008543489500880241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.543489093426615e-05, + "grad_norm": 4.247467041015625, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.890137255191803, + "num_tokens": 818885667.0, + "step": 21463 + }, + { + "epoch": 2.7304414196667093, + "ewc_loss": 0.00856300164014101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.563001756556332e-05, + "grad_norm": 4.271368503570557, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8901273608207703, + "num_tokens": 818924009.0, + "step": 21464 + }, + { + "epoch": 2.7305686299452994, + "ewc_loss": 0.008552922867238522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552923100069165e-05, + "grad_norm": 4.243788719177246, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.8931825160980225, + "num_tokens": 818962614.0, + "step": 21465 + }, + { + "epoch": 2.7306958402238903, + "ewc_loss": 0.008515127934515476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51512813824229e-05, + "grad_norm": 4.229588985443115, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.886114239692688, + "num_tokens": 819004486.0, + "step": 21466 + }, + { + "epoch": 2.7308230505024804, + "ewc_loss": 
0.008514084853231907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514084765920416e-05, + "grad_norm": 4.216419696807861, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8788949251174927, + "num_tokens": 819048364.0, + "step": 21467 + }, + { + "epoch": 2.730950260781071, + "ewc_loss": 0.008504633791744709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.504634024575353e-05, + "grad_norm": 4.285231113433838, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8812666535377502, + "num_tokens": 819086250.0, + "step": 21468 + }, + { + "epoch": 2.7310774710596615, + "ewc_loss": 0.008530241437256336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530241757398471e-05, + "grad_norm": 4.208522796630859, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8947691917419434, + "num_tokens": 819126705.0, + "step": 21469 + }, + { + "epoch": 2.731204681338252, + "ewc_loss": 0.008470313623547554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470314060105011e-05, + "grad_norm": 4.276157379150391, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8850350379943848, + "num_tokens": 819165967.0, + "step": 21470 + }, + { + "epoch": 2.7313318916168425, + "ewc_loss": 0.00851553026586771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515530498698354e-05, + "grad_norm": 4.238168716430664, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8806322813034058, + "num_tokens": 819208066.0, + "step": 21471 + }, + { + "epoch": 2.731459101895433, + "ewc_loss": 0.008477555587887764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.477555820718408e-05, + "grad_norm": 4.2903337478637695, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8816199898719788, + "num_tokens": 819246269.0, + "step": 21472 + }, + { + "epoch": 2.7315863121740236, + "ewc_loss": 0.00850802380591631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.508023893227801e-05, + "grad_norm": 
4.217158794403076, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8848987221717834, + "num_tokens": 819286027.0, + "step": 21473 + }, + { + "epoch": 2.731713522452614, + "ewc_loss": 0.00846700370311737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467003499390557e-05, + "grad_norm": 4.2658185958862305, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8927278518676758, + "num_tokens": 819319994.0, + "step": 21474 + }, + { + "epoch": 2.7318407327312046, + "ewc_loss": 0.0085181575268507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518157119397074e-05, + "grad_norm": 4.273157596588135, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8804040551185608, + "num_tokens": 819360113.0, + "step": 21475 + }, + { + "epoch": 2.731967943009795, + "ewc_loss": 0.00850177463144064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.501774573232979e-05, + "grad_norm": 4.267134666442871, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8912556767463684, + "num_tokens": 819394575.0, + "step": 21476 + }, + { + "epoch": 2.7320951532883857, + "ewc_loss": 0.008508602157235146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.508602331858128e-05, + "grad_norm": 4.282464027404785, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8799443244934082, + "num_tokens": 819434545.0, + "step": 21477 + }, + { + "epoch": 2.7322223635669762, + "ewc_loss": 0.00851132906973362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511329360771924e-05, + "grad_norm": 4.227891445159912, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8925189971923828, + "num_tokens": 819470699.0, + "step": 21478 + }, + { + "epoch": 2.7323495738455668, + "ewc_loss": 0.008490189909934998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490189793519676e-05, + "grad_norm": 4.269781589508057, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8788909912109375, + 
"num_tokens": 819514447.0, + "step": 21479 + }, + { + "epoch": 2.7324767841241573, + "ewc_loss": 0.00852658785879612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.526587771484628e-05, + "grad_norm": 4.291258811950684, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8792035579681396, + "num_tokens": 819553434.0, + "step": 21480 + }, + { + "epoch": 2.732603994402748, + "ewc_loss": 0.00852491706609726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524917211616412e-05, + "grad_norm": 4.290018081665039, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8777804374694824, + "num_tokens": 819591594.0, + "step": 21481 + }, + { + "epoch": 2.7327312046813383, + "ewc_loss": 0.008517337031662464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517337118973956e-05, + "grad_norm": 4.208698749542236, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8795197606086731, + "num_tokens": 819634530.0, + "step": 21482 + }, + { + "epoch": 2.732858414959929, + "ewc_loss": 0.008500649593770504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500649710185826e-05, + "grad_norm": 4.327391624450684, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8757035732269287, + "num_tokens": 819669358.0, + "step": 21483 + }, + { + "epoch": 2.7329856252385194, + "ewc_loss": 0.008585641160607338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585641626268625e-05, + "grad_norm": 4.221090316772461, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8831512331962585, + "num_tokens": 819711336.0, + "step": 21484 + }, + { + "epoch": 2.73311283551711, + "ewc_loss": 0.008487354032695293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.487354352837428e-05, + "grad_norm": 4.254917621612549, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8773995041847229, + "num_tokens": 819746973.0, + "step": 21485 + }, + { + "epoch": 2.7332400457957005, + "ewc_loss": 0.008564031682908535, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.564031304558739e-05, + "grad_norm": 4.274130344390869, + "learning_rate": 1e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8924820423126221, + "num_tokens": 819780305.0, + "step": 21486 + }, + { + "epoch": 2.733367256074291, + "ewc_loss": 0.008546130731701851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546130993636325e-05, + "grad_norm": 4.220068454742432, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8928859233856201, + "num_tokens": 819817326.0, + "step": 21487 + }, + { + "epoch": 2.733494466352881, + "ewc_loss": 0.008533433079719543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533432992408052e-05, + "grad_norm": 4.250445365905762, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8771410584449768, + "num_tokens": 819857597.0, + "step": 21488 + }, + { + "epoch": 2.733621676631472, + "ewc_loss": 0.008573787286877632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573786908527836e-05, + "grad_norm": 4.2956461906433105, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8857824802398682, + "num_tokens": 819893471.0, + "step": 21489 + }, + { + "epoch": 2.733748886910062, + "ewc_loss": 0.008578715845942497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578715642215684e-05, + "grad_norm": 4.289138317108154, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8669818639755249, + "num_tokens": 819934175.0, + "step": 21490 + }, + { + "epoch": 2.733876097188653, + "ewc_loss": 0.008568298071622849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.568297926103696e-05, + "grad_norm": 4.2432379722595215, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.891646146774292, + "num_tokens": 819971451.0, + "step": 21491 + }, + { + "epoch": 2.734003307467243, + "ewc_loss": 0.008542700670659542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542700379621238e-05, + "grad_norm": 4.233096122741699, + "learning_rate": 
1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8785898089408875, + "num_tokens": 820011566.0, + "step": 21492 + }, + { + "epoch": 2.7341305177458337, + "ewc_loss": 0.008555127307772636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555127715226263e-05, + "grad_norm": 4.287053108215332, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.861021876335144, + "num_tokens": 820047944.0, + "step": 21493 + }, + { + "epoch": 2.7342577280244242, + "ewc_loss": 0.00859651155769825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.596511179348454e-05, + "grad_norm": 4.308017730712891, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8730165958404541, + "num_tokens": 820083416.0, + "step": 21494 + }, + { + "epoch": 2.7343849383030148, + "ewc_loss": 0.008620142936706543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62014276208356e-05, + "grad_norm": 4.260197639465332, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8753879070281982, + "num_tokens": 820122476.0, + "step": 21495 + }, + { + "epoch": 2.7345121485816053, + "ewc_loss": 0.008580553345382214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580553549109027e-05, + "grad_norm": 4.2245941162109375, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8893799781799316, + "num_tokens": 820159866.0, + "step": 21496 + }, + { + "epoch": 2.734639358860196, + "ewc_loss": 0.008594066835939884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594066457590088e-05, + "grad_norm": 4.24385929107666, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8846958875656128, + "num_tokens": 820202710.0, + "step": 21497 + }, + { + "epoch": 2.7347665691387864, + "ewc_loss": 0.008614983409643173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614983380539343e-05, + "grad_norm": 4.3311991691589355, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8820511102676392, + "num_tokens": 820239156.0, + "step": 21498 + }, 
+ { + "epoch": 2.734893779417377, + "ewc_loss": 0.008642531931400299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.642532338853925e-05, + "grad_norm": 4.233674049377441, + "learning_rate": 1e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.9025453329086304, + "num_tokens": 820274010.0, + "step": 21499 + }, + { + "epoch": 2.7350209896959674, + "ewc_loss": 0.008574286475777626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57428676681593e-05, + "grad_norm": 4.210351467132568, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.880932092666626, + "num_tokens": 820319171.0, + "step": 21500 + }, + { + "epoch": 2.735148199974558, + "ewc_loss": 0.008575649000704288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575648826081306e-05, + "grad_norm": 4.305882453918457, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8699383735656738, + "num_tokens": 820355038.0, + "step": 21501 + }, + { + "epoch": 2.7352754102531485, + "ewc_loss": 0.008641291409730911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.641291788080707e-05, + "grad_norm": 4.301949977874756, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8807581067085266, + "num_tokens": 820390973.0, + "step": 21502 + }, + { + "epoch": 2.735402620531739, + "ewc_loss": 0.00859201792627573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592017547925934e-05, + "grad_norm": 4.2816643714904785, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8795832991600037, + "num_tokens": 820424543.0, + "step": 21503 + }, + { + "epoch": 2.7355298308103295, + "ewc_loss": 0.008616763167083263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616763079771772e-05, + "grad_norm": 4.280174732208252, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8816969394683838, + "num_tokens": 820462702.0, + "step": 21504 + }, + { + "epoch": 2.73565704108892, + "ewc_loss": 0.008610015735030174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.610015356680378e-05, + "grad_norm": 4.259329319000244, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8914246559143066, + "num_tokens": 820498840.0, + "step": 21505 + }, + { + "epoch": 2.7357842513675106, + "ewc_loss": 0.008608216419816017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.608216739958152e-05, + "grad_norm": 4.331588268280029, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8763054013252258, + "num_tokens": 820534083.0, + "step": 21506 + }, + { + "epoch": 2.735911461646101, + "ewc_loss": 0.008646407164633274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.646406786283478e-05, + "grad_norm": 4.25920295715332, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.887353777885437, + "num_tokens": 820572479.0, + "step": 21507 + }, + { + "epoch": 2.7360386719246916, + "ewc_loss": 0.008579747751355171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579747373005375e-05, + "grad_norm": 4.253561496734619, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8727203011512756, + "num_tokens": 820610283.0, + "step": 21508 + }, + { + "epoch": 2.736165882203282, + "ewc_loss": 0.008593950420618057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.593950042268261e-05, + "grad_norm": 4.253679275512695, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8977375626564026, + "num_tokens": 820647580.0, + "step": 21509 + }, + { + "epoch": 2.7362930924818727, + "ewc_loss": 0.008600803092122078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600803266745061e-05, + "grad_norm": 4.220885753631592, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8880589008331299, + "num_tokens": 820686774.0, + "step": 21510 + }, + { + "epoch": 2.7364203027604628, + "ewc_loss": 0.008601706475019455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601706213084981e-05, + "grad_norm": 4.240118026733398, + "learning_rate": 1e-06, + "loss": 0.3094, + 
"mean_token_accuracy": 0.8937146663665771, + "num_tokens": 820724635.0, + "step": 21511 + }, + { + "epoch": 2.7365475130390537, + "ewc_loss": 0.008624378591775894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624378824606538e-05, + "grad_norm": 4.247532844543457, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8906350135803223, + "num_tokens": 820761467.0, + "step": 21512 + }, + { + "epoch": 2.736674723317644, + "ewc_loss": 0.008617956191301346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.617956336820498e-05, + "grad_norm": 4.247129440307617, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8729555606842041, + "num_tokens": 820801001.0, + "step": 21513 + }, + { + "epoch": 2.736801933596235, + "ewc_loss": 0.008613546378910542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613546378910542e-05, + "grad_norm": 4.281595706939697, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8790196180343628, + "num_tokens": 820841028.0, + "step": 21514 + }, + { + "epoch": 2.736929143874825, + "ewc_loss": 0.008623815141618252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623814937891439e-05, + "grad_norm": 4.257637023925781, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8917120695114136, + "num_tokens": 820881613.0, + "step": 21515 + }, + { + "epoch": 2.737056354153416, + "ewc_loss": 0.008590535260736942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590535435359925e-05, + "grad_norm": 4.275358200073242, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.888321042060852, + "num_tokens": 820918205.0, + "step": 21516 + }, + { + "epoch": 2.737183564432006, + "ewc_loss": 0.008602072484791279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602072193752974e-05, + "grad_norm": 4.318413257598877, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.877658486366272, + "num_tokens": 820950751.0, + "step": 21517 + }, + { + "epoch": 
2.7373107747105965, + "ewc_loss": 0.008598879911005497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59887950355187e-05, + "grad_norm": 4.250688552856445, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8856068849563599, + "num_tokens": 820987385.0, + "step": 21518 + }, + { + "epoch": 2.737437984989187, + "ewc_loss": 0.008561533875763416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.561534195905551e-05, + "grad_norm": 4.255707263946533, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8761208653450012, + "num_tokens": 821026273.0, + "step": 21519 + }, + { + "epoch": 2.7375651952677775, + "ewc_loss": 0.008579728193581104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579728455515578e-05, + "grad_norm": 4.206140518188477, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8771555423736572, + "num_tokens": 821067978.0, + "step": 21520 + }, + { + "epoch": 2.737692405546368, + "ewc_loss": 0.008552026003599167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552025974495336e-05, + "grad_norm": 4.2353925704956055, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8898926973342896, + "num_tokens": 821111925.0, + "step": 21521 + }, + { + "epoch": 2.7378196158249586, + "ewc_loss": 0.008566765114665031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566764881834388e-05, + "grad_norm": 4.2414116859436035, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8867736458778381, + "num_tokens": 821151019.0, + "step": 21522 + }, + { + "epoch": 2.737946826103549, + "ewc_loss": 0.00857878103852272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578781125834212e-05, + "grad_norm": 4.262853145599365, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8899978399276733, + "num_tokens": 821185520.0, + "step": 21523 + }, + { + "epoch": 2.7380740363821396, + "ewc_loss": 0.008580831810832024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.580831490689889e-05, + "grad_norm": 4.278561592102051, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8862476348876953, + "num_tokens": 821224273.0, + "step": 21524 + }, + { + "epoch": 2.73820124666073, + "ewc_loss": 0.008570965379476547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570965292165056e-05, + "grad_norm": 4.248676776885986, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8883674740791321, + "num_tokens": 821263541.0, + "step": 21525 + }, + { + "epoch": 2.7383284569393207, + "ewc_loss": 0.008556065149605274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55606485856697e-05, + "grad_norm": 4.264050006866455, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.872506856918335, + "num_tokens": 821304053.0, + "step": 21526 + }, + { + "epoch": 2.7384556672179112, + "ewc_loss": 0.008564967662096024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.564967720303684e-05, + "grad_norm": 4.293087005615234, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8831278085708618, + "num_tokens": 821339710.0, + "step": 21527 + }, + { + "epoch": 2.7385828774965018, + "ewc_loss": 0.008569072000682354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569072087993845e-05, + "grad_norm": 4.217494964599609, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8837903738021851, + "num_tokens": 821378377.0, + "step": 21528 + }, + { + "epoch": 2.7387100877750923, + "ewc_loss": 0.008508040569722652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.508040627930313e-05, + "grad_norm": 4.266702651977539, + "learning_rate": 1e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8941521644592285, + "num_tokens": 821417434.0, + "step": 21529 + }, + { + "epoch": 2.738837298053683, + "ewc_loss": 0.008569767698645592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569767669541761e-05, + "grad_norm": 4.187302589416504, + "learning_rate": 1e-06, + "loss": 0.347, + 
"mean_token_accuracy": 0.8803698420524597, + "num_tokens": 821464323.0, + "step": 21530 + }, + { + "epoch": 2.7389645083322733, + "ewc_loss": 0.008522823452949524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52282319101505e-05, + "grad_norm": 4.207225322723389, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8808306455612183, + "num_tokens": 821509891.0, + "step": 21531 + }, + { + "epoch": 2.739091718610864, + "ewc_loss": 0.008514869958162308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514869841746986e-05, + "grad_norm": 4.2135467529296875, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8725168704986572, + "num_tokens": 821557309.0, + "step": 21532 + }, + { + "epoch": 2.7392189288894544, + "ewc_loss": 0.00853009708225727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530096965841949e-05, + "grad_norm": 4.2777252197265625, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.87959885597229, + "num_tokens": 821597751.0, + "step": 21533 + }, + { + "epoch": 2.739346139168045, + "ewc_loss": 0.0085451016202569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545101445633918e-05, + "grad_norm": 4.221733093261719, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8935317993164062, + "num_tokens": 821637961.0, + "step": 21534 + }, + { + "epoch": 2.7394733494466355, + "ewc_loss": 0.008500300347805023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500300464220345e-05, + "grad_norm": 4.2426862716674805, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8673268556594849, + "num_tokens": 821684728.0, + "step": 21535 + }, + { + "epoch": 2.7396005597252255, + "ewc_loss": 0.008526302874088287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.526302553946152e-05, + "grad_norm": 4.354318141937256, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8885980844497681, + "num_tokens": 821715359.0, + "step": 21536 + }, + { + "epoch": 
2.7397277700038165, + "ewc_loss": 0.008558390662074089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558390982216224e-05, + "grad_norm": 4.2540178298950195, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8771345615386963, + "num_tokens": 821755540.0, + "step": 21537 + }, + { + "epoch": 2.7398549802824066, + "ewc_loss": 0.008467724546790123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467724546790123e-05, + "grad_norm": 4.2785773277282715, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8838062286376953, + "num_tokens": 821788506.0, + "step": 21538 + }, + { + "epoch": 2.7399821905609976, + "ewc_loss": 0.008533723652362823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533723303116858e-05, + "grad_norm": 4.26698112487793, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8838180899620056, + "num_tokens": 821826555.0, + "step": 21539 + }, + { + "epoch": 2.7401094008395877, + "ewc_loss": 0.008499604649841785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499604882672429e-05, + "grad_norm": 4.298820972442627, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.872758150100708, + "num_tokens": 821866160.0, + "step": 21540 + }, + { + "epoch": 2.7402366111181786, + "ewc_loss": 0.008540176786482334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540177077520639e-05, + "grad_norm": 4.254537582397461, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8898965120315552, + "num_tokens": 821905595.0, + "step": 21541 + }, + { + "epoch": 2.7403638213967687, + "ewc_loss": 0.008499558083713055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.499558316543698e-05, + "grad_norm": 4.2698140144348145, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8750446438789368, + "num_tokens": 821944769.0, + "step": 21542 + }, + { + "epoch": 2.7404910316753592, + "ewc_loss": 0.00853682029992342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.536819950677454e-05, + "grad_norm": 4.223957538604736, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8896031975746155, + "num_tokens": 821988889.0, + "step": 21543 + }, + { + "epoch": 2.7406182419539498, + "ewc_loss": 0.008494018577039242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.494018402416259e-05, + "grad_norm": 4.26837158203125, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8860138654708862, + "num_tokens": 822025343.0, + "step": 21544 + }, + { + "epoch": 2.7407454522325403, + "ewc_loss": 0.008546888828277588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546888420823961e-05, + "grad_norm": 4.28870153427124, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8717994689941406, + "num_tokens": 822067651.0, + "step": 21545 + }, + { + "epoch": 2.740872662511131, + "ewc_loss": 0.008523373864591122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523373981006444e-05, + "grad_norm": 4.309751987457275, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8729147911071777, + "num_tokens": 822102885.0, + "step": 21546 + }, + { + "epoch": 2.7409998727897213, + "ewc_loss": 0.008549121208488941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549121412215754e-05, + "grad_norm": 4.2504801750183105, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8765487670898438, + "num_tokens": 822142515.0, + "step": 21547 + }, + { + "epoch": 2.741127083068312, + "ewc_loss": 0.00849093310534954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.490933396387845e-05, + "grad_norm": 4.243094444274902, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8807666301727295, + "num_tokens": 822180745.0, + "step": 21548 + }, + { + "epoch": 2.7412542933469024, + "ewc_loss": 0.008529072627425194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529072511009872e-05, + "grad_norm": 4.2778849601745605, + "learning_rate": 1e-06, + "loss": 0.3132, + 
"mean_token_accuracy": 0.8906459212303162, + "num_tokens": 822215117.0, + "step": 21549 + }, + { + "epoch": 2.741381503625493, + "ewc_loss": 0.00855649821460247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.556497778045014e-05, + "grad_norm": 4.268978595733643, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8739324808120728, + "num_tokens": 822258883.0, + "step": 21550 + }, + { + "epoch": 2.7415087139040835, + "ewc_loss": 0.008534972555935383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.534972585039213e-05, + "grad_norm": 4.269001483917236, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8820503354072571, + "num_tokens": 822295254.0, + "step": 21551 + }, + { + "epoch": 2.741635924182674, + "ewc_loss": 0.008552341721951962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552341751055792e-05, + "grad_norm": 4.249570846557617, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8853387236595154, + "num_tokens": 822335769.0, + "step": 21552 + }, + { + "epoch": 2.7417631344612645, + "ewc_loss": 0.00853827316313982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538272959413007e-05, + "grad_norm": 4.241296291351318, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8930375576019287, + "num_tokens": 822376873.0, + "step": 21553 + }, + { + "epoch": 2.741890344739855, + "ewc_loss": 0.008544690907001495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544691081624478e-05, + "grad_norm": 4.218950271606445, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8825520277023315, + "num_tokens": 822419868.0, + "step": 21554 + }, + { + "epoch": 2.7420175550184456, + "ewc_loss": 0.008524436503648758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524436270818114e-05, + "grad_norm": 4.261899948120117, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8814495801925659, + "num_tokens": 822456757.0, + "step": 21555 + }, + { + "epoch": 
2.742144765297036, + "ewc_loss": 0.008549446240067482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549446647521108e-05, + "grad_norm": 4.247415065765381, + "learning_rate": 1e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8967680931091309, + "num_tokens": 822492772.0, + "step": 21556 + }, + { + "epoch": 2.7422719755756266, + "ewc_loss": 0.008537647314369678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537647227058187e-05, + "grad_norm": 4.278820037841797, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8671818971633911, + "num_tokens": 822535400.0, + "step": 21557 + }, + { + "epoch": 2.742399185854217, + "ewc_loss": 0.008545172400772572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545172022422776e-05, + "grad_norm": 4.316671848297119, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8876630067825317, + "num_tokens": 822568313.0, + "step": 21558 + }, + { + "epoch": 2.7425263961328077, + "ewc_loss": 0.008560813032090664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560813148505986e-05, + "grad_norm": 4.231314659118652, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8899810314178467, + "num_tokens": 822608487.0, + "step": 21559 + }, + { + "epoch": 2.742653606411398, + "ewc_loss": 0.008505472913384438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.505472942488268e-05, + "grad_norm": 4.258487701416016, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8842201232910156, + "num_tokens": 822647938.0, + "step": 21560 + }, + { + "epoch": 2.7427808166899883, + "ewc_loss": 0.008558626286685467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558625995647162e-05, + "grad_norm": 4.281618595123291, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8784393668174744, + "num_tokens": 822685974.0, + "step": 21561 + }, + { + "epoch": 2.7429080269685793, + "ewc_loss": 0.008549187332391739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.549186895834282e-05, + "grad_norm": 4.329782962799072, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8850928544998169, + "num_tokens": 822719166.0, + "step": 21562 + }, + { + "epoch": 2.7430352372471694, + "ewc_loss": 0.00857116375118494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.571163925807923e-05, + "grad_norm": 4.256353855133057, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8721996545791626, + "num_tokens": 822758364.0, + "step": 21563 + }, + { + "epoch": 2.7431624475257603, + "ewc_loss": 0.008504869416356087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.504869765602052e-05, + "grad_norm": 4.243721008300781, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8903535604476929, + "num_tokens": 822798002.0, + "step": 21564 + }, + { + "epoch": 2.7432896578043504, + "ewc_loss": 0.008539213798940182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539213740732521e-05, + "grad_norm": 4.272611618041992, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.885212242603302, + "num_tokens": 822837958.0, + "step": 21565 + }, + { + "epoch": 2.743416868082941, + "ewc_loss": 0.008559995330870152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559995330870152e-05, + "grad_norm": 4.301219463348389, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8835745453834534, + "num_tokens": 822873076.0, + "step": 21566 + }, + { + "epoch": 2.7435440783615315, + "ewc_loss": 0.008541950955986977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541950955986977e-05, + "grad_norm": 4.307590007781982, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.88367760181427, + "num_tokens": 822904693.0, + "step": 21567 + }, + { + "epoch": 2.743671288640122, + "ewc_loss": 0.00856150034815073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.561499998904765e-05, + "grad_norm": 4.294304370880127, + "learning_rate": 1e-06, + "loss": 0.3438, + 
"mean_token_accuracy": 0.8789160251617432, + "num_tokens": 822941192.0, + "step": 21568 + }, + { + "epoch": 2.7437984989187125, + "ewc_loss": 0.008549925871193409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549926133127883e-05, + "grad_norm": 4.268396377563477, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.872011661529541, + "num_tokens": 822982166.0, + "step": 21569 + }, + { + "epoch": 2.743925709197303, + "ewc_loss": 0.008552084676921368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552084909752011e-05, + "grad_norm": 4.362793445587158, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8766177892684937, + "num_tokens": 823021630.0, + "step": 21570 + }, + { + "epoch": 2.7440529194758936, + "ewc_loss": 0.008609727956354618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609727956354618e-05, + "grad_norm": 4.262551784515381, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8816858530044556, + "num_tokens": 823062133.0, + "step": 21571 + }, + { + "epoch": 2.744180129754484, + "ewc_loss": 0.008511331863701344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51133227115497e-05, + "grad_norm": 4.231777667999268, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8810175657272339, + "num_tokens": 823105163.0, + "step": 21572 + }, + { + "epoch": 2.7443073400330746, + "ewc_loss": 0.00854763388633728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547633478883654e-05, + "grad_norm": 4.270328998565674, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.864838719367981, + "num_tokens": 823146878.0, + "step": 21573 + }, + { + "epoch": 2.744434550311665, + "ewc_loss": 0.008560066111385822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560065907659009e-05, + "grad_norm": 4.274035930633545, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.8996260762214661, + "num_tokens": 823182216.0, + "step": 21574 + }, + { + "epoch": 
2.7445617605902557, + "ewc_loss": 0.008544452488422394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544452430214733e-05, + "grad_norm": 4.1948161125183105, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8887906074523926, + "num_tokens": 823227838.0, + "step": 21575 + }, + { + "epoch": 2.7446889708688462, + "ewc_loss": 0.008497060276567936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.497060480294749e-05, + "grad_norm": 4.262614727020264, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8764318227767944, + "num_tokens": 823268609.0, + "step": 21576 + }, + { + "epoch": 2.7448161811474368, + "ewc_loss": 0.008580280467867851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580280700698495e-05, + "grad_norm": 4.24105978012085, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8916842341423035, + "num_tokens": 823309426.0, + "step": 21577 + }, + { + "epoch": 2.7449433914260273, + "ewc_loss": 0.008542251773178577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542252180632204e-05, + "grad_norm": 4.331801891326904, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8781376481056213, + "num_tokens": 823346049.0, + "step": 21578 + }, + { + "epoch": 2.745070601704618, + "ewc_loss": 0.008575226180255413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575226092943922e-05, + "grad_norm": 4.318652629852295, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8761451840400696, + "num_tokens": 823378915.0, + "step": 21579 + }, + { + "epoch": 2.7451978119832083, + "ewc_loss": 0.00853787362575531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537873509339988e-05, + "grad_norm": 4.232016563415527, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.872666597366333, + "num_tokens": 823422985.0, + "step": 21580 + }, + { + "epoch": 2.745325022261799, + "ewc_loss": 0.008496311493217945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.496311056660488e-05, + "grad_norm": 4.276981830596924, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8721264600753784, + "num_tokens": 823460703.0, + "step": 21581 + }, + { + "epoch": 2.7454522325403894, + "ewc_loss": 0.008560499176383018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560498827137053e-05, + "grad_norm": 4.24540901184082, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8866688013076782, + "num_tokens": 823499651.0, + "step": 21582 + }, + { + "epoch": 2.74557944281898, + "ewc_loss": 0.008534613065421581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.534613152733073e-05, + "grad_norm": 4.254944801330566, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8824137449264526, + "num_tokens": 823537044.0, + "step": 21583 + }, + { + "epoch": 2.7457066530975704, + "ewc_loss": 0.008536591194570065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536591485608369e-05, + "grad_norm": 4.2496562004089355, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8935269713401794, + "num_tokens": 823571832.0, + "step": 21584 + }, + { + "epoch": 2.745833863376161, + "ewc_loss": 0.00855389703065157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553896623197943e-05, + "grad_norm": 4.255535125732422, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8902490735054016, + "num_tokens": 823609852.0, + "step": 21585 + }, + { + "epoch": 2.745961073654751, + "ewc_loss": 0.008559665642678738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559665729990229e-05, + "grad_norm": 4.292844295501709, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8879473805427551, + "num_tokens": 823644419.0, + "step": 21586 + }, + { + "epoch": 2.746088283933342, + "ewc_loss": 0.008574575185775757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574575622333214e-05, + "grad_norm": 4.243557929992676, + "learning_rate": 1e-06, + "loss": 0.3245, + 
"mean_token_accuracy": 0.8895018100738525, + "num_tokens": 823682451.0, + "step": 21587 + }, + { + "epoch": 2.746215494211932, + "ewc_loss": 0.00853635836392641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536357927368954e-05, + "grad_norm": 4.215399742126465, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8906698226928711, + "num_tokens": 823723051.0, + "step": 21588 + }, + { + "epoch": 2.746342704490523, + "ewc_loss": 0.008553897961974144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553898078389466e-05, + "grad_norm": 4.292141914367676, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8809038400650024, + "num_tokens": 823761353.0, + "step": 21589 + }, + { + "epoch": 2.746469914769113, + "ewc_loss": 0.008605385199189186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.605384937254712e-05, + "grad_norm": 4.27949333190918, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8686156868934631, + "num_tokens": 823800281.0, + "step": 21590 + }, + { + "epoch": 2.7465971250477037, + "ewc_loss": 0.008548285812139511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548286132281646e-05, + "grad_norm": 4.176041603088379, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8963629603385925, + "num_tokens": 823843255.0, + "step": 21591 + }, + { + "epoch": 2.7467243353262942, + "ewc_loss": 0.008493756875395775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.493756467942148e-05, + "grad_norm": 4.233348846435547, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8797392845153809, + "num_tokens": 823880736.0, + "step": 21592 + }, + { + "epoch": 2.7468515456048848, + "ewc_loss": 0.008585415780544281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585416071582586e-05, + "grad_norm": 4.2430033683776855, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.881292462348938, + "num_tokens": 823923232.0, + "step": 21593 + }, + { + "epoch": 
2.7469787558834753, + "ewc_loss": 0.008553513325750828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553513180231676e-05, + "grad_norm": 4.268789291381836, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8853967785835266, + "num_tokens": 823957488.0, + "step": 21594 + }, + { + "epoch": 2.747105966162066, + "ewc_loss": 0.008577732369303703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577731932746246e-05, + "grad_norm": 4.229218006134033, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8787575960159302, + "num_tokens": 823999568.0, + "step": 21595 + }, + { + "epoch": 2.7472331764406563, + "ewc_loss": 0.008545750752091408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545750461053103e-05, + "grad_norm": 4.233437538146973, + "learning_rate": 1e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8932288885116577, + "num_tokens": 824036947.0, + "step": 21596 + }, + { + "epoch": 2.747360386719247, + "ewc_loss": 0.008567800745368004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.567800978198647e-05, + "grad_norm": 4.306034564971924, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8962960839271545, + "num_tokens": 824067966.0, + "step": 21597 + }, + { + "epoch": 2.7474875969978374, + "ewc_loss": 0.008591991849243641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.591992082074285e-05, + "grad_norm": 4.206299304962158, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8818631768226624, + "num_tokens": 824112216.0, + "step": 21598 + }, + { + "epoch": 2.747614807276428, + "ewc_loss": 0.008521788753569126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521788549842313e-05, + "grad_norm": 4.301359176635742, + "learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8939740061759949, + "num_tokens": 824148749.0, + "step": 21599 + }, + { + "epoch": 2.7477420175550185, + "ewc_loss": 0.008609753102064133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.609753422206268e-05, + "grad_norm": 4.299736022949219, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8821194171905518, + "num_tokens": 824186685.0, + "step": 21600 + }, + { + "epoch": 2.747869227833609, + "ewc_loss": 0.008575482293963432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575482206651941e-05, + "grad_norm": 4.2668046951293945, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8780293464660645, + "num_tokens": 824223716.0, + "step": 21601 + }, + { + "epoch": 2.7479964381121995, + "ewc_loss": 0.008544334210455418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544334559701383e-05, + "grad_norm": 4.250103950500488, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8813170194625854, + "num_tokens": 824264627.0, + "step": 21602 + }, + { + "epoch": 2.74812364839079, + "ewc_loss": 0.00855365302413702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553652878617868e-05, + "grad_norm": 4.288324356079102, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8833376169204712, + "num_tokens": 824301901.0, + "step": 21603 + }, + { + "epoch": 2.7482508586693806, + "ewc_loss": 0.00858720950782299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.587209595134482e-05, + "grad_norm": 4.257185459136963, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8869976997375488, + "num_tokens": 824344621.0, + "step": 21604 + }, + { + "epoch": 2.748378068947971, + "ewc_loss": 0.008527635596692562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.527635509381071e-05, + "grad_norm": 4.207614421844482, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8873649835586548, + "num_tokens": 824385850.0, + "step": 21605 + }, + { + "epoch": 2.7485052792265616, + "ewc_loss": 0.008531145751476288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531145431334153e-05, + "grad_norm": 4.2070631980896, + "learning_rate": 1e-06, + "loss": 0.2567, + 
"mean_token_accuracy": 0.9090070128440857, + "num_tokens": 824425545.0, + "step": 21606 + }, + { + "epoch": 2.748632489505152, + "ewc_loss": 0.008540403097867966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540402632206678e-05, + "grad_norm": 4.210566520690918, + "learning_rate": 1e-06, + "loss": 0.289, + "mean_token_accuracy": 0.8987149000167847, + "num_tokens": 824468611.0, + "step": 21607 + }, + { + "epoch": 2.7487596997837427, + "ewc_loss": 0.008533204905688763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533204527338967e-05, + "grad_norm": 4.260504245758057, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8896344304084778, + "num_tokens": 824512161.0, + "step": 21608 + }, + { + "epoch": 2.7488869100623328, + "ewc_loss": 0.008529995568096638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529995830031112e-05, + "grad_norm": 4.17711067199707, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8958559036254883, + "num_tokens": 824555502.0, + "step": 21609 + }, + { + "epoch": 2.7490141203409237, + "ewc_loss": 0.008479122072458267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.47912160679698e-05, + "grad_norm": 4.303216457366943, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8777450323104858, + "num_tokens": 824596277.0, + "step": 21610 + }, + { + "epoch": 2.749141330619514, + "ewc_loss": 0.00855239387601614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552394137950614e-05, + "grad_norm": 4.299952507019043, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.872665524482727, + "num_tokens": 824633704.0, + "step": 21611 + }, + { + "epoch": 2.749268540898105, + "ewc_loss": 0.008495689369738102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.495688962284476e-05, + "grad_norm": 4.245419502258301, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8878173828125, + "num_tokens": 824673584.0, + "step": 21612 + }, + { + "epoch": 2.749395751176695, 
+ "ewc_loss": 0.008479289710521698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.479289681417868e-05, + "grad_norm": 4.236510276794434, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8787232637405396, + "num_tokens": 824712508.0, + "step": 21613 + }, + { + "epoch": 2.749522961455286, + "ewc_loss": 0.008498995564877987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49899515742436e-05, + "grad_norm": 4.236588954925537, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8946149349212646, + "num_tokens": 824754353.0, + "step": 21614 + }, + { + "epoch": 2.749650171733876, + "ewc_loss": 0.008463609032332897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.463609265163541e-05, + "grad_norm": 4.2502241134643555, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8645062446594238, + "num_tokens": 824796333.0, + "step": 21615 + }, + { + "epoch": 2.7497773820124665, + "ewc_loss": 0.008500173687934875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.500173862557858e-05, + "grad_norm": 4.304920673370361, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8791229724884033, + "num_tokens": 824832048.0, + "step": 21616 + }, + { + "epoch": 2.749904592291057, + "ewc_loss": 0.008505336008965969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.505336154485121e-05, + "grad_norm": 4.341621398925781, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8782563209533691, + "num_tokens": 824871889.0, + "step": 21617 + }, + { + "epoch": 2.7500318025696475, + "ewc_loss": 0.008491000160574913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491000335197896e-05, + "grad_norm": 4.223465442657471, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8931005597114563, + "num_tokens": 824912547.0, + "step": 21618 + }, + { + "epoch": 2.750159012848238, + "ewc_loss": 0.008435818366706371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.435818745056167e-05, + 
"grad_norm": 4.296660900115967, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8654360175132751, + "num_tokens": 824955626.0, + "step": 21619 + }, + { + "epoch": 2.7502862231268286, + "ewc_loss": 0.008513241074979305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51324075483717e-05, + "grad_norm": 4.225964546203613, + "learning_rate": 1e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8916969895362854, + "num_tokens": 824991813.0, + "step": 21620 + }, + { + "epoch": 2.750413433405419, + "ewc_loss": 0.008457082323729992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.457082731183618e-05, + "grad_norm": 4.30209493637085, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8932186365127563, + "num_tokens": 825024356.0, + "step": 21621 + }, + { + "epoch": 2.7505406436840096, + "ewc_loss": 0.008528540842235088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528540638508275e-05, + "grad_norm": 4.320199012756348, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8801122307777405, + "num_tokens": 825060316.0, + "step": 21622 + }, + { + "epoch": 2.7506678539626, + "ewc_loss": 0.0085135567933321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.513556531397626e-05, + "grad_norm": 4.287623405456543, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8765644431114197, + "num_tokens": 825096044.0, + "step": 21623 + }, + { + "epoch": 2.7507950642411907, + "ewc_loss": 0.008506787940859795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.506787708029151e-05, + "grad_norm": 4.361445426940918, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8824992179870605, + "num_tokens": 825123999.0, + "step": 21624 + }, + { + "epoch": 2.750922274519781, + "ewc_loss": 0.008558436296880245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558436093153432e-05, + "grad_norm": 4.213633060455322, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8737281560897827, + 
"num_tokens": 825170155.0, + "step": 21625 + }, + { + "epoch": 2.7510494847983717, + "ewc_loss": 0.008482923731207848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.482924022246152e-05, + "grad_norm": 4.235795021057129, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8830101490020752, + "num_tokens": 825213388.0, + "step": 21626 + }, + { + "epoch": 2.7511766950769623, + "ewc_loss": 0.008559568785130978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55956895975396e-05, + "grad_norm": 4.320742130279541, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8902634382247925, + "num_tokens": 825247171.0, + "step": 21627 + }, + { + "epoch": 2.751303905355553, + "ewc_loss": 0.008570409379899502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570409409003332e-05, + "grad_norm": 4.277725696563721, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.884220540523529, + "num_tokens": 825283271.0, + "step": 21628 + }, + { + "epoch": 2.7514311156341433, + "ewc_loss": 0.008541878312826157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541878196410835e-05, + "grad_norm": 4.285146713256836, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8824036121368408, + "num_tokens": 825318976.0, + "step": 21629 + }, + { + "epoch": 2.751558325912734, + "ewc_loss": 0.008571567945182323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57156774145551e-05, + "grad_norm": 4.225521087646484, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8832368850708008, + "num_tokens": 825358019.0, + "step": 21630 + }, + { + "epoch": 2.7516855361913244, + "ewc_loss": 0.008520938456058502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520938717992976e-05, + "grad_norm": 4.245922565460205, + "learning_rate": 1e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8976247906684875, + "num_tokens": 825394025.0, + "step": 21631 + }, + { + "epoch": 2.751812746469915, + "ewc_loss": 0.008584903553128242, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.584903116570786e-05, + "grad_norm": 4.258504390716553, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8932045698165894, + "num_tokens": 825433021.0, + "step": 21632 + }, + { + "epoch": 2.7519399567485054, + "ewc_loss": 0.00857658963650465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57658960740082e-05, + "grad_norm": 4.217040061950684, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8849625587463379, + "num_tokens": 825477026.0, + "step": 21633 + }, + { + "epoch": 2.7520671670270955, + "ewc_loss": 0.008551114238798618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551114297006279e-05, + "grad_norm": 4.271291255950928, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8883603811264038, + "num_tokens": 825511054.0, + "step": 21634 + }, + { + "epoch": 2.7521943773056865, + "ewc_loss": 0.008609005250036716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60900545376353e-05, + "grad_norm": 4.275390148162842, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8695564270019531, + "num_tokens": 825549654.0, + "step": 21635 + }, + { + "epoch": 2.7523215875842766, + "ewc_loss": 0.008586100302636623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586100011598319e-05, + "grad_norm": 4.328221797943115, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8858249187469482, + "num_tokens": 825587269.0, + "step": 21636 + }, + { + "epoch": 2.7524487978628676, + "ewc_loss": 0.008614221587777138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614221587777138e-05, + "grad_norm": 4.242672443389893, + "learning_rate": 1e-06, + "loss": 0.2711, + "mean_token_accuracy": 0.9046200513839722, + "num_tokens": 825624864.0, + "step": 21637 + }, + { + "epoch": 2.7525760081414576, + "ewc_loss": 0.008545312099158764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545311720808968e-05, + "grad_norm": 4.237961292266846, + "learning_rate": 
1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8764100074768066, + "num_tokens": 825667025.0, + "step": 21638 + }, + { + "epoch": 2.7527032184200486, + "ewc_loss": 0.008587542921304703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.587542833993211e-05, + "grad_norm": 4.259639739990234, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8687410354614258, + "num_tokens": 825709505.0, + "step": 21639 + }, + { + "epoch": 2.7528304286986387, + "ewc_loss": 0.008569726720452309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569726924179122e-05, + "grad_norm": 4.269455432891846, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8861595392227173, + "num_tokens": 825747376.0, + "step": 21640 + }, + { + "epoch": 2.7529576389772292, + "ewc_loss": 0.008579195477068424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579195127822459e-05, + "grad_norm": 4.287490367889404, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8895310163497925, + "num_tokens": 825784817.0, + "step": 21641 + }, + { + "epoch": 2.7530848492558198, + "ewc_loss": 0.008588758297264576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58875791891478e-05, + "grad_norm": 4.31771183013916, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8824142217636108, + "num_tokens": 825815631.0, + "step": 21642 + }, + { + "epoch": 2.7532120595344103, + "ewc_loss": 0.008590631186962128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590630750404671e-05, + "grad_norm": 4.263176918029785, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8830037713050842, + "num_tokens": 825854202.0, + "step": 21643 + }, + { + "epoch": 2.753339269813001, + "ewc_loss": 0.008554957807064056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.554958185413852e-05, + "grad_norm": 4.278444290161133, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8916428089141846, + "num_tokens": 825890636.0, + "step": 21644 + }, 
+ { + "epoch": 2.7534664800915913, + "ewc_loss": 0.008601253852248192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601253648521379e-05, + "grad_norm": 4.288427829742432, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8616387844085693, + "num_tokens": 825931054.0, + "step": 21645 + }, + { + "epoch": 2.753593690370182, + "ewc_loss": 0.008602642454206944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602642628829926e-05, + "grad_norm": 4.249145030975342, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8805099725723267, + "num_tokens": 825972519.0, + "step": 21646 + }, + { + "epoch": 2.7537209006487724, + "ewc_loss": 0.008569274097681046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569273632019758e-05, + "grad_norm": 4.241152286529541, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8848813772201538, + "num_tokens": 826010010.0, + "step": 21647 + }, + { + "epoch": 2.753848110927363, + "ewc_loss": 0.008599824272096157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.599823922850192e-05, + "grad_norm": 4.269684791564941, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8811955451965332, + "num_tokens": 826048348.0, + "step": 21648 + }, + { + "epoch": 2.7539753212059535, + "ewc_loss": 0.008594549261033535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594548853579909e-05, + "grad_norm": 4.208962917327881, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8792791962623596, + "num_tokens": 826087807.0, + "step": 21649 + }, + { + "epoch": 2.754102531484544, + "ewc_loss": 0.008565968833863735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565968892071396e-05, + "grad_norm": 4.29727029800415, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8805144429206848, + "num_tokens": 826124657.0, + "step": 21650 + }, + { + "epoch": 2.7542297417631345, + "ewc_loss": 0.008619220927357674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.619220898253843e-05, + "grad_norm": 4.275299549102783, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8855958580970764, + "num_tokens": 826164190.0, + "step": 21651 + }, + { + "epoch": 2.754356952041725, + "ewc_loss": 0.008584820665419102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.584820898249745e-05, + "grad_norm": 4.2584991455078125, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8831146359443665, + "num_tokens": 826202876.0, + "step": 21652 + }, + { + "epoch": 2.7544841623203156, + "ewc_loss": 0.008561181835830212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.561182039557025e-05, + "grad_norm": 4.285776615142822, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8817564249038696, + "num_tokens": 826242955.0, + "step": 21653 + }, + { + "epoch": 2.754611372598906, + "ewc_loss": 0.008582118898630142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582118607591838e-05, + "grad_norm": 4.284804821014404, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8926073312759399, + "num_tokens": 826279925.0, + "step": 21654 + }, + { + "epoch": 2.7547385828774966, + "ewc_loss": 0.008559893816709518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559894195059314e-05, + "grad_norm": 4.280102729797363, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8921933174133301, + "num_tokens": 826314076.0, + "step": 21655 + }, + { + "epoch": 2.754865793156087, + "ewc_loss": 0.008557461202144623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.557461114833131e-05, + "grad_norm": 4.264754295349121, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8827401995658875, + "num_tokens": 826354138.0, + "step": 21656 + }, + { + "epoch": 2.7549930034346777, + "ewc_loss": 0.008553404361009598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553404040867463e-05, + "grad_norm": 4.269252777099609, + "learning_rate": 1e-06, + "loss": 0.3536, + 
"mean_token_accuracy": 0.8788502216339111, + "num_tokens": 826395393.0, + "step": 21657 + }, + { + "epoch": 2.755120213713268, + "ewc_loss": 0.008536156266927719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536156383343041e-05, + "grad_norm": 4.275017261505127, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8746078014373779, + "num_tokens": 826437539.0, + "step": 21658 + }, + { + "epoch": 2.7552474239918583, + "ewc_loss": 0.00854814238846302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548142795916647e-05, + "grad_norm": 4.291565418243408, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8791054487228394, + "num_tokens": 826475857.0, + "step": 21659 + }, + { + "epoch": 2.7553746342704493, + "ewc_loss": 0.008537267334759235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537267422070727e-05, + "grad_norm": 4.274533748626709, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8931094408035278, + "num_tokens": 826513956.0, + "step": 21660 + }, + { + "epoch": 2.7555018445490393, + "ewc_loss": 0.008534595370292664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.534594962839037e-05, + "grad_norm": 4.263017654418945, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8830384016036987, + "num_tokens": 826552747.0, + "step": 21661 + }, + { + "epoch": 2.7556290548276303, + "ewc_loss": 0.008536850102245808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536849782103673e-05, + "grad_norm": 4.238489151000977, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8900841474533081, + "num_tokens": 826590830.0, + "step": 21662 + }, + { + "epoch": 2.7557562651062204, + "ewc_loss": 0.008511951193213463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511951455147937e-05, + "grad_norm": 4.242904186248779, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8854444622993469, + "num_tokens": 826633414.0, + "step": 21663 + }, + { + "epoch": 
2.755883475384811, + "ewc_loss": 0.008531242609024048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531242929166183e-05, + "grad_norm": 4.312657356262207, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8923630118370056, + "num_tokens": 826668344.0, + "step": 21664 + }, + { + "epoch": 2.7560106856634015, + "ewc_loss": 0.008564040996134281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.564040763303638e-05, + "grad_norm": 4.297726154327393, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8676667213439941, + "num_tokens": 826710587.0, + "step": 21665 + }, + { + "epoch": 2.756137895941992, + "ewc_loss": 0.008512142114341259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512142085237429e-05, + "grad_norm": 4.267632007598877, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8749152421951294, + "num_tokens": 826748854.0, + "step": 21666 + }, + { + "epoch": 2.7562651062205825, + "ewc_loss": 0.008510456420481205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510456245858222e-05, + "grad_norm": 4.295022010803223, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8719253540039062, + "num_tokens": 826789591.0, + "step": 21667 + }, + { + "epoch": 2.756392316499173, + "ewc_loss": 0.008554823696613312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55482358019799e-05, + "grad_norm": 4.2560133934021, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.879893958568573, + "num_tokens": 826831890.0, + "step": 21668 + }, + { + "epoch": 2.7565195267777636, + "ewc_loss": 0.00851005781441927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.510057523380965e-05, + "grad_norm": 4.290513515472412, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8659299612045288, + "num_tokens": 826870265.0, + "step": 21669 + }, + { + "epoch": 2.756646737056354, + "ewc_loss": 0.008546221069991589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546221215510741e-05, 
+ "grad_norm": 4.283885955810547, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8823262453079224, + "num_tokens": 826910993.0, + "step": 21670 + }, + { + "epoch": 2.7567739473349446, + "ewc_loss": 0.008523459546267986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52345910971053e-05, + "grad_norm": 4.255082607269287, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8818734884262085, + "num_tokens": 826947492.0, + "step": 21671 + }, + { + "epoch": 2.756901157613535, + "ewc_loss": 0.0085285110399127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528510807082057e-05, + "grad_norm": 4.248427391052246, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.886696457862854, + "num_tokens": 826987215.0, + "step": 21672 + }, + { + "epoch": 2.7570283678921257, + "ewc_loss": 0.008541084825992584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541084389435127e-05, + "grad_norm": 4.262538433074951, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8804079294204712, + "num_tokens": 827024092.0, + "step": 21673 + }, + { + "epoch": 2.757155578170716, + "ewc_loss": 0.008535669185221195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535669621778652e-05, + "grad_norm": 4.314406394958496, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.867437481880188, + "num_tokens": 827060541.0, + "step": 21674 + }, + { + "epoch": 2.7572827884493067, + "ewc_loss": 0.008584141731262207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.584142051404342e-05, + "grad_norm": 4.235212326049805, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.881208062171936, + "num_tokens": 827101205.0, + "step": 21675 + }, + { + "epoch": 2.7574099987278973, + "ewc_loss": 0.008532426320016384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53242672747001e-05, + "grad_norm": 4.314606189727783, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8772352337837219, + 
"num_tokens": 827132441.0, + "step": 21676 + }, + { + "epoch": 2.757537209006488, + "ewc_loss": 0.008595823310315609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595823601353914e-05, + "grad_norm": 4.285272598266602, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8832354545593262, + "num_tokens": 827169530.0, + "step": 21677 + }, + { + "epoch": 2.7576644192850783, + "ewc_loss": 0.008566993288695812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566993346903473e-05, + "grad_norm": 4.285787582397461, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8890255689620972, + "num_tokens": 827207352.0, + "step": 21678 + }, + { + "epoch": 2.757791629563669, + "ewc_loss": 0.008565964177250862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565963798901066e-05, + "grad_norm": 4.281131267547607, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8780275583267212, + "num_tokens": 827242790.0, + "step": 21679 + }, + { + "epoch": 2.7579188398422594, + "ewc_loss": 0.00856997910887003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569978672312573e-05, + "grad_norm": 4.24927282333374, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8834054470062256, + "num_tokens": 827281676.0, + "step": 21680 + }, + { + "epoch": 2.75804605012085, + "ewc_loss": 0.008572464808821678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572464867029339e-05, + "grad_norm": 4.308034896850586, + "learning_rate": 1e-06, + "loss": 0.2902, + "mean_token_accuracy": 0.8952714800834656, + "num_tokens": 827313676.0, + "step": 21681 + }, + { + "epoch": 2.7581732603994404, + "ewc_loss": 0.008614677004516125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614677062723786e-05, + "grad_norm": 4.232021808624268, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8822051882743835, + "num_tokens": 827355017.0, + "step": 21682 + }, + { + "epoch": 2.758300470678031, + "ewc_loss": 0.00855542067438364, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555420936318114e-05, + "grad_norm": 4.2379374504089355, + "learning_rate": 1e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.8985601663589478, + "num_tokens": 827391773.0, + "step": 21683 + }, + { + "epoch": 2.758427680956621, + "ewc_loss": 0.008582403883337975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582403825130314e-05, + "grad_norm": 4.2000322341918945, + "learning_rate": 1e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.8975514769554138, + "num_tokens": 827433034.0, + "step": 21684 + }, + { + "epoch": 2.758554891235212, + "ewc_loss": 0.008560196496546268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560196147300303e-05, + "grad_norm": 4.234035015106201, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8916296362876892, + "num_tokens": 827475729.0, + "step": 21685 + }, + { + "epoch": 2.758682101513802, + "ewc_loss": 0.008600534871220589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600534783909097e-05, + "grad_norm": 4.2642903327941895, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8886526823043823, + "num_tokens": 827514127.0, + "step": 21686 + }, + { + "epoch": 2.758809311792393, + "ewc_loss": 0.008581213653087616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581213478464633e-05, + "grad_norm": 4.29373836517334, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8928048610687256, + "num_tokens": 827546709.0, + "step": 21687 + }, + { + "epoch": 2.758936522070983, + "ewc_loss": 0.008586560375988483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586560579715297e-05, + "grad_norm": 4.289445400238037, + "learning_rate": 1e-06, + "loss": 0.2778, + "mean_token_accuracy": 0.9013017416000366, + "num_tokens": 827581450.0, + "step": 21688 + }, + { + "epoch": 2.7590637323495737, + "ewc_loss": 0.008579197339713573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579197310609743e-05, + "grad_norm": 4.268713474273682, + 
"learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8787009119987488, + "num_tokens": 827623161.0, + "step": 21689 + }, + { + "epoch": 2.7591909426281642, + "ewc_loss": 0.008570559322834015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570559293730184e-05, + "grad_norm": 4.260860919952393, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8794718384742737, + "num_tokens": 827659460.0, + "step": 21690 + }, + { + "epoch": 2.7593181529067548, + "ewc_loss": 0.008558200672268867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558200352126732e-05, + "grad_norm": 4.279962539672852, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8777302503585815, + "num_tokens": 827701110.0, + "step": 21691 + }, + { + "epoch": 2.7594453631853453, + "ewc_loss": 0.008571463637053967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.571463695261627e-05, + "grad_norm": 4.342185020446777, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8810545206069946, + "num_tokens": 827738992.0, + "step": 21692 + }, + { + "epoch": 2.759572573463936, + "ewc_loss": 0.00859182607382536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.591826190240681e-05, + "grad_norm": 4.265913009643555, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8701941967010498, + "num_tokens": 827775853.0, + "step": 21693 + }, + { + "epoch": 2.7596997837425263, + "ewc_loss": 0.008537930436432362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537930261809379e-05, + "grad_norm": 4.235151290893555, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.884661853313446, + "num_tokens": 827819438.0, + "step": 21694 + }, + { + "epoch": 2.759826994021117, + "ewc_loss": 0.00853913277387619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539132977603003e-05, + "grad_norm": 4.237149715423584, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8789055347442627, + "num_tokens": 827858833.0, + 
"step": 21695 + }, + { + "epoch": 2.7599542042997074, + "ewc_loss": 0.008569897152483463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569897181587294e-05, + "grad_norm": 4.2858476638793945, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8773396611213684, + "num_tokens": 827895069.0, + "step": 21696 + }, + { + "epoch": 2.760081414578298, + "ewc_loss": 0.008583452552556992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583453018218279e-05, + "grad_norm": 4.218225955963135, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8965371251106262, + "num_tokens": 827932970.0, + "step": 21697 + }, + { + "epoch": 2.7602086248568884, + "ewc_loss": 0.008547693490982056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54769314173609e-05, + "grad_norm": 4.1954121589660645, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8932349681854248, + "num_tokens": 827981859.0, + "step": 21698 + }, + { + "epoch": 2.760335835135479, + "ewc_loss": 0.008529978804290295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529979095328599e-05, + "grad_norm": 4.260784149169922, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8764698505401611, + "num_tokens": 828022018.0, + "step": 21699 + }, + { + "epoch": 2.7604630454140695, + "ewc_loss": 0.008578550070524216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578550477977842e-05, + "grad_norm": 4.270485877990723, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8788361549377441, + "num_tokens": 828061546.0, + "step": 21700 + }, + { + "epoch": 2.76059025569266, + "ewc_loss": 0.008547438308596611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547438483219594e-05, + "grad_norm": 4.268434047698975, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8691123723983765, + "num_tokens": 828104464.0, + "step": 21701 + }, + { + "epoch": 2.7607174659712506, + "ewc_loss": 0.008538736961781979, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.538737165508792e-05, + "grad_norm": 4.305468559265137, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8739965558052063, + "num_tokens": 828142527.0, + "step": 21702 + }, + { + "epoch": 2.760844676249841, + "ewc_loss": 0.008559963665902615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55996404425241e-05, + "grad_norm": 4.2065019607543945, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8823485374450684, + "num_tokens": 828189670.0, + "step": 21703 + }, + { + "epoch": 2.7609718865284316, + "ewc_loss": 0.00848868303000927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.488682942697778e-05, + "grad_norm": 4.261098861694336, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8877729177474976, + "num_tokens": 828231511.0, + "step": 21704 + }, + { + "epoch": 2.761099096807022, + "ewc_loss": 0.008553343825042248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553343650419265e-05, + "grad_norm": 4.26927375793457, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8818617463111877, + "num_tokens": 828274387.0, + "step": 21705 + }, + { + "epoch": 2.7612263070856127, + "ewc_loss": 0.008512353524565697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51235308800824e-05, + "grad_norm": 4.290474891662598, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8766961693763733, + "num_tokens": 828312263.0, + "step": 21706 + }, + { + "epoch": 2.7613535173642028, + "ewc_loss": 0.008515318855643272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.515318768331781e-05, + "grad_norm": 4.297536849975586, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8826491832733154, + "num_tokens": 828345550.0, + "step": 21707 + }, + { + "epoch": 2.7614807276427937, + "ewc_loss": 0.008511790074408054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.511789928888902e-05, + "grad_norm": 4.318744659423828, + "learning_rate": 1e-06, + "loss": 
0.3712, + "mean_token_accuracy": 0.8717621564865112, + "num_tokens": 828378633.0, + "step": 21708 + }, + { + "epoch": 2.761607937921384, + "ewc_loss": 0.008521690964698792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.521691052010283e-05, + "grad_norm": 4.295173168182373, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.877589225769043, + "num_tokens": 828414292.0, + "step": 21709 + }, + { + "epoch": 2.761735148199975, + "ewc_loss": 0.008510141633450985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51014192448929e-05, + "grad_norm": 4.253854751586914, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8822025060653687, + "num_tokens": 828452682.0, + "step": 21710 + }, + { + "epoch": 2.761862358478565, + "ewc_loss": 0.008522344753146172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522344433004037e-05, + "grad_norm": 4.232982158660889, + "learning_rate": 1e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.8966825604438782, + "num_tokens": 828494162.0, + "step": 21711 + }, + { + "epoch": 2.761989568757156, + "ewc_loss": 0.008506303653120995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.506303856847808e-05, + "grad_norm": 4.322137832641602, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8865374326705933, + "num_tokens": 828525997.0, + "step": 21712 + }, + { + "epoch": 2.762116779035746, + "ewc_loss": 0.008577383123338223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577383414376527e-05, + "grad_norm": 4.260474681854248, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8812370300292969, + "num_tokens": 828562488.0, + "step": 21713 + }, + { + "epoch": 2.7622439893143365, + "ewc_loss": 0.008511609397828579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51160948514007e-05, + "grad_norm": 4.3019118309021, + "learning_rate": 1e-06, + "loss": 0.283, + "mean_token_accuracy": 0.9005681276321411, + "num_tokens": 828591818.0, + "step": 21714 + }, + { + "epoch": 
2.762371199592927, + "ewc_loss": 0.008573545143008232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573545346735045e-05, + "grad_norm": 4.2929887771606445, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8709636926651001, + "num_tokens": 828628872.0, + "step": 21715 + }, + { + "epoch": 2.7624984098715175, + "ewc_loss": 0.008567464537918568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.567464828956872e-05, + "grad_norm": 4.223888397216797, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8907982707023621, + "num_tokens": 828670939.0, + "step": 21716 + }, + { + "epoch": 2.762625620150108, + "ewc_loss": 0.008526488207280636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.526488090865314e-05, + "grad_norm": 4.322438716888428, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8677299618721008, + "num_tokens": 828708446.0, + "step": 21717 + }, + { + "epoch": 2.7627528304286986, + "ewc_loss": 0.008613931015133858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61393054947257e-05, + "grad_norm": 4.229742527008057, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8880949020385742, + "num_tokens": 828749873.0, + "step": 21718 + }, + { + "epoch": 2.762880040707289, + "ewc_loss": 0.008531827479600906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531827188562602e-05, + "grad_norm": 4.299885272979736, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.877180814743042, + "num_tokens": 828790066.0, + "step": 21719 + }, + { + "epoch": 2.7630072509858796, + "ewc_loss": 0.008608829230070114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.608829375589266e-05, + "grad_norm": 4.352560043334961, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8812546730041504, + "num_tokens": 828821911.0, + "step": 21720 + }, + { + "epoch": 2.76313446126447, + "ewc_loss": 0.008612250909209251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.612251258455217e-05, + "grad_norm": 4.293315410614014, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8764340877532959, + "num_tokens": 828856572.0, + "step": 21721 + }, + { + "epoch": 2.7632616715430607, + "ewc_loss": 0.00857543759047985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575437823310494e-05, + "grad_norm": 4.219625949859619, + "learning_rate": 1e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.8951114416122437, + "num_tokens": 828898865.0, + "step": 21722 + }, + { + "epoch": 2.763388881821651, + "ewc_loss": 0.008569220080971718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569219789933413e-05, + "grad_norm": 4.273927211761475, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8892898559570312, + "num_tokens": 828933768.0, + "step": 21723 + }, + { + "epoch": 2.7635160921002417, + "ewc_loss": 0.008636833168566227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636833081254736e-05, + "grad_norm": 4.24907922744751, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8977702856063843, + "num_tokens": 828968869.0, + "step": 21724 + }, + { + "epoch": 2.7636433023788323, + "ewc_loss": 0.008601086214184761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601086301496252e-05, + "grad_norm": 4.390800476074219, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8672056198120117, + "num_tokens": 829005621.0, + "step": 21725 + }, + { + "epoch": 2.763770512657423, + "ewc_loss": 0.00869053415954113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.690534014021978e-05, + "grad_norm": 4.248000621795654, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8733361959457397, + "num_tokens": 829045046.0, + "step": 21726 + }, + { + "epoch": 2.7638977229360133, + "ewc_loss": 0.008553066290915012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553066436434165e-05, + "grad_norm": 4.249861717224121, + "learning_rate": 1e-06, + "loss": 0.3087, + 
"mean_token_accuracy": 0.8958104848861694, + "num_tokens": 829084588.0, + "step": 21727 + }, + { + "epoch": 2.764024933214604, + "ewc_loss": 0.00862377230077982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623772737337276e-05, + "grad_norm": 4.309244632720947, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8672134280204773, + "num_tokens": 829121010.0, + "step": 21728 + }, + { + "epoch": 2.7641521434931944, + "ewc_loss": 0.008659123443067074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.659123705001548e-05, + "grad_norm": 4.271700382232666, + "learning_rate": 1e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8901894688606262, + "num_tokens": 829159618.0, + "step": 21729 + }, + { + "epoch": 2.764279353771785, + "ewc_loss": 0.008607969619333744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60796935739927e-05, + "grad_norm": 4.24149227142334, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8709217309951782, + "num_tokens": 829204826.0, + "step": 21730 + }, + { + "epoch": 2.7644065640503754, + "ewc_loss": 0.00861892756074667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618927677161992e-05, + "grad_norm": 4.277563571929932, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.8939893245697021, + "num_tokens": 829240741.0, + "step": 21731 + }, + { + "epoch": 2.7645337743289655, + "ewc_loss": 0.00863607320934534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636073471279815e-05, + "grad_norm": 4.254999160766602, + "learning_rate": 1e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.9012889266014099, + "num_tokens": 829278065.0, + "step": 21732 + }, + { + "epoch": 2.7646609846075565, + "ewc_loss": 0.008609423413872719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609423093730584e-05, + "grad_norm": 4.27780818939209, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8786848783493042, + "num_tokens": 829319438.0, + "step": 21733 + }, + { + "epoch": 
2.7647881948861466, + "ewc_loss": 0.008623335510492325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623335452284664e-05, + "grad_norm": 4.283968925476074, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8923922777175903, + "num_tokens": 829357299.0, + "step": 21734 + }, + { + "epoch": 2.7649154051647375, + "ewc_loss": 0.008602302521467209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602302841609344e-05, + "grad_norm": 4.247743129730225, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8840786218643188, + "num_tokens": 829398600.0, + "step": 21735 + }, + { + "epoch": 2.7650426154433276, + "ewc_loss": 0.008580449037253857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580448775319383e-05, + "grad_norm": 4.274483680725098, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8737964630126953, + "num_tokens": 829439214.0, + "step": 21736 + }, + { + "epoch": 2.7651698257219186, + "ewc_loss": 0.00859944149851799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.599441935075447e-05, + "grad_norm": 4.276165008544922, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8729917407035828, + "num_tokens": 829477634.0, + "step": 21737 + }, + { + "epoch": 2.7652970360005087, + "ewc_loss": 0.008592656813561916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592657104600221e-05, + "grad_norm": 4.240964412689209, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8883461952209473, + "num_tokens": 829514527.0, + "step": 21738 + }, + { + "epoch": 2.765424246279099, + "ewc_loss": 0.008570811711251736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570811769459397e-05, + "grad_norm": 4.23441219329834, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8851426243782043, + "num_tokens": 829558531.0, + "step": 21739 + }, + { + "epoch": 2.7655514565576897, + "ewc_loss": 0.008566437289118767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.56643746374175e-05, + "grad_norm": 4.239011764526367, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8862460255622864, + "num_tokens": 829600769.0, + "step": 21740 + }, + { + "epoch": 2.7656786668362803, + "ewc_loss": 0.008572295308113098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572295337216929e-05, + "grad_norm": 4.28412389755249, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8890749216079712, + "num_tokens": 829635938.0, + "step": 21741 + }, + { + "epoch": 2.765805877114871, + "ewc_loss": 0.008570082485675812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570082718506455e-05, + "grad_norm": 4.309203624725342, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8820691108703613, + "num_tokens": 829672956.0, + "step": 21742 + }, + { + "epoch": 2.7659330873934613, + "ewc_loss": 0.008563581854104996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.563581650378183e-05, + "grad_norm": 4.263705730438232, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8831793069839478, + "num_tokens": 829712658.0, + "step": 21743 + }, + { + "epoch": 2.766060297672052, + "ewc_loss": 0.008540759794414043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540759881725535e-05, + "grad_norm": 4.246973991394043, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8836799263954163, + "num_tokens": 829755429.0, + "step": 21744 + }, + { + "epoch": 2.7661875079506424, + "ewc_loss": 0.008529527112841606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529527258360758e-05, + "grad_norm": 4.259956359863281, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8826479911804199, + "num_tokens": 829796373.0, + "step": 21745 + }, + { + "epoch": 2.766314718229233, + "ewc_loss": 0.008543405681848526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.543406147509813e-05, + "grad_norm": 4.2744855880737305, + "learning_rate": 1e-06, + "loss": 0.3439, + 
"mean_token_accuracy": 0.8789312839508057, + "num_tokens": 829834049.0, + "step": 21746 + }, + { + "epoch": 2.7664419285078234, + "ewc_loss": 0.00853798445314169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537984831491485e-05, + "grad_norm": 4.2814154624938965, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8828920722007751, + "num_tokens": 829870853.0, + "step": 21747 + }, + { + "epoch": 2.766569138786414, + "ewc_loss": 0.008532144129276276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53214442031458e-05, + "grad_norm": 4.321755886077881, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8769389986991882, + "num_tokens": 829907519.0, + "step": 21748 + }, + { + "epoch": 2.7666963490650045, + "ewc_loss": 0.008554873056709766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.554873056709766e-05, + "grad_norm": 4.279002666473389, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8704320192337036, + "num_tokens": 829947162.0, + "step": 21749 + }, + { + "epoch": 2.766823559343595, + "ewc_loss": 0.008515775203704834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.51577497087419e-05, + "grad_norm": 4.234745979309082, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8814006447792053, + "num_tokens": 829991981.0, + "step": 21750 + }, + { + "epoch": 2.7669507696221856, + "ewc_loss": 0.008504167199134827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.504166908096522e-05, + "grad_norm": 4.24575138092041, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8887053728103638, + "num_tokens": 830032409.0, + "step": 21751 + }, + { + "epoch": 2.767077979900776, + "ewc_loss": 0.008530022576451302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530022751074284e-05, + "grad_norm": 4.240724563598633, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8856147527694702, + "num_tokens": 830072738.0, + "step": 21752 + }, + { + "epoch": 
2.7672051901793666, + "ewc_loss": 0.008517734706401825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.517735113855451e-05, + "grad_norm": 4.261159420013428, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8804366588592529, + "num_tokens": 830110880.0, + "step": 21753 + }, + { + "epoch": 2.767332400457957, + "ewc_loss": 0.008520885370671749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520885603502393e-05, + "grad_norm": 4.252401828765869, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8774921894073486, + "num_tokens": 830150148.0, + "step": 21754 + }, + { + "epoch": 2.7674596107365477, + "ewc_loss": 0.0085129514336586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512951899319887e-05, + "grad_norm": 4.315492153167725, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8723728656768799, + "num_tokens": 830190717.0, + "step": 21755 + }, + { + "epoch": 2.767586821015138, + "ewc_loss": 0.008546705357730389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546705794287845e-05, + "grad_norm": 4.273108959197998, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8846796154975891, + "num_tokens": 830226972.0, + "step": 21756 + }, + { + "epoch": 2.7677140312937283, + "ewc_loss": 0.008508804254233837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.508804603479803e-05, + "grad_norm": 4.279829025268555, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8908082246780396, + "num_tokens": 830262612.0, + "step": 21757 + }, + { + "epoch": 2.7678412415723193, + "ewc_loss": 0.008524813689291477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52481389301829e-05, + "grad_norm": 4.226936340332031, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8897263407707214, + "num_tokens": 830306856.0, + "step": 21758 + }, + { + "epoch": 2.7679684518509093, + "ewc_loss": 0.008506103418767452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.506103768013418e-05, + "grad_norm": 4.347728729248047, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8821788430213928, + "num_tokens": 830340726.0, + "step": 21759 + }, + { + "epoch": 2.7680956621295003, + "ewc_loss": 0.008582193404436111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582193549955264e-05, + "grad_norm": 4.250946998596191, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8838462829589844, + "num_tokens": 830380697.0, + "step": 21760 + }, + { + "epoch": 2.7682228724080904, + "ewc_loss": 0.008491814136505127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.491813787259161e-05, + "grad_norm": 4.299871921539307, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8634762167930603, + "num_tokens": 830414737.0, + "step": 21761 + }, + { + "epoch": 2.768350082686681, + "ewc_loss": 0.008585701696574688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585702016716823e-05, + "grad_norm": 4.2458367347717285, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.882146954536438, + "num_tokens": 830456518.0, + "step": 21762 + }, + { + "epoch": 2.7684772929652715, + "ewc_loss": 0.008542225696146488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542225259589031e-05, + "grad_norm": 4.251375198364258, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.880309522151947, + "num_tokens": 830494192.0, + "step": 21763 + }, + { + "epoch": 2.768604503243862, + "ewc_loss": 0.008571615442633629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.571615035180002e-05, + "grad_norm": 4.284122943878174, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8764024972915649, + "num_tokens": 830533659.0, + "step": 21764 + }, + { + "epoch": 2.7687317135224525, + "ewc_loss": 0.008583533577620983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583533781347796e-05, + "grad_norm": 4.290262699127197, + "learning_rate": 1e-06, + "loss": 0.3403, + 
"mean_token_accuracy": 0.8770115375518799, + "num_tokens": 830569605.0, + "step": 21765 + }, + { + "epoch": 2.768858923801043, + "ewc_loss": 0.008579508401453495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57950872159563e-05, + "grad_norm": 4.317738056182861, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8631025552749634, + "num_tokens": 830610814.0, + "step": 21766 + }, + { + "epoch": 2.7689861340796336, + "ewc_loss": 0.0085846446454525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.584644820075482e-05, + "grad_norm": 4.267838478088379, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8815914392471313, + "num_tokens": 830646811.0, + "step": 21767 + }, + { + "epoch": 2.769113344358224, + "ewc_loss": 0.008569173514842987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569173951400444e-05, + "grad_norm": 4.2761993408203125, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8870369791984558, + "num_tokens": 830680984.0, + "step": 21768 + }, + { + "epoch": 2.7692405546368146, + "ewc_loss": 0.00858014915138483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580149005865678e-05, + "grad_norm": 4.377026081085205, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8829648494720459, + "num_tokens": 830713384.0, + "step": 21769 + }, + { + "epoch": 2.769367764915405, + "ewc_loss": 0.008650263771414757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.650263771414757e-05, + "grad_norm": 4.4926371574401855, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.871354341506958, + "num_tokens": 830746922.0, + "step": 21770 + }, + { + "epoch": 2.7694949751939957, + "ewc_loss": 0.008678230457007885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.678230369696394e-05, + "grad_norm": 4.163151264190674, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8893874883651733, + "num_tokens": 830789459.0, + "step": 21771 + }, + { + "epoch": 
2.769622185472586, + "ewc_loss": 0.00848614051938057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486140723107383e-05, + "grad_norm": 4.2758026123046875, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8798530101776123, + "num_tokens": 830826736.0, + "step": 21772 + }, + { + "epoch": 2.7697493957511767, + "ewc_loss": 0.008673352189362049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.673352567711845e-05, + "grad_norm": 4.222013473510742, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8860227465629578, + "num_tokens": 830868118.0, + "step": 21773 + }, + { + "epoch": 2.7698766060297673, + "ewc_loss": 0.00858540553599596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585405157646164e-05, + "grad_norm": 4.238593578338623, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8799713253974915, + "num_tokens": 830911658.0, + "step": 21774 + }, + { + "epoch": 2.770003816308358, + "ewc_loss": 0.008624464273452759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624464680906385e-05, + "grad_norm": 4.297269821166992, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8844257593154907, + "num_tokens": 830948456.0, + "step": 21775 + }, + { + "epoch": 2.7701310265869483, + "ewc_loss": 0.008625605143606663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625605551060289e-05, + "grad_norm": 4.2850661277771, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8899028301239014, + "num_tokens": 830988790.0, + "step": 21776 + }, + { + "epoch": 2.770258236865539, + "ewc_loss": 0.008624490350484848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624490146758035e-05, + "grad_norm": 4.328289031982422, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8750152587890625, + "num_tokens": 831024551.0, + "step": 21777 + }, + { + "epoch": 2.7703854471441294, + "ewc_loss": 0.008645106106996536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.645105845062062e-05, + "grad_norm": 4.283112525939941, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8800633549690247, + "num_tokens": 831060082.0, + "step": 21778 + }, + { + "epoch": 2.77051265742272, + "ewc_loss": 0.008599906228482723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.599906141171232e-05, + "grad_norm": 4.241568088531494, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8837361931800842, + "num_tokens": 831100067.0, + "step": 21779 + }, + { + "epoch": 2.77063986770131, + "ewc_loss": 0.00859262514859438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592625090386719e-05, + "grad_norm": 4.276247501373291, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8937909603118896, + "num_tokens": 831131923.0, + "step": 21780 + }, + { + "epoch": 2.770767077979901, + "ewc_loss": 0.008634313009679317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.634313417132944e-05, + "grad_norm": 4.292796611785889, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8698831796646118, + "num_tokens": 831167985.0, + "step": 21781 + }, + { + "epoch": 2.770894288258491, + "ewc_loss": 0.008637542836368084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63754321471788e-05, + "grad_norm": 4.2345099449157715, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8762266635894775, + "num_tokens": 831211737.0, + "step": 21782 + }, + { + "epoch": 2.771021498537082, + "ewc_loss": 0.00859486311674118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594863174948841e-05, + "grad_norm": 4.3092360496521, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8726697564125061, + "num_tokens": 831248764.0, + "step": 21783 + }, + { + "epoch": 2.771148708815672, + "ewc_loss": 0.008667295798659325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667295332998037e-05, + "grad_norm": 4.300631523132324, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 
0.8896983861923218, + "num_tokens": 831284140.0, + "step": 21784 + }, + { + "epoch": 2.771275919094263, + "ewc_loss": 0.008627250790596008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627250645076856e-05, + "grad_norm": 4.2908806800842285, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8844945430755615, + "num_tokens": 831318567.0, + "step": 21785 + }, + { + "epoch": 2.771403129372853, + "ewc_loss": 0.008620258420705795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.620258449809626e-05, + "grad_norm": 4.269032001495361, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8749248385429382, + "num_tokens": 831357128.0, + "step": 21786 + }, + { + "epoch": 2.7715303396514437, + "ewc_loss": 0.008632922545075417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.632922254037112e-05, + "grad_norm": 4.315390110015869, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8634361624717712, + "num_tokens": 831391551.0, + "step": 21787 + }, + { + "epoch": 2.771657549930034, + "ewc_loss": 0.008667741902172565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667742076795548e-05, + "grad_norm": 4.283221244812012, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8788180351257324, + "num_tokens": 831430332.0, + "step": 21788 + }, + { + "epoch": 2.7717847602086247, + "ewc_loss": 0.008638640865683556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63864115672186e-05, + "grad_norm": 4.221749782562256, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8894422650337219, + "num_tokens": 831471639.0, + "step": 21789 + }, + { + "epoch": 2.7719119704872153, + "ewc_loss": 0.008617505431175232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.617505227448419e-05, + "grad_norm": 4.3529791831970215, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8860337138175964, + "num_tokens": 831506470.0, + "step": 21790 + }, + { + "epoch": 2.772039180765806, + "ewc_loss": 
0.008705256506800652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.705256914254278e-05, + "grad_norm": 4.258016586303711, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8761645555496216, + "num_tokens": 831547212.0, + "step": 21791 + }, + { + "epoch": 2.7721663910443963, + "ewc_loss": 0.00860917940735817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609179349150509e-05, + "grad_norm": 4.2927961349487305, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8680834174156189, + "num_tokens": 831593181.0, + "step": 21792 + }, + { + "epoch": 2.772293601322987, + "ewc_loss": 0.00866755098104477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667550719110295e-05, + "grad_norm": 4.295629501342773, + "learning_rate": 1e-06, + "loss": 0.284, + "mean_token_accuracy": 0.8976534605026245, + "num_tokens": 831624889.0, + "step": 21793 + }, + { + "epoch": 2.7724208116015774, + "ewc_loss": 0.00863367784768343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.633677498437464e-05, + "grad_norm": 4.256314277648926, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8880127668380737, + "num_tokens": 831662389.0, + "step": 21794 + }, + { + "epoch": 2.772548021880168, + "ewc_loss": 0.008617272600531578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.617272396804765e-05, + "grad_norm": 4.237876892089844, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8858455419540405, + "num_tokens": 831705282.0, + "step": 21795 + }, + { + "epoch": 2.7726752321587584, + "ewc_loss": 0.008616091683506966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616091508883983e-05, + "grad_norm": 4.264105319976807, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8877155780792236, + "num_tokens": 831741604.0, + "step": 21796 + }, + { + "epoch": 2.772802442437349, + "ewc_loss": 0.008626054041087627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.626054477645084e-05, + "grad_norm": 
4.274295330047607, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8740335702896118, + "num_tokens": 831784165.0, + "step": 21797 + }, + { + "epoch": 2.7729296527159395, + "ewc_loss": 0.008615460246801376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615459955763072e-05, + "grad_norm": 4.2827534675598145, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8867260217666626, + "num_tokens": 831821101.0, + "step": 21798 + }, + { + "epoch": 2.77305686299453, + "ewc_loss": 0.008646741509437561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64674148033373e-05, + "grad_norm": 4.275553226470947, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8866410255432129, + "num_tokens": 831855766.0, + "step": 21799 + }, + { + "epoch": 2.7731840732731206, + "ewc_loss": 0.00861138291656971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611383236711845e-05, + "grad_norm": 4.225710391998291, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8881508708000183, + "num_tokens": 831895692.0, + "step": 21800 + }, + { + "epoch": 2.773311283551711, + "ewc_loss": 0.008589978329837322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589978097006679e-05, + "grad_norm": 4.262153625488281, + "learning_rate": 1e-06, + "loss": 0.2756, + "mean_token_accuracy": 0.899833083152771, + "num_tokens": 831931079.0, + "step": 21801 + }, + { + "epoch": 2.7734384938303016, + "ewc_loss": 0.008601474575698376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60147483763285e-05, + "grad_norm": 4.283667087554932, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8574175834655762, + "num_tokens": 831969742.0, + "step": 21802 + }, + { + "epoch": 2.773565704108892, + "ewc_loss": 0.008621386252343655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.621386223239824e-05, + "grad_norm": 4.2973103523254395, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8726838231086731, + "num_tokens": 
832011059.0, + "step": 21803 + }, + { + "epoch": 2.7736929143874827, + "ewc_loss": 0.008602701127529144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60270083649084e-05, + "grad_norm": 4.315974235534668, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8910303115844727, + "num_tokens": 832042768.0, + "step": 21804 + }, + { + "epoch": 2.7738201246660728, + "ewc_loss": 0.008612747304141521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612747478764504e-05, + "grad_norm": 4.285510063171387, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.890015721321106, + "num_tokens": 832077699.0, + "step": 21805 + }, + { + "epoch": 2.7739473349446637, + "ewc_loss": 0.00857500173151493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575001993449405e-05, + "grad_norm": 4.275288105010986, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8901660442352295, + "num_tokens": 832113528.0, + "step": 21806 + }, + { + "epoch": 2.774074545223254, + "ewc_loss": 0.00859008263796568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590082870796323e-05, + "grad_norm": 4.2595672607421875, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8859695196151733, + "num_tokens": 832149108.0, + "step": 21807 + }, + { + "epoch": 2.774201755501845, + "ewc_loss": 0.008600118570029736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600118599133566e-05, + "grad_norm": 4.256107330322266, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8982347846031189, + "num_tokens": 832189279.0, + "step": 21808 + }, + { + "epoch": 2.774328965780435, + "ewc_loss": 0.008604580536484718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604580943938345e-05, + "grad_norm": 4.3107805252075195, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8849948644638062, + "num_tokens": 832224423.0, + "step": 21809 + }, + { + "epoch": 2.774456176059026, + "ewc_loss": 0.008641381748020649, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 8.641382009955123e-05, + "grad_norm": 4.31193208694458, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8781172037124634, + "num_tokens": 832260389.0, + "step": 21810 + }, + { + "epoch": 2.774583386337616, + "ewc_loss": 0.008623388595879078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623388566775247e-05, + "grad_norm": 4.31610107421875, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8820765614509583, + "num_tokens": 832297591.0, + "step": 21811 + }, + { + "epoch": 2.7747105966162064, + "ewc_loss": 0.008639179170131683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63917957758531e-05, + "grad_norm": 4.2890520095825195, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8884769082069397, + "num_tokens": 832330195.0, + "step": 21812 + }, + { + "epoch": 2.774837806894797, + "ewc_loss": 0.008616223931312561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616223931312561e-05, + "grad_norm": 4.327159881591797, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8900874257087708, + "num_tokens": 832361613.0, + "step": 21813 + }, + { + "epoch": 2.7749650171733875, + "ewc_loss": 0.00865568220615387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.655682177050039e-05, + "grad_norm": 4.260452747344971, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8889478445053101, + "num_tokens": 832402800.0, + "step": 21814 + }, + { + "epoch": 2.775092227451978, + "ewc_loss": 0.008622266352176666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622265886515379e-05, + "grad_norm": 4.280273914337158, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.884755551815033, + "num_tokens": 832445161.0, + "step": 21815 + }, + { + "epoch": 2.7752194377305686, + "ewc_loss": 0.00863867811858654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.638678264105693e-05, + "grad_norm": 4.277106761932373, + "learning_rate": 1e-06, + "loss": 
0.3722, + "mean_token_accuracy": 0.8697857856750488, + "num_tokens": 832487905.0, + "step": 21816 + }, + { + "epoch": 2.775346648009159, + "ewc_loss": 0.008638078346848488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.638077997602522e-05, + "grad_norm": 4.306665420532227, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.884901762008667, + "num_tokens": 832521107.0, + "step": 21817 + }, + { + "epoch": 2.7754738582877496, + "ewc_loss": 0.008655702695250511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.655702549731359e-05, + "grad_norm": 4.306370735168457, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8789498209953308, + "num_tokens": 832557213.0, + "step": 21818 + }, + { + "epoch": 2.77560106856634, + "ewc_loss": 0.008651693351566792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.651693497085944e-05, + "grad_norm": 4.275635719299316, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8706696629524231, + "num_tokens": 832600445.0, + "step": 21819 + }, + { + "epoch": 2.7757282788449307, + "ewc_loss": 0.00862977933138609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.629779040347785e-05, + "grad_norm": 4.28980016708374, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8796508312225342, + "num_tokens": 832635994.0, + "step": 21820 + }, + { + "epoch": 2.775855489123521, + "ewc_loss": 0.008649575524032116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.649575465824455e-05, + "grad_norm": 4.400248050689697, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8933534622192383, + "num_tokens": 832677151.0, + "step": 21821 + }, + { + "epoch": 2.7759826994021117, + "ewc_loss": 0.008694685064256191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.69468494784087e-05, + "grad_norm": 4.261242866516113, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8822247982025146, + "num_tokens": 832713995.0, + "step": 21822 + }, + { + "epoch": 
2.7761099096807023, + "ewc_loss": 0.008604963310062885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60496293171309e-05, + "grad_norm": 4.256550312042236, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8743753433227539, + "num_tokens": 832754201.0, + "step": 21823 + }, + { + "epoch": 2.776237119959293, + "ewc_loss": 0.008635936304926872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.635935955680907e-05, + "grad_norm": 4.246789455413818, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8796373009681702, + "num_tokens": 832796920.0, + "step": 21824 + }, + { + "epoch": 2.7763643302378833, + "ewc_loss": 0.008644192479550838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.644192712381482e-05, + "grad_norm": 4.251494407653809, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8823506236076355, + "num_tokens": 832835952.0, + "step": 21825 + }, + { + "epoch": 2.776491540516474, + "ewc_loss": 0.008631362579762936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631363016320392e-05, + "grad_norm": 4.234912395477295, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8857849836349487, + "num_tokens": 832878306.0, + "step": 21826 + }, + { + "epoch": 2.7766187507950644, + "ewc_loss": 0.008591754361987114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.591754158260301e-05, + "grad_norm": 4.300415515899658, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.875525951385498, + "num_tokens": 832914315.0, + "step": 21827 + }, + { + "epoch": 2.776745961073655, + "ewc_loss": 0.008662478998303413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.662478649057448e-05, + "grad_norm": 4.258887767791748, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8788309693336487, + "num_tokens": 832956661.0, + "step": 21828 + }, + { + "epoch": 2.7768731713522454, + "ewc_loss": 0.008606227114796638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.606226765550673e-05, + "grad_norm": 4.262935638427734, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8878640532493591, + "num_tokens": 832999446.0, + "step": 21829 + }, + { + "epoch": 2.7770003816308355, + "ewc_loss": 0.00861340295523405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613403042545542e-05, + "grad_norm": 4.334312438964844, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8571920394897461, + "num_tokens": 833039001.0, + "step": 21830 + }, + { + "epoch": 2.7771275919094265, + "ewc_loss": 0.008631519041955471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631519449409097e-05, + "grad_norm": 4.298529624938965, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8817148208618164, + "num_tokens": 833074334.0, + "step": 21831 + }, + { + "epoch": 2.7772548021880166, + "ewc_loss": 0.008569302968680859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569302735850215e-05, + "grad_norm": 4.289332866668701, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8685473203659058, + "num_tokens": 833113089.0, + "step": 21832 + }, + { + "epoch": 2.7773820124666075, + "ewc_loss": 0.008611183613538742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611183147877455e-05, + "grad_norm": 4.335042953491211, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8798255920410156, + "num_tokens": 833149755.0, + "step": 21833 + }, + { + "epoch": 2.7775092227451976, + "ewc_loss": 0.008615073747932911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615073602413759e-05, + "grad_norm": 4.302910327911377, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8748005032539368, + "num_tokens": 833187718.0, + "step": 21834 + }, + { + "epoch": 2.7776364330237886, + "ewc_loss": 0.008586183190345764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58618295751512e-05, + "grad_norm": 4.272833824157715, + "learning_rate": 1e-06, + "loss": 0.3129, + 
"mean_token_accuracy": 0.8904297947883606, + "num_tokens": 833221416.0, + "step": 21835 + }, + { + "epoch": 2.7777636433023787, + "ewc_loss": 0.00859538558870554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59538558870554e-05, + "grad_norm": 4.2086968421936035, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8962245583534241, + "num_tokens": 833261101.0, + "step": 21836 + }, + { + "epoch": 2.777890853580969, + "ewc_loss": 0.008552740328013897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552740473533049e-05, + "grad_norm": 4.267755031585693, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8910838961601257, + "num_tokens": 833296804.0, + "step": 21837 + }, + { + "epoch": 2.7780180638595597, + "ewc_loss": 0.008620701730251312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62070155562833e-05, + "grad_norm": 4.337460517883301, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8826732635498047, + "num_tokens": 833332005.0, + "step": 21838 + }, + { + "epoch": 2.7781452741381503, + "ewc_loss": 0.0086232153698802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62321539898403e-05, + "grad_norm": 4.252612113952637, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8792287111282349, + "num_tokens": 833371991.0, + "step": 21839 + }, + { + "epoch": 2.778272484416741, + "ewc_loss": 0.008579220622777939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579220593674108e-05, + "grad_norm": 4.274557113647461, + "learning_rate": 1e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8943038582801819, + "num_tokens": 833407514.0, + "step": 21840 + }, + { + "epoch": 2.7783996946953313, + "ewc_loss": 0.008642288856208324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64228859427385e-05, + "grad_norm": 4.3032002449035645, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8819204568862915, + "num_tokens": 833444519.0, + "step": 21841 + }, + { + "epoch": 
2.778526904973922, + "ewc_loss": 0.008637324906885624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.637324935989454e-05, + "grad_norm": 4.330300331115723, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8781030178070068, + "num_tokens": 833482252.0, + "step": 21842 + }, + { + "epoch": 2.7786541152525124, + "ewc_loss": 0.008637343533337116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.637343853479251e-05, + "grad_norm": 4.283049583435059, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8807628154754639, + "num_tokens": 833515513.0, + "step": 21843 + }, + { + "epoch": 2.778781325531103, + "ewc_loss": 0.008613456971943378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613456884631887e-05, + "grad_norm": 4.273655414581299, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8893117904663086, + "num_tokens": 833551188.0, + "step": 21844 + }, + { + "epoch": 2.7789085358096934, + "ewc_loss": 0.008636508136987686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636508573545143e-05, + "grad_norm": 4.2601518630981445, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.8916690945625305, + "num_tokens": 833589159.0, + "step": 21845 + }, + { + "epoch": 2.779035746088284, + "ewc_loss": 0.008635826408863068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.635826088720933e-05, + "grad_norm": 4.307323455810547, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8740177154541016, + "num_tokens": 833623581.0, + "step": 21846 + }, + { + "epoch": 2.7791629563668745, + "ewc_loss": 0.008662814274430275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.662814070703462e-05, + "grad_norm": 4.3286919593811035, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8737350106239319, + "num_tokens": 833660466.0, + "step": 21847 + }, + { + "epoch": 2.779290166645465, + "ewc_loss": 0.00865340605378151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.653406257508323e-05, + "grad_norm": 4.291590690612793, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8735147714614868, + "num_tokens": 833697465.0, + "step": 21848 + }, + { + "epoch": 2.7794173769240555, + "ewc_loss": 0.008639409206807613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.639408770250157e-05, + "grad_norm": 4.287364959716797, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8808757662773132, + "num_tokens": 833735178.0, + "step": 21849 + }, + { + "epoch": 2.779544587202646, + "ewc_loss": 0.008643129840493202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64312969497405e-05, + "grad_norm": 4.285253047943115, + "learning_rate": 1e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.8982067704200745, + "num_tokens": 833770193.0, + "step": 21850 + }, + { + "epoch": 2.7796717974812366, + "ewc_loss": 0.00866520032286644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.665200584800914e-05, + "grad_norm": 4.31343412399292, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8837247490882874, + "num_tokens": 833807836.0, + "step": 21851 + }, + { + "epoch": 2.779799007759827, + "ewc_loss": 0.008659183979034424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.659184095449746e-05, + "grad_norm": 4.258877277374268, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8794440627098083, + "num_tokens": 833847605.0, + "step": 21852 + }, + { + "epoch": 2.7799262180384177, + "ewc_loss": 0.008624427951872349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624427573522553e-05, + "grad_norm": 4.281970024108887, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8868170976638794, + "num_tokens": 833884239.0, + "step": 21853 + }, + { + "epoch": 2.780053428317008, + "ewc_loss": 0.008634033612906933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.634034020360559e-05, + "grad_norm": 4.265111446380615, + "learning_rate": 1e-06, + "loss": 0.345, + 
"mean_token_accuracy": 0.8842632174491882, + "num_tokens": 833921305.0, + "step": 21854 + }, + { + "epoch": 2.7801806385955983, + "ewc_loss": 0.008624084293842316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624084148323163e-05, + "grad_norm": 4.325176239013672, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8744183778762817, + "num_tokens": 833959297.0, + "step": 21855 + }, + { + "epoch": 2.7803078488741892, + "ewc_loss": 0.008660007268190384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.66000700625591e-05, + "grad_norm": 4.366940021514893, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8624446988105774, + "num_tokens": 833992135.0, + "step": 21856 + }, + { + "epoch": 2.7804350591527793, + "ewc_loss": 0.008670879527926445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.670879469718784e-05, + "grad_norm": 4.274437427520752, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8741084337234497, + "num_tokens": 834035933.0, + "step": 21857 + }, + { + "epoch": 2.7805622694313703, + "ewc_loss": 0.008607687428593636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607687777839601e-05, + "grad_norm": 4.327036380767822, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8816405534744263, + "num_tokens": 834070187.0, + "step": 21858 + }, + { + "epoch": 2.7806894797099604, + "ewc_loss": 0.008673734962940216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67373455548659e-05, + "grad_norm": 4.245423793792725, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8836155533790588, + "num_tokens": 834107370.0, + "step": 21859 + }, + { + "epoch": 2.780816689988551, + "ewc_loss": 0.008621767163276672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.621767483418807e-05, + "grad_norm": 4.265753269195557, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8743560910224915, + "num_tokens": 834145357.0, + "step": 21860 + }, + { + "epoch": 
2.7809439002671414, + "ewc_loss": 0.008663217537105083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.663217886351049e-05, + "grad_norm": 4.271440029144287, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8788006901741028, + "num_tokens": 834187250.0, + "step": 21861 + }, + { + "epoch": 2.781071110545732, + "ewc_loss": 0.008647050708532333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647050708532333e-05, + "grad_norm": 4.259716033935547, + "learning_rate": 1e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8926519751548767, + "num_tokens": 834227598.0, + "step": 21862 + }, + { + "epoch": 2.7811983208243225, + "ewc_loss": 0.008618735708296299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618735591880977e-05, + "grad_norm": 4.252193450927734, + "learning_rate": 1e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.8957507014274597, + "num_tokens": 834267032.0, + "step": 21863 + }, + { + "epoch": 2.781325531102913, + "ewc_loss": 0.008620708249509335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.620708103990182e-05, + "grad_norm": 4.267674446105957, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8729429244995117, + "num_tokens": 834308479.0, + "step": 21864 + }, + { + "epoch": 2.7814527413815036, + "ewc_loss": 0.008630986325442791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.630986121715978e-05, + "grad_norm": 4.246708393096924, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.8910714387893677, + "num_tokens": 834346185.0, + "step": 21865 + }, + { + "epoch": 2.781579951660094, + "ewc_loss": 0.008607360534369946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607360359746963e-05, + "grad_norm": 4.29376745223999, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.890442967414856, + "num_tokens": 834384404.0, + "step": 21866 + }, + { + "epoch": 2.7817071619386846, + "ewc_loss": 0.0086296321824193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.629632066003978e-05, + "grad_norm": 4.298527240753174, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8793233633041382, + "num_tokens": 834424833.0, + "step": 21867 + }, + { + "epoch": 2.781834372217275, + "ewc_loss": 0.008600512519478798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600512228440493e-05, + "grad_norm": 4.254642963409424, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8867269158363342, + "num_tokens": 834464215.0, + "step": 21868 + }, + { + "epoch": 2.7819615824958657, + "ewc_loss": 0.008585858158767223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585858449805528e-05, + "grad_norm": 4.391317367553711, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.861382782459259, + "num_tokens": 834502452.0, + "step": 21869 + }, + { + "epoch": 2.782088792774456, + "ewc_loss": 0.0086662657558918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.66626578499563e-05, + "grad_norm": 4.286171913146973, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8815156817436218, + "num_tokens": 834539690.0, + "step": 21870 + }, + { + "epoch": 2.7822160030530467, + "ewc_loss": 0.00856009591370821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560095739085227e-05, + "grad_norm": 4.2545166015625, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8728947639465332, + "num_tokens": 834583193.0, + "step": 21871 + }, + { + "epoch": 2.7823432133316373, + "ewc_loss": 0.008571146056056023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.571145735913888e-05, + "grad_norm": 4.319913864135742, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8769718408584595, + "num_tokens": 834620028.0, + "step": 21872 + }, + { + "epoch": 2.782470423610228, + "ewc_loss": 0.00863560102880001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.635601261630654e-05, + "grad_norm": 4.329607963562012, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 
0.8738633394241333, + "num_tokens": 834657351.0, + "step": 21873 + }, + { + "epoch": 2.7825976338888183, + "ewc_loss": 0.00858959648758173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589596109231934e-05, + "grad_norm": 4.2109832763671875, + "learning_rate": 1e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.8964175581932068, + "num_tokens": 834693687.0, + "step": 21874 + }, + { + "epoch": 2.782724844167409, + "ewc_loss": 0.008544658310711384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.544658339815214e-05, + "grad_norm": 4.276752948760986, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8814874291419983, + "num_tokens": 834736171.0, + "step": 21875 + }, + { + "epoch": 2.7828520544459994, + "ewc_loss": 0.008611642755568027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611642988398671e-05, + "grad_norm": 4.223572731018066, + "learning_rate": 1e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8953002691268921, + "num_tokens": 834775699.0, + "step": 21876 + }, + { + "epoch": 2.78297926472459, + "ewc_loss": 0.008533649146556854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533649088349193e-05, + "grad_norm": 4.261800289154053, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8771040439605713, + "num_tokens": 834816053.0, + "step": 21877 + }, + { + "epoch": 2.78310647500318, + "ewc_loss": 0.008583631366491318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583631279179826e-05, + "grad_norm": 4.251526355743408, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8802044987678528, + "num_tokens": 834857125.0, + "step": 21878 + }, + { + "epoch": 2.783233685281771, + "ewc_loss": 0.008566748350858688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566748147131875e-05, + "grad_norm": 4.2733941078186035, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8939735293388367, + "num_tokens": 834895676.0, + "step": 21879 + }, + { + "epoch": 2.783360895560361, + "ewc_loss": 
0.008574660867452621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5746607510373e-05, + "grad_norm": 4.330873012542725, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8831802606582642, + "num_tokens": 834927748.0, + "step": 21880 + }, + { + "epoch": 2.783488105838952, + "ewc_loss": 0.008599457331001759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.599457214586437e-05, + "grad_norm": 4.2717742919921875, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8787575364112854, + "num_tokens": 834966722.0, + "step": 21881 + }, + { + "epoch": 2.783615316117542, + "ewc_loss": 0.008533550426363945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533550862921402e-05, + "grad_norm": 4.280189514160156, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8846030235290527, + "num_tokens": 835003770.0, + "step": 21882 + }, + { + "epoch": 2.783742526396133, + "ewc_loss": 0.008585937321186066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585937757743523e-05, + "grad_norm": 4.300459861755371, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8897162675857544, + "num_tokens": 835041504.0, + "step": 21883 + }, + { + "epoch": 2.783869736674723, + "ewc_loss": 0.008594825863838196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594826067565009e-05, + "grad_norm": 4.267084121704102, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8851101994514465, + "num_tokens": 835075179.0, + "step": 21884 + }, + { + "epoch": 2.7839969469533137, + "ewc_loss": 0.008587297983467579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.587297634221613e-05, + "grad_norm": 4.312704563140869, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.872723400592804, + "num_tokens": 835113080.0, + "step": 21885 + }, + { + "epoch": 2.784124157231904, + "ewc_loss": 0.00861523486673832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615235128672794e-05, + "grad_norm": 
4.2897725105285645, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.881880521774292, + "num_tokens": 835150586.0, + "step": 21886 + }, + { + "epoch": 2.7842513675104947, + "ewc_loss": 0.008587141521275043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.587141201132908e-05, + "grad_norm": 4.329493045806885, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8805177807807922, + "num_tokens": 835184049.0, + "step": 21887 + }, + { + "epoch": 2.7843785777890853, + "ewc_loss": 0.008627187460660934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627187344245613e-05, + "grad_norm": 4.289528846740723, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8768898844718933, + "num_tokens": 835222745.0, + "step": 21888 + }, + { + "epoch": 2.784505788067676, + "ewc_loss": 0.008602793328464031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60279324115254e-05, + "grad_norm": 4.2612080574035645, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8696913719177246, + "num_tokens": 835263017.0, + "step": 21889 + }, + { + "epoch": 2.7846329983462663, + "ewc_loss": 0.008621816523373127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.621816959930584e-05, + "grad_norm": 4.210551738739014, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8946371078491211, + "num_tokens": 835303916.0, + "step": 21890 + }, + { + "epoch": 2.784760208624857, + "ewc_loss": 0.008604410104453564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604409958934411e-05, + "grad_norm": 4.256545543670654, + "learning_rate": 1e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.896536648273468, + "num_tokens": 835343148.0, + "step": 21891 + }, + { + "epoch": 2.7848874189034474, + "ewc_loss": 0.008647527545690536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647527283756062e-05, + "grad_norm": 4.273087024688721, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8795761466026306, + 
"num_tokens": 835382337.0, + "step": 21892 + }, + { + "epoch": 2.785014629182038, + "ewc_loss": 0.008626307360827923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.626307680970058e-05, + "grad_norm": 4.278338432312012, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8751926422119141, + "num_tokens": 835423645.0, + "step": 21893 + }, + { + "epoch": 2.7851418394606284, + "ewc_loss": 0.008627869188785553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627869101474062e-05, + "grad_norm": 4.3301496505737305, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8566514253616333, + "num_tokens": 835464590.0, + "step": 21894 + }, + { + "epoch": 2.785269049739219, + "ewc_loss": 0.008657483384013176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.657483704155311e-05, + "grad_norm": 4.62001371383667, + "learning_rate": 1e-06, + "loss": 0.2774, + "mean_token_accuracy": 0.9040223360061646, + "num_tokens": 835496376.0, + "step": 21895 + }, + { + "epoch": 2.7853962600178095, + "ewc_loss": 0.00878618098795414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.786180842434987e-05, + "grad_norm": 4.254176139831543, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.867715060710907, + "num_tokens": 835541231.0, + "step": 21896 + }, + { + "epoch": 2.7855234702964, + "ewc_loss": 0.008498115465044975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.498115494148806e-05, + "grad_norm": 4.23844575881958, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8868577480316162, + "num_tokens": 835584038.0, + "step": 21897 + }, + { + "epoch": 2.7856506805749905, + "ewc_loss": 0.008602156303822994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602155867265537e-05, + "grad_norm": 4.252735614776611, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8757885694503784, + "num_tokens": 835627189.0, + "step": 21898 + }, + { + "epoch": 2.785777890853581, + "ewc_loss": 0.008604282513260841, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604282629676163e-05, + "grad_norm": 4.312948226928711, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8836534023284912, + "num_tokens": 835659701.0, + "step": 21899 + }, + { + "epoch": 2.7859051011321716, + "ewc_loss": 0.00862494483590126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624944894108921e-05, + "grad_norm": 4.294360160827637, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.889163613319397, + "num_tokens": 835694045.0, + "step": 21900 + }, + { + "epoch": 2.786032311410762, + "ewc_loss": 0.008575700223445892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575700485380366e-05, + "grad_norm": 4.270624160766602, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8798341155052185, + "num_tokens": 835732118.0, + "step": 21901 + }, + { + "epoch": 2.7861595216893527, + "ewc_loss": 0.008601859211921692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601859008194879e-05, + "grad_norm": 4.261213779449463, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8907093405723572, + "num_tokens": 835772700.0, + "step": 21902 + }, + { + "epoch": 2.7862867319679427, + "ewc_loss": 0.00858089979737997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580899884691462e-05, + "grad_norm": 4.310428619384766, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8669883608818054, + "num_tokens": 835811206.0, + "step": 21903 + }, + { + "epoch": 2.7864139422465337, + "ewc_loss": 0.008629628457129002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.629628428025171e-05, + "grad_norm": 4.2896409034729, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8885053396224976, + "num_tokens": 835844381.0, + "step": 21904 + }, + { + "epoch": 2.786541152525124, + "ewc_loss": 0.00859594251960516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595942199463025e-05, + "grad_norm": 4.311675548553467, + "learning_rate": 
1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8757374286651611, + "num_tokens": 835879926.0, + "step": 21905 + }, + { + "epoch": 2.7866683628037148, + "ewc_loss": 0.008627046830952168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627046918263659e-05, + "grad_norm": 4.2418694496154785, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8697462677955627, + "num_tokens": 835924014.0, + "step": 21906 + }, + { + "epoch": 2.786795573082305, + "ewc_loss": 0.00857248529791832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572485239710659e-05, + "grad_norm": 4.309096813201904, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.876811146736145, + "num_tokens": 835956054.0, + "step": 21907 + }, + { + "epoch": 2.786922783360896, + "ewc_loss": 0.00864560529589653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.645605703350157e-05, + "grad_norm": 4.279000282287598, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8868558406829834, + "num_tokens": 835988885.0, + "step": 21908 + }, + { + "epoch": 2.787049993639486, + "ewc_loss": 0.008594898506999016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594898827141151e-05, + "grad_norm": 4.20996618270874, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8811162114143372, + "num_tokens": 836028885.0, + "step": 21909 + }, + { + "epoch": 2.7871772039180764, + "ewc_loss": 0.008589856326580048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58985586091876e-05, + "grad_norm": 4.242159843444824, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8940384387969971, + "num_tokens": 836067340.0, + "step": 21910 + }, + { + "epoch": 2.787304414196667, + "ewc_loss": 0.008620348758995533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.620348671684042e-05, + "grad_norm": 4.253207683563232, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8839486837387085, + "num_tokens": 836104432.0, + "step": 21911 + }, + { 
+ "epoch": 2.7874316244752575, + "ewc_loss": 0.008612756617367268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612756937509403e-05, + "grad_norm": 4.27595853805542, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8765348196029663, + "num_tokens": 836141197.0, + "step": 21912 + }, + { + "epoch": 2.787558834753848, + "ewc_loss": 0.008631588891148567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631588571006432e-05, + "grad_norm": 4.278232097625732, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8939739465713501, + "num_tokens": 836176949.0, + "step": 21913 + }, + { + "epoch": 2.7876860450324386, + "ewc_loss": 0.008644980378448963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.644979970995337e-05, + "grad_norm": 4.3443522453308105, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8799105882644653, + "num_tokens": 836206484.0, + "step": 21914 + }, + { + "epoch": 2.787813255311029, + "ewc_loss": 0.008668359369039536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.668359805596992e-05, + "grad_norm": 4.295053482055664, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8876820802688599, + "num_tokens": 836243056.0, + "step": 21915 + }, + { + "epoch": 2.7879404655896196, + "ewc_loss": 0.008622715249657631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622715540695935e-05, + "grad_norm": 4.249340534210205, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8760568499565125, + "num_tokens": 836281154.0, + "step": 21916 + }, + { + "epoch": 2.78806767586821, + "ewc_loss": 0.008648050017654896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.648049697512761e-05, + "grad_norm": 4.276665210723877, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8804703950881958, + "num_tokens": 836319895.0, + "step": 21917 + }, + { + "epoch": 2.7881948861468007, + "ewc_loss": 0.008663802407681942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.663802145747468e-05, + "grad_norm": 4.284184455871582, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8866922855377197, + "num_tokens": 836357227.0, + "step": 21918 + }, + { + "epoch": 2.788322096425391, + "ewc_loss": 0.008647868409752846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647868526168168e-05, + "grad_norm": 4.2636213302612305, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8764698505401611, + "num_tokens": 836398442.0, + "step": 21919 + }, + { + "epoch": 2.7884493067039817, + "ewc_loss": 0.008645597845315933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.645597699796781e-05, + "grad_norm": 4.2846174240112305, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.889102578163147, + "num_tokens": 836436660.0, + "step": 21920 + }, + { + "epoch": 2.7885765169825723, + "ewc_loss": 0.008649601601064205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.649601659271866e-05, + "grad_norm": 4.268876075744629, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8773760199546814, + "num_tokens": 836479180.0, + "step": 21921 + }, + { + "epoch": 2.788703727261163, + "ewc_loss": 0.008632386103272438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.632386015960947e-05, + "grad_norm": 4.297812461853027, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8874673247337341, + "num_tokens": 836518874.0, + "step": 21922 + }, + { + "epoch": 2.7888309375397533, + "ewc_loss": 0.00864627305418253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.646272908663377e-05, + "grad_norm": 4.240972518920898, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8795733451843262, + "num_tokens": 836556910.0, + "step": 21923 + }, + { + "epoch": 2.788958147818344, + "ewc_loss": 0.008619477972388268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619477739557624e-05, + "grad_norm": 4.234512805938721, + "learning_rate": 1e-06, + "loss": 0.3134, + 
"mean_token_accuracy": 0.8929766416549683, + "num_tokens": 836598151.0, + "step": 21924 + }, + { + "epoch": 2.7890853580969344, + "ewc_loss": 0.0086361700668931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636170241516083e-05, + "grad_norm": 4.2531232833862305, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8751934766769409, + "num_tokens": 836637952.0, + "step": 21925 + }, + { + "epoch": 2.789212568375525, + "ewc_loss": 0.008632594719529152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.632594835944474e-05, + "grad_norm": 4.251466751098633, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8944010734558105, + "num_tokens": 836677138.0, + "step": 21926 + }, + { + "epoch": 2.7893397786541154, + "ewc_loss": 0.00860879197716713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.608792268205434e-05, + "grad_norm": 4.2390971183776855, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8875664472579956, + "num_tokens": 836716880.0, + "step": 21927 + }, + { + "epoch": 2.7894669889327055, + "ewc_loss": 0.008627197705209255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627197530586272e-05, + "grad_norm": 4.248840808868408, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8935366272926331, + "num_tokens": 836752572.0, + "step": 21928 + }, + { + "epoch": 2.7895941992112965, + "ewc_loss": 0.00860358402132988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603584137745202e-05, + "grad_norm": 4.268133640289307, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8789777755737305, + "num_tokens": 836794321.0, + "step": 21929 + }, + { + "epoch": 2.7897214094898866, + "ewc_loss": 0.00861301552504301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613015961600468e-05, + "grad_norm": 4.274485111236572, + "learning_rate": 1e-06, + "loss": 0.286, + "mean_token_accuracy": 0.895381510257721, + "num_tokens": 836829183.0, + "step": 21930 + }, + { + "epoch": 
2.7898486197684775, + "ewc_loss": 0.00859918538480997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.599185093771666e-05, + "grad_norm": 4.341883182525635, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8728381395339966, + "num_tokens": 836864644.0, + "step": 21931 + }, + { + "epoch": 2.7899758300470676, + "ewc_loss": 0.008633039891719818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.633040124550462e-05, + "grad_norm": 4.3048882484436035, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8950011730194092, + "num_tokens": 836895384.0, + "step": 21932 + }, + { + "epoch": 2.7901030403256586, + "ewc_loss": 0.008594812825322151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594812970841303e-05, + "grad_norm": 4.219416618347168, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8853524327278137, + "num_tokens": 836943336.0, + "step": 21933 + }, + { + "epoch": 2.7902302506042487, + "ewc_loss": 0.008549352176487446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549352060072124e-05, + "grad_norm": 4.229568958282471, + "learning_rate": 1e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.8955773115158081, + "num_tokens": 836980708.0, + "step": 21934 + }, + { + "epoch": 2.790357460882839, + "ewc_loss": 0.008589896373450756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5898966062814e-05, + "grad_norm": 4.302201747894287, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8875877261161804, + "num_tokens": 837015738.0, + "step": 21935 + }, + { + "epoch": 2.7904846711614297, + "ewc_loss": 0.00859750621020794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597505802754313e-05, + "grad_norm": 4.276437759399414, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8891908526420593, + "num_tokens": 837053014.0, + "step": 21936 + }, + { + "epoch": 2.7906118814400203, + "ewc_loss": 0.008587270975112915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.587271440774202e-05, + "grad_norm": 4.395516872406006, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8772956132888794, + "num_tokens": 837083579.0, + "step": 21937 + }, + { + "epoch": 2.790739091718611, + "ewc_loss": 0.008646054193377495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.646054629934952e-05, + "grad_norm": 4.221948623657227, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8907815217971802, + "num_tokens": 837126201.0, + "step": 21938 + }, + { + "epoch": 2.7908663019972013, + "ewc_loss": 0.008527620695531368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.527620957465842e-05, + "grad_norm": 4.30805778503418, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8925591707229614, + "num_tokens": 837162400.0, + "step": 21939 + }, + { + "epoch": 2.790993512275792, + "ewc_loss": 0.008637660183012486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.637660357635468e-05, + "grad_norm": 4.22453498840332, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8849865198135376, + "num_tokens": 837205733.0, + "step": 21940 + }, + { + "epoch": 2.7911207225543824, + "ewc_loss": 0.008566259406507015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566259202780202e-05, + "grad_norm": 4.309183597564697, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.873751699924469, + "num_tokens": 837243838.0, + "step": 21941 + }, + { + "epoch": 2.791247932832973, + "ewc_loss": 0.008624376729130745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624376641819254e-05, + "grad_norm": 4.288534641265869, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8789398670196533, + "num_tokens": 837280936.0, + "step": 21942 + }, + { + "epoch": 2.7913751431115634, + "ewc_loss": 0.008577665314078331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577665721531957e-05, + "grad_norm": 4.244942665100098, + "learning_rate": 1e-06, + "loss": 0.3321, + 
"mean_token_accuracy": 0.883010983467102, + "num_tokens": 837321790.0, + "step": 21943 + }, + { + "epoch": 2.791502353390154, + "ewc_loss": 0.00854772049933672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547720790375024e-05, + "grad_norm": 4.263792514801025, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8844780325889587, + "num_tokens": 837362023.0, + "step": 21944 + }, + { + "epoch": 2.7916295636687445, + "ewc_loss": 0.008586816489696503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586816693423316e-05, + "grad_norm": 4.212726593017578, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8885923624038696, + "num_tokens": 837404876.0, + "step": 21945 + }, + { + "epoch": 2.791756773947335, + "ewc_loss": 0.008549883961677551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549883932573721e-05, + "grad_norm": 4.333733558654785, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8782768249511719, + "num_tokens": 837442899.0, + "step": 21946 + }, + { + "epoch": 2.7918839842259255, + "ewc_loss": 0.008625372312963009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625371992820874e-05, + "grad_norm": 4.280975341796875, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8921947479248047, + "num_tokens": 837477611.0, + "step": 21947 + }, + { + "epoch": 2.792011194504516, + "ewc_loss": 0.008546541444957256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546541357645765e-05, + "grad_norm": 4.293530464172363, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8714315891265869, + "num_tokens": 837515928.0, + "step": 21948 + }, + { + "epoch": 2.7921384047831066, + "ewc_loss": 0.00858645886182785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586458716308698e-05, + "grad_norm": 4.283674240112305, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8839869499206543, + "num_tokens": 837553042.0, + "step": 21949 + }, + { + "epoch": 
2.792265615061697, + "ewc_loss": 0.008570130914449692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570130739826709e-05, + "grad_norm": 4.289032459259033, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8891503810882568, + "num_tokens": 837589749.0, + "step": 21950 + }, + { + "epoch": 2.7923928253402877, + "ewc_loss": 0.008573903702199459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573903323849663e-05, + "grad_norm": 4.222815990447998, + "learning_rate": 1e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.9012761116027832, + "num_tokens": 837631104.0, + "step": 21951 + }, + { + "epoch": 2.792520035618878, + "ewc_loss": 0.008537891320884228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.537890971638262e-05, + "grad_norm": 4.321288108825684, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8903099298477173, + "num_tokens": 837668173.0, + "step": 21952 + }, + { + "epoch": 2.7926472458974683, + "ewc_loss": 0.00859797466546297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597975102020428e-05, + "grad_norm": 4.272581577301025, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8883431553840637, + "num_tokens": 837704282.0, + "step": 21953 + }, + { + "epoch": 2.7927744561760592, + "ewc_loss": 0.008555972017347813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555971726309508e-05, + "grad_norm": 4.270781993865967, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8729157447814941, + "num_tokens": 837750790.0, + "step": 21954 + }, + { + "epoch": 2.7929016664546493, + "ewc_loss": 0.008547885343432426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547885227017105e-05, + "grad_norm": 4.267975330352783, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.88114994764328, + "num_tokens": 837792041.0, + "step": 21955 + }, + { + "epoch": 2.7930288767332403, + "ewc_loss": 0.008553062565624714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.553062798455358e-05, + "grad_norm": 4.448273181915283, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8894293308258057, + "num_tokens": 837818645.0, + "step": 21956 + }, + { + "epoch": 2.7931560870118304, + "ewc_loss": 0.008635692298412323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.635692211100832e-05, + "grad_norm": 4.238944053649902, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8914586901664734, + "num_tokens": 837859816.0, + "step": 21957 + }, + { + "epoch": 2.793283297290421, + "ewc_loss": 0.00848400965332985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.484009595122188e-05, + "grad_norm": 4.271100044250488, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8775229454040527, + "num_tokens": 837903970.0, + "step": 21958 + }, + { + "epoch": 2.7934105075690114, + "ewc_loss": 0.008573444560170174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573444210924208e-05, + "grad_norm": 4.331182479858398, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8843579888343811, + "num_tokens": 837941158.0, + "step": 21959 + }, + { + "epoch": 2.793537717847602, + "ewc_loss": 0.008594896644353867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594896644353867e-05, + "grad_norm": 4.312551498413086, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.888253927230835, + "num_tokens": 837978157.0, + "step": 21960 + }, + { + "epoch": 2.7936649281261925, + "ewc_loss": 0.008544143289327621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54414320201613e-05, + "grad_norm": 4.198935508728027, + "learning_rate": 1e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.8938624858856201, + "num_tokens": 838017871.0, + "step": 21961 + }, + { + "epoch": 2.793792138404783, + "ewc_loss": 0.008529533632099628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529533806722611e-05, + "grad_norm": 4.3558478355407715, + "learning_rate": 1e-06, + "loss": 0.3023, + 
"mean_token_accuracy": 0.8932726979255676, + "num_tokens": 838050521.0, + "step": 21962 + }, + { + "epoch": 2.7939193486833735, + "ewc_loss": 0.008637758903205395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.637759310659021e-05, + "grad_norm": 4.281561851501465, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8900015354156494, + "num_tokens": 838084450.0, + "step": 21963 + }, + { + "epoch": 2.794046558961964, + "ewc_loss": 0.008542846888303757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542847353965044e-05, + "grad_norm": 4.323060512542725, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8777759671211243, + "num_tokens": 838121505.0, + "step": 21964 + }, + { + "epoch": 2.7941737692405546, + "ewc_loss": 0.008600597269833088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600597357144579e-05, + "grad_norm": 4.23884391784668, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8810304403305054, + "num_tokens": 838166148.0, + "step": 21965 + }, + { + "epoch": 2.794300979519145, + "ewc_loss": 0.008553460240364075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553460065741092e-05, + "grad_norm": 4.303680419921875, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8884798884391785, + "num_tokens": 838203578.0, + "step": 21966 + }, + { + "epoch": 2.7944281897977357, + "ewc_loss": 0.008622984401881695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622984023531899e-05, + "grad_norm": 4.3028974533081055, + "learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8903898596763611, + "num_tokens": 838240702.0, + "step": 21967 + }, + { + "epoch": 2.794555400076326, + "ewc_loss": 0.00858047790825367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580477879149839e-05, + "grad_norm": 4.28566312789917, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.891873836517334, + "num_tokens": 838275639.0, + "step": 21968 + }, + { + "epoch": 
2.7946826103549167, + "ewc_loss": 0.008588818833231926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.588819036958739e-05, + "grad_norm": 4.321691513061523, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8835418224334717, + "num_tokens": 838310045.0, + "step": 21969 + }, + { + "epoch": 2.7948098206335072, + "ewc_loss": 0.008636211976408958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636211714474484e-05, + "grad_norm": 4.2807207107543945, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8843892812728882, + "num_tokens": 838351571.0, + "step": 21970 + }, + { + "epoch": 2.7949370309120978, + "ewc_loss": 0.008575019426643848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575019455747679e-05, + "grad_norm": 4.289530277252197, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8943719863891602, + "num_tokens": 838393804.0, + "step": 21971 + }, + { + "epoch": 2.7950642411906883, + "ewc_loss": 0.008604110218584538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604110189480707e-05, + "grad_norm": 4.253478527069092, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.882398247718811, + "num_tokens": 838435516.0, + "step": 21972 + }, + { + "epoch": 2.795191451469279, + "ewc_loss": 0.008566761389374733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566761243855581e-05, + "grad_norm": 4.3253278732299805, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8902966976165771, + "num_tokens": 838465425.0, + "step": 21973 + }, + { + "epoch": 2.7953186617478694, + "ewc_loss": 0.008623126894235611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623127359896898e-05, + "grad_norm": 4.2130913734436035, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8826001882553101, + "num_tokens": 838512832.0, + "step": 21974 + }, + { + "epoch": 2.79544587202646, + "ewc_loss": 0.008528945036232471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.528945181751624e-05, + "grad_norm": 4.268853187561035, + "learning_rate": 1e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8887409567832947, + "num_tokens": 838552806.0, + "step": 21975 + }, + { + "epoch": 2.79557308230505, + "ewc_loss": 0.00860328134149313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603281457908452e-05, + "grad_norm": 4.317173480987549, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8816725015640259, + "num_tokens": 838588592.0, + "step": 21976 + }, + { + "epoch": 2.795700292583641, + "ewc_loss": 0.008611551485955715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611551311332732e-05, + "grad_norm": 4.265012741088867, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8812291622161865, + "num_tokens": 838629402.0, + "step": 21977 + }, + { + "epoch": 2.795827502862231, + "ewc_loss": 0.008561980910599232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.561980939703062e-05, + "grad_norm": 4.244585037231445, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8750358819961548, + "num_tokens": 838676580.0, + "step": 21978 + }, + { + "epoch": 2.795954713140822, + "ewc_loss": 0.008565062656998634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565062307752669e-05, + "grad_norm": 4.331938743591309, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.879202663898468, + "num_tokens": 838715773.0, + "step": 21979 + }, + { + "epoch": 2.796081923419412, + "ewc_loss": 0.008616498671472073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616498234914616e-05, + "grad_norm": 4.33220911026001, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8738234639167786, + "num_tokens": 838750270.0, + "step": 21980 + }, + { + "epoch": 2.796209133698003, + "ewc_loss": 0.00857711210846901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577112021157518e-05, + "grad_norm": 4.276249885559082, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 
0.8619261980056763, + "num_tokens": 838794823.0, + "step": 21981 + }, + { + "epoch": 2.796336343976593, + "ewc_loss": 0.008550315164029598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.550315396860242e-05, + "grad_norm": 4.355595588684082, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8755683898925781, + "num_tokens": 838828901.0, + "step": 21982 + }, + { + "epoch": 2.7964635542551837, + "ewc_loss": 0.008636067621409893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636067650513723e-05, + "grad_norm": 4.275223731994629, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8757534027099609, + "num_tokens": 838872648.0, + "step": 21983 + }, + { + "epoch": 2.796590764533774, + "ewc_loss": 0.008558783680200577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55878388392739e-05, + "grad_norm": 4.305327415466309, + "learning_rate": 1e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8949788808822632, + "num_tokens": 838905150.0, + "step": 21984 + }, + { + "epoch": 2.7967179748123647, + "ewc_loss": 0.008616778068244457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616778359282762e-05, + "grad_norm": 4.226328372955322, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8815760016441345, + "num_tokens": 838949027.0, + "step": 21985 + }, + { + "epoch": 2.7968451850909553, + "ewc_loss": 0.008564618416130543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.564618474338204e-05, + "grad_norm": 4.260550022125244, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8848446607589722, + "num_tokens": 838987320.0, + "step": 21986 + }, + { + "epoch": 2.796972395369546, + "ewc_loss": 0.008606553077697754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.606552728451788e-05, + "grad_norm": 4.293920993804932, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8798347115516663, + "num_tokens": 839027396.0, + "step": 21987 + }, + { + "epoch": 2.7970996056481363, + "ewc_loss": 
0.008598264306783676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59826395753771e-05, + "grad_norm": 4.221989154815674, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8893967866897583, + "num_tokens": 839069779.0, + "step": 21988 + }, + { + "epoch": 2.797226815926727, + "ewc_loss": 0.0085622388869524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562239236198366e-05, + "grad_norm": 4.260521411895752, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8893721699714661, + "num_tokens": 839110919.0, + "step": 21989 + }, + { + "epoch": 2.7973540262053174, + "ewc_loss": 0.008607836440205574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60783620737493e-05, + "grad_norm": 4.30167818069458, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8952184915542603, + "num_tokens": 839143689.0, + "step": 21990 + }, + { + "epoch": 2.797481236483908, + "ewc_loss": 0.008594145067036152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594145037932321e-05, + "grad_norm": 4.329991340637207, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.865419864654541, + "num_tokens": 839181274.0, + "step": 21991 + }, + { + "epoch": 2.7976084467624984, + "ewc_loss": 0.00860725436359644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607254130765796e-05, + "grad_norm": 4.3199782371521, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8682910203933716, + "num_tokens": 839217038.0, + "step": 21992 + }, + { + "epoch": 2.797735657041089, + "ewc_loss": 0.008583195507526398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583195449318737e-05, + "grad_norm": 4.2396440505981445, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.882756769657135, + "num_tokens": 839260104.0, + "step": 21993 + }, + { + "epoch": 2.7978628673196795, + "ewc_loss": 0.008546482771635056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546483149984851e-05, + "grad_norm": 4.282171726226807, + 
"learning_rate": 1e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8870199918746948, + "num_tokens": 839294328.0, + "step": 21994 + }, + { + "epoch": 2.79799007759827, + "ewc_loss": 0.008607183583080769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607183553976938e-05, + "grad_norm": 4.335013389587402, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8807551860809326, + "num_tokens": 839326662.0, + "step": 21995 + }, + { + "epoch": 2.7981172878768605, + "ewc_loss": 0.008618480525910854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618480933364481e-05, + "grad_norm": 4.33425235748291, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.882774829864502, + "num_tokens": 839355997.0, + "step": 21996 + }, + { + "epoch": 2.798244498155451, + "ewc_loss": 0.00860598124563694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.605981565779075e-05, + "grad_norm": 4.247707843780518, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.8884260058403015, + "num_tokens": 839396596.0, + "step": 21997 + }, + { + "epoch": 2.7983717084340416, + "ewc_loss": 0.008574595674872398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574595995014533e-05, + "grad_norm": 4.318287372589111, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8845161199569702, + "num_tokens": 839432251.0, + "step": 21998 + }, + { + "epoch": 2.798498918712632, + "ewc_loss": 0.0086580291390419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.658028673380613e-05, + "grad_norm": 4.281083583831787, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8931000828742981, + "num_tokens": 839465987.0, + "step": 21999 + }, + { + "epoch": 2.7986261289912227, + "ewc_loss": 0.008595801889896393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595801773481071e-05, + "grad_norm": 4.282124996185303, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8861621618270874, + "num_tokens": 839506147.0, + 
"step": 22000 + }, + { + "epoch": 2.7987533392698127, + "ewc_loss": 0.00861683301627636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616832928964868e-05, + "grad_norm": 4.2738142013549805, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.874561607837677, + "num_tokens": 839544620.0, + "step": 22001 + }, + { + "epoch": 2.7988805495484037, + "ewc_loss": 0.0086437426507473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.643742330605164e-05, + "grad_norm": 4.382214069366455, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8757050633430481, + "num_tokens": 839583217.0, + "step": 22002 + }, + { + "epoch": 2.799007759826994, + "ewc_loss": 0.008693654090166092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.69365394464694e-05, + "grad_norm": 4.24852180480957, + "learning_rate": 1e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8995084762573242, + "num_tokens": 839621283.0, + "step": 22003 + }, + { + "epoch": 2.7991349701055848, + "ewc_loss": 0.008565151132643223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565151074435562e-05, + "grad_norm": 4.310202598571777, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8786706328392029, + "num_tokens": 839658072.0, + "step": 22004 + }, + { + "epoch": 2.799262180384175, + "ewc_loss": 0.008667094632983208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667094516567886e-05, + "grad_norm": 4.25834846496582, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8658394813537598, + "num_tokens": 839700926.0, + "step": 22005 + }, + { + "epoch": 2.799389390662766, + "ewc_loss": 0.008602285757660866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602286106906831e-05, + "grad_norm": 4.269748210906982, + "learning_rate": 1e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.891546905040741, + "num_tokens": 839736160.0, + "step": 22006 + }, + { + "epoch": 2.799516600941356, + "ewc_loss": 0.008620659820735455, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.620660082669929e-05, + "grad_norm": 4.234358787536621, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.89738929271698, + "num_tokens": 839776834.0, + "step": 22007 + }, + { + "epoch": 2.7996438112199464, + "ewc_loss": 0.008598736487329006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598736167186871e-05, + "grad_norm": 4.262270927429199, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8768001198768616, + "num_tokens": 839818156.0, + "step": 22008 + }, + { + "epoch": 2.799771021498537, + "ewc_loss": 0.008633734658360481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.633734978502616e-05, + "grad_norm": 4.268574237823486, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8779838681221008, + "num_tokens": 839857133.0, + "step": 22009 + }, + { + "epoch": 2.7998982317771275, + "ewc_loss": 0.008625630289316177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625630289316177e-05, + "grad_norm": 4.3263959884643555, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8695382475852966, + "num_tokens": 839893129.0, + "step": 22010 + }, + { + "epoch": 2.800025442055718, + "ewc_loss": 0.008641437627375126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.641438034828752e-05, + "grad_norm": 4.348389148712158, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8880996704101562, + "num_tokens": 839924622.0, + "step": 22011 + }, + { + "epoch": 2.8001526523343085, + "ewc_loss": 0.008639892563223839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6398926214315e-05, + "grad_norm": 4.243130683898926, + "learning_rate": 1e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.9022554159164429, + "num_tokens": 839961276.0, + "step": 22012 + }, + { + "epoch": 2.800279862612899, + "ewc_loss": 0.008581051602959633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581051952205598e-05, + "grad_norm": 4.30304479598999, + "learning_rate": 1e-06, + "loss": 0.3589, 
+ "mean_token_accuracy": 0.8740639686584473, + "num_tokens": 840002554.0, + "step": 22013 + }, + { + "epoch": 2.8004070728914896, + "ewc_loss": 0.008651587180793285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.651587268104777e-05, + "grad_norm": 4.257866859436035, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8812264800071716, + "num_tokens": 840041642.0, + "step": 22014 + }, + { + "epoch": 2.80053428317008, + "ewc_loss": 0.008603864349424839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603864262113348e-05, + "grad_norm": 4.28709077835083, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8908430337905884, + "num_tokens": 840083532.0, + "step": 22015 + }, + { + "epoch": 2.8006614934486707, + "ewc_loss": 0.00860463734716177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604636968811974e-05, + "grad_norm": 4.248743534088135, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8808718919754028, + "num_tokens": 840124221.0, + "step": 22016 + }, + { + "epoch": 2.800788703727261, + "ewc_loss": 0.008569247089326382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569247438572347e-05, + "grad_norm": 4.271142959594727, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8864495158195496, + "num_tokens": 840162309.0, + "step": 22017 + }, + { + "epoch": 2.8009159140058517, + "ewc_loss": 0.008581630885601044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581631118431687e-05, + "grad_norm": 4.306395053863525, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8849993944168091, + "num_tokens": 840194778.0, + "step": 22018 + }, + { + "epoch": 2.8010431242844422, + "ewc_loss": 0.008597167208790779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597167470725253e-05, + "grad_norm": 4.365772724151611, + "learning_rate": 1e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.9008140563964844, + "num_tokens": 840225081.0, + "step": 22019 + }, + { + "epoch": 
2.8011703345630328, + "ewc_loss": 0.008612766861915588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612766396254301e-05, + "grad_norm": 4.248271465301514, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8809859752655029, + "num_tokens": 840267698.0, + "step": 22020 + }, + { + "epoch": 2.8012975448416233, + "ewc_loss": 0.008524728938937187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.524728764314204e-05, + "grad_norm": 4.328869342803955, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8778305053710938, + "num_tokens": 840303255.0, + "step": 22021 + }, + { + "epoch": 2.801424755120214, + "ewc_loss": 0.008609830401837826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609830547356978e-05, + "grad_norm": 4.253767490386963, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8836475014686584, + "num_tokens": 840344616.0, + "step": 22022 + }, + { + "epoch": 2.8015519653988044, + "ewc_loss": 0.008540386334061623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540386625099927e-05, + "grad_norm": 4.283444881439209, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8831215500831604, + "num_tokens": 840383293.0, + "step": 22023 + }, + { + "epoch": 2.801679175677395, + "ewc_loss": 0.008602391928434372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602391608292237e-05, + "grad_norm": 4.288590431213379, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8742862939834595, + "num_tokens": 840424583.0, + "step": 22024 + }, + { + "epoch": 2.8018063859559854, + "ewc_loss": 0.008571685291826725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.571684884373099e-05, + "grad_norm": 4.327070236206055, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8780045509338379, + "num_tokens": 840458752.0, + "step": 22025 + }, + { + "epoch": 2.8019335962345755, + "ewc_loss": 0.008592762984335423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.592762605985627e-05, + "grad_norm": 4.265247821807861, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8885686993598938, + "num_tokens": 840498441.0, + "step": 22026 + }, + { + "epoch": 2.8020608065131665, + "ewc_loss": 0.00853875745087862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538757538190112e-05, + "grad_norm": 4.261605739593506, + "learning_rate": 1e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8928507566452026, + "num_tokens": 840537838.0, + "step": 22027 + }, + { + "epoch": 2.8021880167917566, + "ewc_loss": 0.00857079029083252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570790669182315e-05, + "grad_norm": 4.383065223693848, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8732250332832336, + "num_tokens": 840571770.0, + "step": 22028 + }, + { + "epoch": 2.8023152270703475, + "ewc_loss": 0.008636215701699257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636216080049053e-05, + "grad_norm": 4.331772804260254, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8771107196807861, + "num_tokens": 840607559.0, + "step": 22029 + }, + { + "epoch": 2.8024424373489376, + "ewc_loss": 0.008570999838411808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570999489165843e-05, + "grad_norm": 4.275524616241455, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8676574230194092, + "num_tokens": 840650156.0, + "step": 22030 + }, + { + "epoch": 2.8025696476275286, + "ewc_loss": 0.008568464778363705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.568464545533061e-05, + "grad_norm": 4.326444625854492, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.874330997467041, + "num_tokens": 840686590.0, + "step": 22031 + }, + { + "epoch": 2.8026968579061187, + "ewc_loss": 0.008622854948043823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622855239082128e-05, + "grad_norm": 4.296574592590332, + "learning_rate": 1e-06, + "loss": 0.3664, + 
"mean_token_accuracy": 0.8758598566055298, + "num_tokens": 840724445.0, + "step": 22032 + }, + { + "epoch": 2.802824068184709, + "ewc_loss": 0.00858998205512762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589982462581247e-05, + "grad_norm": 4.242920398712158, + "learning_rate": 1e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.8967083692550659, + "num_tokens": 840765047.0, + "step": 22033 + }, + { + "epoch": 2.8029512784632997, + "ewc_loss": 0.008588580414652824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.588580385548994e-05, + "grad_norm": 4.334450721740723, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8702998161315918, + "num_tokens": 840802375.0, + "step": 22034 + }, + { + "epoch": 2.8030784887418903, + "ewc_loss": 0.008651694282889366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.651694224681705e-05, + "grad_norm": 4.308236598968506, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8821169137954712, + "num_tokens": 840839994.0, + "step": 22035 + }, + { + "epoch": 2.803205699020481, + "ewc_loss": 0.008600742556154728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600742876296863e-05, + "grad_norm": 4.2845540046691895, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8814623355865479, + "num_tokens": 840877144.0, + "step": 22036 + }, + { + "epoch": 2.8033329092990713, + "ewc_loss": 0.00860754307359457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607542986283079e-05, + "grad_norm": 4.262840270996094, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8744755387306213, + "num_tokens": 840918478.0, + "step": 22037 + }, + { + "epoch": 2.803460119577662, + "ewc_loss": 0.008605924434959888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.605924085713923e-05, + "grad_norm": 4.263975143432617, + "learning_rate": 1e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8906669616699219, + "num_tokens": 840958484.0, + "step": 22038 + }, + { + "epoch": 
2.8035873298562524, + "ewc_loss": 0.008614805527031422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614805847173557e-05, + "grad_norm": 4.271198272705078, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8841376900672913, + "num_tokens": 841000035.0, + "step": 22039 + }, + { + "epoch": 2.803714540134843, + "ewc_loss": 0.008614440448582172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614440594101325e-05, + "grad_norm": 4.316431045532227, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8780232071876526, + "num_tokens": 841035836.0, + "step": 22040 + }, + { + "epoch": 2.8038417504134334, + "ewc_loss": 0.0086273904889822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627390343463048e-05, + "grad_norm": 4.2958807945251465, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.883621335029602, + "num_tokens": 841069295.0, + "step": 22041 + }, + { + "epoch": 2.803968960692024, + "ewc_loss": 0.008613909594714642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613909449195489e-05, + "grad_norm": 4.2780866622924805, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8796263933181763, + "num_tokens": 841108341.0, + "step": 22042 + }, + { + "epoch": 2.8040961709706145, + "ewc_loss": 0.008611164055764675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611164230387658e-05, + "grad_norm": 4.2789411544799805, + "learning_rate": 1e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8927585482597351, + "num_tokens": 841142945.0, + "step": 22043 + }, + { + "epoch": 2.804223381249205, + "ewc_loss": 0.00864025205373764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64025205373764e-05, + "grad_norm": 4.330657482147217, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8955197334289551, + "num_tokens": 841174137.0, + "step": 22044 + }, + { + "epoch": 2.8043505915277955, + "ewc_loss": 0.008647039532661438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.647039067000151e-05, + "grad_norm": 4.2591094970703125, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8914473056793213, + "num_tokens": 841215914.0, + "step": 22045 + }, + { + "epoch": 2.804477801806386, + "ewc_loss": 0.008593123406171799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59312349348329e-05, + "grad_norm": 4.284485340118408, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8839753866195679, + "num_tokens": 841256097.0, + "step": 22046 + }, + { + "epoch": 2.8046050120849766, + "ewc_loss": 0.008616253733634949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616253762738779e-05, + "grad_norm": 4.314496040344238, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8871016502380371, + "num_tokens": 841288997.0, + "step": 22047 + }, + { + "epoch": 2.804732222363567, + "ewc_loss": 0.008627460338175297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627460192656144e-05, + "grad_norm": 4.288821697235107, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8790214657783508, + "num_tokens": 841328053.0, + "step": 22048 + }, + { + "epoch": 2.8048594326421576, + "ewc_loss": 0.008584626950323582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.584626630181447e-05, + "grad_norm": 4.236987113952637, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8986735343933105, + "num_tokens": 841366848.0, + "step": 22049 + }, + { + "epoch": 2.804986642920748, + "ewc_loss": 0.008589550852775574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589550998294726e-05, + "grad_norm": 4.322340965270996, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8919109106063843, + "num_tokens": 841398568.0, + "step": 22050 + }, + { + "epoch": 2.8051138531993383, + "ewc_loss": 0.008640628308057785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.640628220746294e-05, + "grad_norm": 4.29666805267334, + "learning_rate": 1e-06, + "loss": 0.3372, + 
"mean_token_accuracy": 0.8818328380584717, + "num_tokens": 841435722.0, + "step": 22051 + }, + { + "epoch": 2.8052410634779292, + "ewc_loss": 0.008591040037572384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.591039659222588e-05, + "grad_norm": 4.31171989440918, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8798250555992126, + "num_tokens": 841473635.0, + "step": 22052 + }, + { + "epoch": 2.8053682737565193, + "ewc_loss": 0.008615436032414436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615435945102945e-05, + "grad_norm": 4.297196388244629, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8866939544677734, + "num_tokens": 841512383.0, + "step": 22053 + }, + { + "epoch": 2.8054954840351103, + "ewc_loss": 0.008608251810073853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6082516645547e-05, + "grad_norm": 4.369412422180176, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.879450261592865, + "num_tokens": 841543174.0, + "step": 22054 + }, + { + "epoch": 2.8056226943137004, + "ewc_loss": 0.008661244995892048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.661245374241844e-05, + "grad_norm": 4.259727478027344, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.888477087020874, + "num_tokens": 841582185.0, + "step": 22055 + }, + { + "epoch": 2.805749904592291, + "ewc_loss": 0.008558513596653938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558513945899904e-05, + "grad_norm": 4.249427318572998, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8781884908676147, + "num_tokens": 841623179.0, + "step": 22056 + }, + { + "epoch": 2.8058771148708814, + "ewc_loss": 0.008601048029959202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601048466516659e-05, + "grad_norm": 4.380823612213135, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8900350332260132, + "num_tokens": 841661062.0, + "step": 22057 + }, + { + "epoch": 
2.806004325149472, + "ewc_loss": 0.008676283992826939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6762840510346e-05, + "grad_norm": 4.258759498596191, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8830277919769287, + "num_tokens": 841704259.0, + "step": 22058 + }, + { + "epoch": 2.8061315354280625, + "ewc_loss": 0.00854762364178896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547623292542994e-05, + "grad_norm": 4.246328830718994, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8815054893493652, + "num_tokens": 841748396.0, + "step": 22059 + }, + { + "epoch": 2.806258745706653, + "ewc_loss": 0.00859464704990387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5946470790077e-05, + "grad_norm": 4.239444732666016, + "learning_rate": 1e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.9016039967536926, + "num_tokens": 841787809.0, + "step": 22060 + }, + { + "epoch": 2.8063859559852435, + "ewc_loss": 0.008588012307882309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.588012133259326e-05, + "grad_norm": 4.341437339782715, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8824700713157654, + "num_tokens": 841824046.0, + "step": 22061 + }, + { + "epoch": 2.806513166263834, + "ewc_loss": 0.008638983592391014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.638983854325488e-05, + "grad_norm": 4.280017375946045, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8927063345909119, + "num_tokens": 841856949.0, + "step": 22062 + }, + { + "epoch": 2.8066403765424246, + "ewc_loss": 0.0085632698610425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.563269511796534e-05, + "grad_norm": 4.321201324462891, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8692030310630798, + "num_tokens": 841898644.0, + "step": 22063 + }, + { + "epoch": 2.806767586821015, + "ewc_loss": 0.00861338060349226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613380487076938e-05, + 
"grad_norm": 4.307032108306885, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8855302929878235, + "num_tokens": 841934795.0, + "step": 22064 + }, + { + "epoch": 2.8068947970996057, + "ewc_loss": 0.00859004259109497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590042853029445e-05, + "grad_norm": 4.298560619354248, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8687564134597778, + "num_tokens": 841976878.0, + "step": 22065 + }, + { + "epoch": 2.807022007378196, + "ewc_loss": 0.008575852029025555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575851825298741e-05, + "grad_norm": 4.286839962005615, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8829976320266724, + "num_tokens": 842010998.0, + "step": 22066 + }, + { + "epoch": 2.8071492176567867, + "ewc_loss": 0.008579801768064499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579801942687482e-05, + "grad_norm": 4.221096992492676, + "learning_rate": 1e-06, + "loss": 0.2661, + "mean_token_accuracy": 0.9059286117553711, + "num_tokens": 842051410.0, + "step": 22067 + }, + { + "epoch": 2.8072764279353772, + "ewc_loss": 0.00853655394166708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536553650628775e-05, + "grad_norm": 4.3020124435424805, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8848622441291809, + "num_tokens": 842087954.0, + "step": 22068 + }, + { + "epoch": 2.8074036382139678, + "ewc_loss": 0.008634492754936218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.634492405690253e-05, + "grad_norm": 4.307920455932617, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8818153142929077, + "num_tokens": 842128326.0, + "step": 22069 + }, + { + "epoch": 2.8075308484925583, + "ewc_loss": 0.0085728345438838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57283448567614e-05, + "grad_norm": 4.295313358306885, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8638728857040405, 
+ "num_tokens": 842171143.0, + "step": 22070 + }, + { + "epoch": 2.807658058771149, + "ewc_loss": 0.008567850105464458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.567850454710424e-05, + "grad_norm": 4.264612197875977, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8711785078048706, + "num_tokens": 842212167.0, + "step": 22071 + }, + { + "epoch": 2.8077852690497394, + "ewc_loss": 0.008551377803087234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551377686671913e-05, + "grad_norm": 4.27400016784668, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8742347955703735, + "num_tokens": 842252576.0, + "step": 22072 + }, + { + "epoch": 2.80791247932833, + "ewc_loss": 0.008576461113989353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57646155054681e-05, + "grad_norm": 4.31239128112793, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8804254531860352, + "num_tokens": 842287812.0, + "step": 22073 + }, + { + "epoch": 2.80803968960692, + "ewc_loss": 0.008572258986532688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572258957428858e-05, + "grad_norm": 4.256690979003906, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8787548542022705, + "num_tokens": 842328289.0, + "step": 22074 + }, + { + "epoch": 2.808166899885511, + "ewc_loss": 0.008545974269509315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545973832951859e-05, + "grad_norm": 4.364250659942627, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.88734370470047, + "num_tokens": 842359182.0, + "step": 22075 + }, + { + "epoch": 2.808294110164101, + "ewc_loss": 0.008609839715063572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609840006101876e-05, + "grad_norm": 4.245273113250732, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8721904754638672, + "num_tokens": 842399820.0, + "step": 22076 + }, + { + "epoch": 2.808421320442692, + "ewc_loss": 0.008523675613105297, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523675933247432e-05, + "grad_norm": 4.31477165222168, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8826889991760254, + "num_tokens": 842437852.0, + "step": 22077 + }, + { + "epoch": 2.808548530721282, + "ewc_loss": 0.008617879822850227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.617879939265549e-05, + "grad_norm": 4.345420837402344, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8697258234024048, + "num_tokens": 842476729.0, + "step": 22078 + }, + { + "epoch": 2.808675740999873, + "ewc_loss": 0.008612456731498241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612456440459937e-05, + "grad_norm": 4.273708820343018, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8714509010314941, + "num_tokens": 842514590.0, + "step": 22079 + }, + { + "epoch": 2.808802951278463, + "ewc_loss": 0.008572284132242203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572284423280507e-05, + "grad_norm": 4.2418694496154785, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8801866769790649, + "num_tokens": 842556502.0, + "step": 22080 + }, + { + "epoch": 2.8089301615570537, + "ewc_loss": 0.00857065711170435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570657519157976e-05, + "grad_norm": 4.302402496337891, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8702683448791504, + "num_tokens": 842594299.0, + "step": 22081 + }, + { + "epoch": 2.809057371835644, + "ewc_loss": 0.008650963194668293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.65096299094148e-05, + "grad_norm": 4.282284259796143, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8783578276634216, + "num_tokens": 842633189.0, + "step": 22082 + }, + { + "epoch": 2.8091845821142347, + "ewc_loss": 0.008596515282988548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.596514817327261e-05, + "grad_norm": 4.265131950378418, + "learning_rate": 
1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.884623110294342, + "num_tokens": 842673626.0, + "step": 22083 + }, + { + "epoch": 2.8093117923928252, + "ewc_loss": 0.008588776923716068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.588776836404577e-05, + "grad_norm": 4.273090362548828, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8771660327911377, + "num_tokens": 842712574.0, + "step": 22084 + }, + { + "epoch": 2.8094390026714158, + "ewc_loss": 0.00860570464283228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.605704351793975e-05, + "grad_norm": 4.293999195098877, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8802053928375244, + "num_tokens": 842749534.0, + "step": 22085 + }, + { + "epoch": 2.8095662129500063, + "ewc_loss": 0.008611426688730717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61142689245753e-05, + "grad_norm": 4.332828998565674, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8738903999328613, + "num_tokens": 842784989.0, + "step": 22086 + }, + { + "epoch": 2.809693423228597, + "ewc_loss": 0.008636350743472576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636350685264915e-05, + "grad_norm": 4.264018535614014, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.884769082069397, + "num_tokens": 842823858.0, + "step": 22087 + }, + { + "epoch": 2.8098206335071874, + "ewc_loss": 0.008596311323344707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.596311090514064e-05, + "grad_norm": 4.339676856994629, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.883015513420105, + "num_tokens": 842858849.0, + "step": 22088 + }, + { + "epoch": 2.809947843785778, + "ewc_loss": 0.00866021029651165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.660210733069107e-05, + "grad_norm": 4.295537948608398, + "learning_rate": 1e-06, + "loss": 0.2831, + "mean_token_accuracy": 0.900918185710907, + "num_tokens": 842895809.0, + "step": 22089 + }, + { + 
"epoch": 2.8100750540643684, + "ewc_loss": 0.008613014593720436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613014506408945e-05, + "grad_norm": 4.2318644523620605, + "learning_rate": 1e-06, + "loss": 0.2614, + "mean_token_accuracy": 0.9077296257019043, + "num_tokens": 842933650.0, + "step": 22090 + }, + { + "epoch": 2.810202264342959, + "ewc_loss": 0.00859485287219286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594852988608181e-05, + "grad_norm": 4.3538312911987305, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8862533569335938, + "num_tokens": 842967200.0, + "step": 22091 + }, + { + "epoch": 2.8103294746215495, + "ewc_loss": 0.008658202365040779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.658202568767592e-05, + "grad_norm": 4.290815353393555, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8747986555099487, + "num_tokens": 843002498.0, + "step": 22092 + }, + { + "epoch": 2.81045668490014, + "ewc_loss": 0.008577811531722546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577811968280002e-05, + "grad_norm": 4.245075225830078, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.887204647064209, + "num_tokens": 843042627.0, + "step": 22093 + }, + { + "epoch": 2.8105838951787305, + "ewc_loss": 0.008610052987933159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610053191659972e-05, + "grad_norm": 4.3524956703186035, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8801828622817993, + "num_tokens": 843076862.0, + "step": 22094 + }, + { + "epoch": 2.810711105457321, + "ewc_loss": 0.008653176948428154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.653177064843476e-05, + "grad_norm": 4.289947509765625, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8838292956352234, + "num_tokens": 843115000.0, + "step": 22095 + }, + { + "epoch": 2.8108383157359116, + "ewc_loss": 0.008602248504757881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.602248271927238e-05, + "grad_norm": 4.299654483795166, + "learning_rate": 1e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8894292712211609, + "num_tokens": 843152408.0, + "step": 22096 + }, + { + "epoch": 2.810965526014502, + "ewc_loss": 0.008628785610198975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.628785144537687e-05, + "grad_norm": 4.286043167114258, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8841854929924011, + "num_tokens": 843189269.0, + "step": 22097 + }, + { + "epoch": 2.8110927362930926, + "ewc_loss": 0.00860380195081234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603801688877866e-05, + "grad_norm": 4.286862850189209, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8790398240089417, + "num_tokens": 843227148.0, + "step": 22098 + }, + { + "epoch": 2.8112199465716827, + "ewc_loss": 0.008623020723462105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623021130915731e-05, + "grad_norm": 4.345125198364258, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8753860592842102, + "num_tokens": 843259736.0, + "step": 22099 + }, + { + "epoch": 2.8113471568502737, + "ewc_loss": 0.008671047165989876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67104681674391e-05, + "grad_norm": 4.2626776695251465, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.879319965839386, + "num_tokens": 843302598.0, + "step": 22100 + }, + { + "epoch": 2.811474367128864, + "ewc_loss": 0.008570405654609203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570405771024525e-05, + "grad_norm": 4.289324760437012, + "learning_rate": 1e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8972873091697693, + "num_tokens": 843337615.0, + "step": 22101 + }, + { + "epoch": 2.8116015774074548, + "ewc_loss": 0.008655702695250511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.655702549731359e-05, + "grad_norm": 4.222212791442871, + "learning_rate": 1e-06, + "loss": 0.3004, + 
"mean_token_accuracy": 0.8964844942092896, + "num_tokens": 843378845.0, + "step": 22102 + }, + { + "epoch": 2.811728787686045, + "ewc_loss": 0.008598417043685913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598416752647609e-05, + "grad_norm": 4.344762802124023, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8760055303573608, + "num_tokens": 843414128.0, + "step": 22103 + }, + { + "epoch": 2.811855997964636, + "ewc_loss": 0.008661379106342793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.661379251861945e-05, + "grad_norm": 4.223997592926025, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.896188497543335, + "num_tokens": 843454255.0, + "step": 22104 + }, + { + "epoch": 2.811983208243226, + "ewc_loss": 0.008572124876081944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572125079808757e-05, + "grad_norm": 4.261032581329346, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8854290843009949, + "num_tokens": 843498332.0, + "step": 22105 + }, + { + "epoch": 2.8121104185218164, + "ewc_loss": 0.008624659851193428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624659676570445e-05, + "grad_norm": 4.3125786781311035, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8740643262863159, + "num_tokens": 843539196.0, + "step": 22106 + }, + { + "epoch": 2.812237628800407, + "ewc_loss": 0.008634958416223526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.634958794573322e-05, + "grad_norm": 4.312801361083984, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8868385553359985, + "num_tokens": 843573767.0, + "step": 22107 + }, + { + "epoch": 2.8123648390789975, + "ewc_loss": 0.008589780889451504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589780918555334e-05, + "grad_norm": 4.307503700256348, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8711430430412292, + "num_tokens": 843611156.0, + "step": 22108 + }, + { + "epoch": 
2.812492049357588, + "ewc_loss": 0.008593395352363586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59339561429806e-05, + "grad_norm": 4.309397220611572, + "learning_rate": 1e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8898268938064575, + "num_tokens": 843644026.0, + "step": 22109 + }, + { + "epoch": 2.8126192596361785, + "ewc_loss": 0.00860578939318657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60578948049806e-05, + "grad_norm": 4.304090976715088, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.885819673538208, + "num_tokens": 843683477.0, + "step": 22110 + }, + { + "epoch": 2.812746469914769, + "ewc_loss": 0.008585121482610703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585121395299211e-05, + "grad_norm": 4.292926788330078, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8709714412689209, + "num_tokens": 843722815.0, + "step": 22111 + }, + { + "epoch": 2.8128736801933596, + "ewc_loss": 0.008576291613280773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576291293138638e-05, + "grad_norm": 4.263491153717041, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8846081495285034, + "num_tokens": 843763843.0, + "step": 22112 + }, + { + "epoch": 2.81300089047195, + "ewc_loss": 0.00858999602496624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589996286900714e-05, + "grad_norm": 4.302500247955322, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.875636100769043, + "num_tokens": 843805327.0, + "step": 22113 + }, + { + "epoch": 2.8131281007505406, + "ewc_loss": 0.008610477671027184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61047810758464e-05, + "grad_norm": 4.244804859161377, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8827018141746521, + "num_tokens": 843850132.0, + "step": 22114 + }, + { + "epoch": 2.813255311029131, + "ewc_loss": 0.008578653447329998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578653068980202e-05, + 
"grad_norm": 4.341419219970703, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8824607133865356, + "num_tokens": 843887425.0, + "step": 22115 + }, + { + "epoch": 2.8133825213077217, + "ewc_loss": 0.008629653602838516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.629653893876821e-05, + "grad_norm": 4.301363468170166, + "learning_rate": 1e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8924864530563354, + "num_tokens": 843920728.0, + "step": 22116 + }, + { + "epoch": 2.8135097315863122, + "ewc_loss": 0.008568570017814636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.568570046918467e-05, + "grad_norm": 4.308346748352051, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8824379444122314, + "num_tokens": 843959733.0, + "step": 22117 + }, + { + "epoch": 2.8136369418649028, + "ewc_loss": 0.008578423410654068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578423876315355e-05, + "grad_norm": 4.244555950164795, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8812415599822998, + "num_tokens": 844005616.0, + "step": 22118 + }, + { + "epoch": 2.8137641521434933, + "ewc_loss": 0.00852980837225914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.529808110324666e-05, + "grad_norm": 4.317393779754639, + "learning_rate": 1e-06, + "loss": 0.2838, + "mean_token_accuracy": 0.8975831270217896, + "num_tokens": 844034216.0, + "step": 22119 + }, + { + "epoch": 2.813891362422084, + "ewc_loss": 0.008615334518253803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615334081696346e-05, + "grad_norm": 4.309354305267334, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8801077604293823, + "num_tokens": 844071173.0, + "step": 22120 + }, + { + "epoch": 2.8140185727006743, + "ewc_loss": 0.00856443215161562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.56443220982328e-05, + "grad_norm": 4.28416633605957, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8799012899398804, 
+ "num_tokens": 844113665.0, + "step": 22121 + }, + { + "epoch": 2.814145782979265, + "ewc_loss": 0.008558020927011967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558021363569424e-05, + "grad_norm": 4.307499885559082, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8845458030700684, + "num_tokens": 844154063.0, + "step": 22122 + }, + { + "epoch": 2.8142729932578554, + "ewc_loss": 0.008583423681557178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583423914387822e-05, + "grad_norm": 4.321608543395996, + "learning_rate": 1e-06, + "loss": 0.288, + "mean_token_accuracy": 0.8990044593811035, + "num_tokens": 844185634.0, + "step": 22123 + }, + { + "epoch": 2.8144002035364455, + "ewc_loss": 0.00858643651008606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586436160840094e-05, + "grad_norm": 4.3383636474609375, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.871475100517273, + "num_tokens": 844221755.0, + "step": 22124 + }, + { + "epoch": 2.8145274138150365, + "ewc_loss": 0.00856006983667612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560069545637816e-05, + "grad_norm": 4.28438138961792, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8880922794342041, + "num_tokens": 844256230.0, + "step": 22125 + }, + { + "epoch": 2.8146546240936265, + "ewc_loss": 0.00853565614670515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.535655797459185e-05, + "grad_norm": 4.393809795379639, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8764870762825012, + "num_tokens": 844292823.0, + "step": 22126 + }, + { + "epoch": 2.8147818343722175, + "ewc_loss": 0.008628549054265022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.628549403510988e-05, + "grad_norm": 4.35479211807251, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8616428971290588, + "num_tokens": 844331843.0, + "step": 22127 + }, + { + "epoch": 2.8149090446508076, + "ewc_loss": 0.008569158613681793, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569158671889454e-05, + "grad_norm": 4.247345447540283, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8925763368606567, + "num_tokens": 844375823.0, + "step": 22128 + }, + { + "epoch": 2.815036254929398, + "ewc_loss": 0.0085227619856596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52276207297109e-05, + "grad_norm": 4.259113788604736, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8829724788665771, + "num_tokens": 844415187.0, + "step": 22129 + }, + { + "epoch": 2.8151634652079887, + "ewc_loss": 0.008565092459321022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565092139178887e-05, + "grad_norm": 4.283906936645508, + "learning_rate": 1e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8858942985534668, + "num_tokens": 844454986.0, + "step": 22130 + }, + { + "epoch": 2.815290675486579, + "ewc_loss": 0.008558536879718304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558536501368508e-05, + "grad_norm": 4.270155906677246, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8892226219177246, + "num_tokens": 844495514.0, + "step": 22131 + }, + { + "epoch": 2.8154178857651697, + "ewc_loss": 0.008564983494579792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.564983727410436e-05, + "grad_norm": 4.309834003448486, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8830001354217529, + "num_tokens": 844532406.0, + "step": 22132 + }, + { + "epoch": 2.8155450960437602, + "ewc_loss": 0.008582967333495617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582966984249651e-05, + "grad_norm": 4.2829508781433105, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8809558749198914, + "num_tokens": 844569561.0, + "step": 22133 + }, + { + "epoch": 2.8156723063223508, + "ewc_loss": 0.008559481240808964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55948164826259e-05, + "grad_norm": 4.2537360191345215, + "learning_rate": 
1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8890869617462158, + "num_tokens": 844613504.0, + "step": 22134 + }, + { + "epoch": 2.8157995166009413, + "ewc_loss": 0.008578055538237095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578055712860078e-05, + "grad_norm": 4.3178229331970215, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8824465870857239, + "num_tokens": 844649255.0, + "step": 22135 + }, + { + "epoch": 2.815926726879532, + "ewc_loss": 0.008613526821136475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613526733824983e-05, + "grad_norm": 4.273140907287598, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8847861886024475, + "num_tokens": 844687202.0, + "step": 22136 + }, + { + "epoch": 2.8160539371581224, + "ewc_loss": 0.008571570739150047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.571570651838556e-05, + "grad_norm": 4.349433898925781, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8836653232574463, + "num_tokens": 844723096.0, + "step": 22137 + }, + { + "epoch": 2.816181147436713, + "ewc_loss": 0.008620323613286018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.620323933428153e-05, + "grad_norm": 4.2739996910095215, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8867698907852173, + "num_tokens": 844759754.0, + "step": 22138 + }, + { + "epoch": 2.8163083577153034, + "ewc_loss": 0.008577834814786911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577834523748606e-05, + "grad_norm": 4.298090934753418, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8867219090461731, + "num_tokens": 844798372.0, + "step": 22139 + }, + { + "epoch": 2.816435567993894, + "ewc_loss": 0.008590718731284142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590718789491802e-05, + "grad_norm": 4.299504280090332, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8770531415939331, + "num_tokens": 844832738.0, + "step": 22140 + 
}, + { + "epoch": 2.8165627782724845, + "ewc_loss": 0.008609014563262463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609014184912667e-05, + "grad_norm": 4.27419900894165, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8844703435897827, + "num_tokens": 844871684.0, + "step": 22141 + }, + { + "epoch": 2.816689988551075, + "ewc_loss": 0.008585372939705849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585373143432662e-05, + "grad_norm": 4.254401683807373, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8756313323974609, + "num_tokens": 844912140.0, + "step": 22142 + }, + { + "epoch": 2.8168171988296655, + "ewc_loss": 0.008602111600339413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602111483924091e-05, + "grad_norm": 4.337032794952393, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8726606369018555, + "num_tokens": 844948256.0, + "step": 22143 + }, + { + "epoch": 2.816944409108256, + "ewc_loss": 0.008638615719974041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63861569087021e-05, + "grad_norm": 4.270779609680176, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8980745673179626, + "num_tokens": 844983821.0, + "step": 22144 + }, + { + "epoch": 2.8170716193868466, + "ewc_loss": 0.008586608804762363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58660860103555e-05, + "grad_norm": 4.296666622161865, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8877204656600952, + "num_tokens": 845020622.0, + "step": 22145 + }, + { + "epoch": 2.817198829665437, + "ewc_loss": 0.008610682561993599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610682561993599e-05, + "grad_norm": 4.295377254486084, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8745994567871094, + "num_tokens": 845056748.0, + "step": 22146 + }, + { + "epoch": 2.8173260399440276, + "ewc_loss": 0.008617240004241467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.617240382591262e-05, + "grad_norm": 4.317332744598389, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.86618971824646, + "num_tokens": 845096294.0, + "step": 22147 + }, + { + "epoch": 2.817453250222618, + "ewc_loss": 0.008641602471470833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.641602471470833e-05, + "grad_norm": 4.2278032302856445, + "learning_rate": 1e-06, + "loss": 0.288, + "mean_token_accuracy": 0.8968930840492249, + "num_tokens": 845135132.0, + "step": 22148 + }, + { + "epoch": 2.8175804605012083, + "ewc_loss": 0.008569848723709583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569848432671279e-05, + "grad_norm": 4.276232719421387, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8908962607383728, + "num_tokens": 845172164.0, + "step": 22149 + }, + { + "epoch": 2.8177076707797992, + "ewc_loss": 0.008631548844277859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631548553239554e-05, + "grad_norm": 4.288957595825195, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8853422403335571, + "num_tokens": 845210270.0, + "step": 22150 + }, + { + "epoch": 2.8178348810583893, + "ewc_loss": 0.008623667061328888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623667235951871e-05, + "grad_norm": 4.272736549377441, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8870276212692261, + "num_tokens": 845251863.0, + "step": 22151 + }, + { + "epoch": 2.8179620913369803, + "ewc_loss": 0.008612403646111488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612403325969353e-05, + "grad_norm": 4.265226364135742, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8885073661804199, + "num_tokens": 845290262.0, + "step": 22152 + }, + { + "epoch": 2.8180893016155704, + "ewc_loss": 0.008601341396570206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601340960012749e-05, + "grad_norm": 4.65463924407959, + "learning_rate": 1e-06, + "loss": 0.3206, + 
"mean_token_accuracy": 0.8903322219848633, + "num_tokens": 845326894.0, + "step": 22153 + }, + { + "epoch": 2.818216511894161, + "ewc_loss": 0.008845148608088493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.845148840919137e-05, + "grad_norm": 4.363582611083984, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8913174867630005, + "num_tokens": 845367545.0, + "step": 22154 + }, + { + "epoch": 2.8183437221727514, + "ewc_loss": 0.008562827482819557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562827133573592e-05, + "grad_norm": 4.306373596191406, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.872297465801239, + "num_tokens": 845400920.0, + "step": 22155 + }, + { + "epoch": 2.818470932451342, + "ewc_loss": 0.008672194555401802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.672194962855428e-05, + "grad_norm": 4.306516647338867, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.879447340965271, + "num_tokens": 845436279.0, + "step": 22156 + }, + { + "epoch": 2.8185981427299325, + "ewc_loss": 0.00869761686772108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.697616431163624e-05, + "grad_norm": 4.241568088531494, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8873264789581299, + "num_tokens": 845476308.0, + "step": 22157 + }, + { + "epoch": 2.818725353008523, + "ewc_loss": 0.008671442046761513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67144190124236e-05, + "grad_norm": 4.277121543884277, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8804289698600769, + "num_tokens": 845516624.0, + "step": 22158 + }, + { + "epoch": 2.8188525632871135, + "ewc_loss": 0.008728302083909512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.728302054805681e-05, + "grad_norm": 4.235603332519531, + "learning_rate": 1e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.8997019529342651, + "num_tokens": 845552385.0, + "step": 22159 + }, + { + "epoch": 
2.818979773565704, + "ewc_loss": 0.008675781078636646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67578128236346e-05, + "grad_norm": 4.25242805480957, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8914765119552612, + "num_tokens": 845592600.0, + "step": 22160 + }, + { + "epoch": 2.8191069838442946, + "ewc_loss": 0.008732982911169529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.732982678338885e-05, + "grad_norm": 4.287164211273193, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8905781507492065, + "num_tokens": 845630028.0, + "step": 22161 + }, + { + "epoch": 2.819234194122885, + "ewc_loss": 0.00872605200856924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.726052328711376e-05, + "grad_norm": 4.30933952331543, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.879440426826477, + "num_tokens": 845665874.0, + "step": 22162 + }, + { + "epoch": 2.8193614044014756, + "ewc_loss": 0.00873392540961504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.733925642445683e-05, + "grad_norm": 4.3122944831848145, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8741103410720825, + "num_tokens": 845702738.0, + "step": 22163 + }, + { + "epoch": 2.819488614680066, + "ewc_loss": 0.008742210455238819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.742210775380954e-05, + "grad_norm": 4.264492988586426, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8866047859191895, + "num_tokens": 845744785.0, + "step": 22164 + }, + { + "epoch": 2.8196158249586567, + "ewc_loss": 0.008706391789019108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.70639196364209e-05, + "grad_norm": 4.282349586486816, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8830475807189941, + "num_tokens": 845779196.0, + "step": 22165 + }, + { + "epoch": 2.8197430352372472, + "ewc_loss": 0.00872178003191948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.721779886400327e-05, + 
"grad_norm": 4.280405521392822, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8743929862976074, + "num_tokens": 845819016.0, + "step": 22166 + }, + { + "epoch": 2.8198702455158378, + "ewc_loss": 0.008713964372873306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.713964780326933e-05, + "grad_norm": 4.2801594734191895, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8854604959487915, + "num_tokens": 845855014.0, + "step": 22167 + }, + { + "epoch": 2.8199974557944283, + "ewc_loss": 0.008692731149494648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.692731353221461e-05, + "grad_norm": 4.248149394989014, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8908686637878418, + "num_tokens": 845895491.0, + "step": 22168 + }, + { + "epoch": 2.820124666073019, + "ewc_loss": 0.008667455054819584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667454676469788e-05, + "grad_norm": 4.327491283416748, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8750215768814087, + "num_tokens": 845931560.0, + "step": 22169 + }, + { + "epoch": 2.8202518763516093, + "ewc_loss": 0.008719096891582012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.719096513232216e-05, + "grad_norm": 4.309450149536133, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8838179707527161, + "num_tokens": 845966701.0, + "step": 22170 + }, + { + "epoch": 2.8203790866302, + "ewc_loss": 0.008676447905600071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.676447760080919e-05, + "grad_norm": 4.290221691131592, + "learning_rate": 1e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8989866375923157, + "num_tokens": 846002327.0, + "step": 22171 + }, + { + "epoch": 2.82050629690879, + "ewc_loss": 0.008658144623041153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.65814508870244e-05, + "grad_norm": 4.259466171264648, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8847408294677734, + 
"num_tokens": 846042157.0, + "step": 22172 + }, + { + "epoch": 2.820633507187381, + "ewc_loss": 0.008648858405649662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.648858056403697e-05, + "grad_norm": 4.283239364624023, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8690222501754761, + "num_tokens": 846082851.0, + "step": 22173 + }, + { + "epoch": 2.820760717465971, + "ewc_loss": 0.008662715554237366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.662715117679909e-05, + "grad_norm": 4.329287052154541, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8822553753852844, + "num_tokens": 846116846.0, + "step": 22174 + }, + { + "epoch": 2.820887927744562, + "ewc_loss": 0.008672777563333511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.672777767060325e-05, + "grad_norm": 4.277064323425293, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8842324018478394, + "num_tokens": 846154373.0, + "step": 22175 + }, + { + "epoch": 2.821015138023152, + "ewc_loss": 0.008626813068985939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.626812632428482e-05, + "grad_norm": 4.2287492752075195, + "learning_rate": 1e-06, + "loss": 0.28, + "mean_token_accuracy": 0.9011884331703186, + "num_tokens": 846195329.0, + "step": 22176 + }, + { + "epoch": 2.821142348301743, + "ewc_loss": 0.008616657927632332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616657578386366e-05, + "grad_norm": 4.271098613739014, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8928946256637573, + "num_tokens": 846233078.0, + "step": 22177 + }, + { + "epoch": 2.821269558580333, + "ewc_loss": 0.008647710084915161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647709910292178e-05, + "grad_norm": 4.304378986358643, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8823237419128418, + "num_tokens": 846269030.0, + "step": 22178 + }, + { + "epoch": 2.8213967688589237, + "ewc_loss": 0.008647751063108444, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647751383250579e-05, + "grad_norm": 4.269924163818359, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8820246458053589, + "num_tokens": 846307269.0, + "step": 22179 + }, + { + "epoch": 2.821523979137514, + "ewc_loss": 0.008612423203885555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612422971054912e-05, + "grad_norm": 4.356372356414795, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8741192817687988, + "num_tokens": 846339116.0, + "step": 22180 + }, + { + "epoch": 2.8216511894161047, + "ewc_loss": 0.008673313073813915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.673313277540728e-05, + "grad_norm": 4.232076644897461, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8761937618255615, + "num_tokens": 846384968.0, + "step": 22181 + }, + { + "epoch": 2.8217783996946952, + "ewc_loss": 0.008566571399569511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.56657134136185e-05, + "grad_norm": 4.2913079261779785, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8688243627548218, + "num_tokens": 846426235.0, + "step": 22182 + }, + { + "epoch": 2.8219056099732858, + "ewc_loss": 0.008653365075588226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.653365512145683e-05, + "grad_norm": 4.3109130859375, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8722118735313416, + "num_tokens": 846465715.0, + "step": 22183 + }, + { + "epoch": 2.8220328202518763, + "ewc_loss": 0.008619151078164577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619151049060747e-05, + "grad_norm": 4.329959392547607, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8714331388473511, + "num_tokens": 846499972.0, + "step": 22184 + }, + { + "epoch": 2.822160030530467, + "ewc_loss": 0.008641018532216549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.641018212074414e-05, + "grad_norm": 4.286269187927246, + "learning_rate": 
1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8827446699142456, + "num_tokens": 846535906.0, + "step": 22185 + }, + { + "epoch": 2.8222872408090574, + "ewc_loss": 0.008601031266152859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601031004218385e-05, + "grad_norm": 4.290309906005859, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8782428503036499, + "num_tokens": 846572723.0, + "step": 22186 + }, + { + "epoch": 2.822414451087648, + "ewc_loss": 0.008647617883980274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647617505630478e-05, + "grad_norm": 4.292891025543213, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8878828287124634, + "num_tokens": 846613501.0, + "step": 22187 + }, + { + "epoch": 2.8225416613662384, + "ewc_loss": 0.008633690886199474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63369059516117e-05, + "grad_norm": 4.24478816986084, + "learning_rate": 1e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.9030611515045166, + "num_tokens": 846650001.0, + "step": 22188 + }, + { + "epoch": 2.822668871644829, + "ewc_loss": 0.008617622777819633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.617622370366007e-05, + "grad_norm": 4.3088507652282715, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8730932474136353, + "num_tokens": 846690636.0, + "step": 22189 + }, + { + "epoch": 2.8227960819234195, + "ewc_loss": 0.00864396058022976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64396060933359e-05, + "grad_norm": 4.343018054962158, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8650853633880615, + "num_tokens": 846728513.0, + "step": 22190 + }, + { + "epoch": 2.82292329220201, + "ewc_loss": 0.008651592768728733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.651592361275107e-05, + "grad_norm": 4.266894340515137, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8699135780334473, + "num_tokens": 846771989.0, + "step": 22191 + }, + { 
+ "epoch": 2.8230505024806005, + "ewc_loss": 0.008592280559241772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592280937591568e-05, + "grad_norm": 4.250090599060059, + "learning_rate": 1e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.8978303670883179, + "num_tokens": 846808393.0, + "step": 22192 + }, + { + "epoch": 2.823177712759191, + "ewc_loss": 0.008616290055215359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61629014252685e-05, + "grad_norm": 4.30040168762207, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8748236894607544, + "num_tokens": 846846928.0, + "step": 22193 + }, + { + "epoch": 2.8233049230377816, + "ewc_loss": 0.008655549958348274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.655549754621461e-05, + "grad_norm": 4.3067307472229, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8749573230743408, + "num_tokens": 846884637.0, + "step": 22194 + }, + { + "epoch": 2.823432133316372, + "ewc_loss": 0.00863662175834179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636622078483924e-05, + "grad_norm": 4.25652551651001, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.886432409286499, + "num_tokens": 846925749.0, + "step": 22195 + }, + { + "epoch": 2.8235593435949626, + "ewc_loss": 0.008608927950263023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.608927601017058e-05, + "grad_norm": 4.282694339752197, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8702122569084167, + "num_tokens": 846967191.0, + "step": 22196 + }, + { + "epoch": 2.8236865538735527, + "ewc_loss": 0.008642070926725864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.642071043141186e-05, + "grad_norm": 4.266895771026611, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8878106474876404, + "num_tokens": 847003280.0, + "step": 22197 + }, + { + "epoch": 2.8238137641521437, + "ewc_loss": 0.008622035384178162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.62203523865901e-05, + "grad_norm": 4.2971014976501465, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8703951239585876, + "num_tokens": 847042179.0, + "step": 22198 + }, + { + "epoch": 2.8239409744307338, + "ewc_loss": 0.008651192300021648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.651192183606327e-05, + "grad_norm": 4.263988018035889, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8784875869750977, + "num_tokens": 847083008.0, + "step": 22199 + }, + { + "epoch": 2.8240681847093247, + "ewc_loss": 0.008612832985818386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612833335064352e-05, + "grad_norm": 4.2872490882873535, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8634581565856934, + "num_tokens": 847124009.0, + "step": 22200 + }, + { + "epoch": 2.824195394987915, + "ewc_loss": 0.008645545691251755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64554604049772e-05, + "grad_norm": 4.313544750213623, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8763322830200195, + "num_tokens": 847163526.0, + "step": 22201 + }, + { + "epoch": 2.824322605266506, + "ewc_loss": 0.008641142398118973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.641142630949616e-05, + "grad_norm": 4.278146266937256, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8815897107124329, + "num_tokens": 847201374.0, + "step": 22202 + }, + { + "epoch": 2.824449815545096, + "ewc_loss": 0.008623485453426838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623485337011516e-05, + "grad_norm": 4.323517799377441, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8716062307357788, + "num_tokens": 847239469.0, + "step": 22203 + }, + { + "epoch": 2.8245770258236864, + "ewc_loss": 0.008662289939820766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.66229020175524e-05, + "grad_norm": 4.316020965576172, + "learning_rate": 1e-06, + "loss": 0.3518, + 
"mean_token_accuracy": 0.879601001739502, + "num_tokens": 847275964.0, + "step": 22204 + }, + { + "epoch": 2.824704236102277, + "ewc_loss": 0.008624284528195858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624284964753315e-05, + "grad_norm": 4.255507469177246, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8863584399223328, + "num_tokens": 847316815.0, + "step": 22205 + }, + { + "epoch": 2.8248314463808675, + "ewc_loss": 0.008606960996985435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.606960909673944e-05, + "grad_norm": 4.317696571350098, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.888916015625, + "num_tokens": 847350949.0, + "step": 22206 + }, + { + "epoch": 2.824958656659458, + "ewc_loss": 0.008656629361212254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.656629506731406e-05, + "grad_norm": 4.269830226898193, + "learning_rate": 1e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8877310752868652, + "num_tokens": 847388945.0, + "step": 22207 + }, + { + "epoch": 2.8250858669380485, + "ewc_loss": 0.008614079095423222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6140789790079e-05, + "grad_norm": 4.187532901763916, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.893368124961853, + "num_tokens": 847435114.0, + "step": 22208 + }, + { + "epoch": 2.825213077216639, + "ewc_loss": 0.008582400158047676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582400187151507e-05, + "grad_norm": 4.422039985656738, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8761872053146362, + "num_tokens": 847470815.0, + "step": 22209 + }, + { + "epoch": 2.8253402874952296, + "ewc_loss": 0.008744673803448677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.744673687033355e-05, + "grad_norm": 4.307487487792969, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8815081715583801, + "num_tokens": 847509379.0, + "step": 22210 + }, + { + "epoch": 2.82546749777382, + 
"ewc_loss": 0.008577018976211548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577018888900056e-05, + "grad_norm": 4.293172836303711, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.88752281665802, + "num_tokens": 847547427.0, + "step": 22211 + }, + { + "epoch": 2.8255947080524106, + "ewc_loss": 0.008619778789579868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619778964202851e-05, + "grad_norm": 4.341026306152344, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8680269122123718, + "num_tokens": 847584740.0, + "step": 22212 + }, + { + "epoch": 2.825721918331001, + "ewc_loss": 0.008647087961435318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647087815916166e-05, + "grad_norm": 4.266564846038818, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8736367225646973, + "num_tokens": 847626201.0, + "step": 22213 + }, + { + "epoch": 2.8258491286095917, + "ewc_loss": 0.008584307506680489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.584307215642184e-05, + "grad_norm": 4.317454814910889, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.8972204327583313, + "num_tokens": 847661311.0, + "step": 22214 + }, + { + "epoch": 2.8259763388881822, + "ewc_loss": 0.008633329533040524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.633329707663506e-05, + "grad_norm": 4.283546447753906, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8811115026473999, + "num_tokens": 847695234.0, + "step": 22215 + }, + { + "epoch": 2.8261035491667728, + "ewc_loss": 0.008607097901403904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607097697677091e-05, + "grad_norm": 4.233883857727051, + "learning_rate": 1e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.8992416262626648, + "num_tokens": 847738848.0, + "step": 22216 + }, + { + "epoch": 2.8262307594453633, + "ewc_loss": 0.008565893396735191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565893222112209e-05, + "grad_norm": 
4.297215938568115, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8852471113204956, + "num_tokens": 847775952.0, + "step": 22217 + }, + { + "epoch": 2.826357969723954, + "ewc_loss": 0.00863638985902071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636389975436032e-05, + "grad_norm": 4.307031154632568, + "learning_rate": 1e-06, + "loss": 0.2957, + "mean_token_accuracy": 0.8976312875747681, + "num_tokens": 847809381.0, + "step": 22218 + }, + { + "epoch": 2.8264851800025443, + "ewc_loss": 0.008585765026509762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585765317548066e-05, + "grad_norm": 4.281598091125488, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8784923553466797, + "num_tokens": 847849835.0, + "step": 22219 + }, + { + "epoch": 2.826612390281135, + "ewc_loss": 0.008576616644859314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576616528443992e-05, + "grad_norm": 4.321050643920898, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8808057308197021, + "num_tokens": 847882726.0, + "step": 22220 + }, + { + "epoch": 2.8267396005597254, + "ewc_loss": 0.00860440731048584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604407776147127e-05, + "grad_norm": 4.29309606552124, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8812441825866699, + "num_tokens": 847920663.0, + "step": 22221 + }, + { + "epoch": 2.8268668108383155, + "ewc_loss": 0.008574826642870903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574826642870903e-05, + "grad_norm": 4.252315521240234, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8894045352935791, + "num_tokens": 847962204.0, + "step": 22222 + }, + { + "epoch": 2.8269940211169065, + "ewc_loss": 0.00855657272040844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55657272040844e-05, + "grad_norm": 4.283105850219727, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8779320120811462, + "num_tokens": 
848003722.0, + "step": 22223 + }, + { + "epoch": 2.8271212313954965, + "ewc_loss": 0.008583110757172108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583111048210412e-05, + "grad_norm": 4.253364562988281, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8907573223114014, + "num_tokens": 848045335.0, + "step": 22224 + }, + { + "epoch": 2.8272484416740875, + "ewc_loss": 0.008542880415916443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542880095774308e-05, + "grad_norm": 4.262312889099121, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8752567768096924, + "num_tokens": 848088817.0, + "step": 22225 + }, + { + "epoch": 2.8273756519526776, + "ewc_loss": 0.008559202775359154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559202979085967e-05, + "grad_norm": 4.386919021606445, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8744461536407471, + "num_tokens": 848120993.0, + "step": 22226 + }, + { + "epoch": 2.827502862231268, + "ewc_loss": 0.00861039012670517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610390068497509e-05, + "grad_norm": 4.28855562210083, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.87595534324646, + "num_tokens": 848162542.0, + "step": 22227 + }, + { + "epoch": 2.8276300725098586, + "ewc_loss": 0.008506717160344124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.506717131240293e-05, + "grad_norm": 4.295230388641357, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.897240161895752, + "num_tokens": 848198194.0, + "step": 22228 + }, + { + "epoch": 2.827757282788449, + "ewc_loss": 0.008566764183342457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566764154238626e-05, + "grad_norm": 4.314104080200195, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8766944408416748, + "num_tokens": 848240575.0, + "step": 22229 + }, + { + "epoch": 2.8278844930670397, + "ewc_loss": 0.008552292361855507, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552292274544016e-05, + "grad_norm": 4.288735389709473, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8830853700637817, + "num_tokens": 848289026.0, + "step": 22230 + }, + { + "epoch": 2.8280117033456302, + "ewc_loss": 0.008534342981874943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.534343214705586e-05, + "grad_norm": 4.338785171508789, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8860430717468262, + "num_tokens": 848326438.0, + "step": 22231 + }, + { + "epoch": 2.8281389136242208, + "ewc_loss": 0.008580050431191921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580050052842125e-05, + "grad_norm": 4.280692100524902, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.88880455493927, + "num_tokens": 848368106.0, + "step": 22232 + }, + { + "epoch": 2.8282661239028113, + "ewc_loss": 0.008520897477865219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.520897245034575e-05, + "grad_norm": 4.294641971588135, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8798209428787231, + "num_tokens": 848409775.0, + "step": 22233 + }, + { + "epoch": 2.828393334181402, + "ewc_loss": 0.008533227257430553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533227082807571e-05, + "grad_norm": 4.385497570037842, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8834307789802551, + "num_tokens": 848442149.0, + "step": 22234 + }, + { + "epoch": 2.8285205444599923, + "ewc_loss": 0.008610040880739689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610040822532028e-05, + "grad_norm": 4.363988399505615, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8764429092407227, + "num_tokens": 848475468.0, + "step": 22235 + }, + { + "epoch": 2.828647754738583, + "ewc_loss": 0.008543815463781357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54381505632773e-05, + "grad_norm": 4.288851261138916, + "learning_rate": 
1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8831942081451416, + "num_tokens": 848509565.0, + "step": 22236 + }, + { + "epoch": 2.8287749650171734, + "ewc_loss": 0.008556573651731014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.556573448004201e-05, + "grad_norm": 4.284764766693115, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8835542798042297, + "num_tokens": 848546357.0, + "step": 22237 + }, + { + "epoch": 2.828902175295764, + "ewc_loss": 0.008592750877141953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592750964453444e-05, + "grad_norm": 4.273185729980469, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8708237409591675, + "num_tokens": 848587661.0, + "step": 22238 + }, + { + "epoch": 2.8290293855743545, + "ewc_loss": 0.008584897965192795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.584898023400456e-05, + "grad_norm": 4.222433567047119, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8859587907791138, + "num_tokens": 848628250.0, + "step": 22239 + }, + { + "epoch": 2.829156595852945, + "ewc_loss": 0.008565847761929035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565847383579239e-05, + "grad_norm": 4.265912055969238, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.883039116859436, + "num_tokens": 848670452.0, + "step": 22240 + }, + { + "epoch": 2.8292838061315355, + "ewc_loss": 0.008612832985818386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61283260746859e-05, + "grad_norm": 4.3532257080078125, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8839125633239746, + "num_tokens": 848703592.0, + "step": 22241 + }, + { + "epoch": 2.829411016410126, + "ewc_loss": 0.008645323105156422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.645323396194726e-05, + "grad_norm": 4.3514580726623535, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.869254469871521, + "num_tokens": 848739291.0, + "step": 22242 + }, + 
{ + "epoch": 2.8295382266887166, + "ewc_loss": 0.008609741926193237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609741780674085e-05, + "grad_norm": 4.313021183013916, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8784005641937256, + "num_tokens": 848776039.0, + "step": 22243 + }, + { + "epoch": 2.829665436967307, + "ewc_loss": 0.008578524924814701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578525012126192e-05, + "grad_norm": 4.265663146972656, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8841742277145386, + "num_tokens": 848814799.0, + "step": 22244 + }, + { + "epoch": 2.8297926472458976, + "ewc_loss": 0.008576498366892338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576497930334881e-05, + "grad_norm": 4.201690673828125, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8829299807548523, + "num_tokens": 848860123.0, + "step": 22245 + }, + { + "epoch": 2.829919857524488, + "ewc_loss": 0.008570133708417416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570133650209755e-05, + "grad_norm": 4.257867813110352, + "learning_rate": 1e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.8980470299720764, + "num_tokens": 848900447.0, + "step": 22246 + }, + { + "epoch": 2.8300470678030782, + "ewc_loss": 0.00860372930765152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603728929301724e-05, + "grad_norm": 4.2620038986206055, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8764939904212952, + "num_tokens": 848945385.0, + "step": 22247 + }, + { + "epoch": 2.830174278081669, + "ewc_loss": 0.008579356595873833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579356654081494e-05, + "grad_norm": 4.3752617835998535, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8725993633270264, + "num_tokens": 848978783.0, + "step": 22248 + }, + { + "epoch": 2.8303014883602593, + "ewc_loss": 0.008635290898382664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.63529130583629e-05, + "grad_norm": 4.31437349319458, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8722336292266846, + "num_tokens": 849017324.0, + "step": 22249 + }, + { + "epoch": 2.8304286986388503, + "ewc_loss": 0.008558576926589012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558577246731147e-05, + "grad_norm": 4.373157978057861, + "learning_rate": 1e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.9014896750450134, + "num_tokens": 849045928.0, + "step": 22250 + }, + { + "epoch": 2.8305559089174404, + "ewc_loss": 0.0086056524887681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.605652692494914e-05, + "grad_norm": 4.2984747886657715, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8819053769111633, + "num_tokens": 849080855.0, + "step": 22251 + }, + { + "epoch": 2.830683119196031, + "ewc_loss": 0.008551163598895073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551163773518056e-05, + "grad_norm": 4.353553295135498, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8855528831481934, + "num_tokens": 849114835.0, + "step": 22252 + }, + { + "epoch": 2.8308103294746214, + "ewc_loss": 0.008630266413092613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.630266529507935e-05, + "grad_norm": 4.3029608726501465, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8722176551818848, + "num_tokens": 849152632.0, + "step": 22253 + }, + { + "epoch": 2.830937539753212, + "ewc_loss": 0.008574641309678555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574641105951741e-05, + "grad_norm": 4.3109822273254395, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8899754285812378, + "num_tokens": 849189911.0, + "step": 22254 + }, + { + "epoch": 2.8310647500318025, + "ewc_loss": 0.008610542863607407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610542863607407e-05, + "grad_norm": 4.274764060974121, + "learning_rate": 1e-06, + "loss": 0.342, + 
"mean_token_accuracy": 0.8827259540557861, + "num_tokens": 849227847.0, + "step": 22255 + }, + { + "epoch": 2.831191960310393, + "ewc_loss": 0.008588766679167747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.588766650063917e-05, + "grad_norm": 4.251852512359619, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8870316743850708, + "num_tokens": 849265778.0, + "step": 22256 + }, + { + "epoch": 2.8313191705889835, + "ewc_loss": 0.008606174029409885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60617365106009e-05, + "grad_norm": 4.312533855438232, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8829472064971924, + "num_tokens": 849302488.0, + "step": 22257 + }, + { + "epoch": 2.831446380867574, + "ewc_loss": 0.008644874207675457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.644874469609931e-05, + "grad_norm": 4.272908687591553, + "learning_rate": 1e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8966858386993408, + "num_tokens": 849337483.0, + "step": 22258 + }, + { + "epoch": 2.8315735911461646, + "ewc_loss": 0.008621392771601677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.621392771601677e-05, + "grad_norm": 4.271204948425293, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8896644115447998, + "num_tokens": 849378491.0, + "step": 22259 + }, + { + "epoch": 2.831700801424755, + "ewc_loss": 0.008622943423688412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62294327816926e-05, + "grad_norm": 4.24723482131958, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8899142742156982, + "num_tokens": 849418897.0, + "step": 22260 + }, + { + "epoch": 2.8318280117033456, + "ewc_loss": 0.0086137056350708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613705722382292e-05, + "grad_norm": 4.296852111816406, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8880626559257507, + "num_tokens": 849457057.0, + "step": 22261 + }, + { + "epoch": 2.831955221981936, 
+ "ewc_loss": 0.008622804656624794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622804307378829e-05, + "grad_norm": 4.337689399719238, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8877135515213013, + "num_tokens": 849488825.0, + "step": 22262 + }, + { + "epoch": 2.8320824322605267, + "ewc_loss": 0.008634124882519245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.634124969830737e-05, + "grad_norm": 4.262300968170166, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8879172801971436, + "num_tokens": 849528336.0, + "step": 22263 + }, + { + "epoch": 2.8322096425391172, + "ewc_loss": 0.008572985418140888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572985825594515e-05, + "grad_norm": 4.252921104431152, + "learning_rate": 1e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8962072134017944, + "num_tokens": 849565373.0, + "step": 22264 + }, + { + "epoch": 2.8323368528177078, + "ewc_loss": 0.008586290292441845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58629064168781e-05, + "grad_norm": 4.275832653045654, + "learning_rate": 1e-06, + "loss": 0.296, + "mean_token_accuracy": 0.893587589263916, + "num_tokens": 849600909.0, + "step": 22265 + }, + { + "epoch": 2.8324640630962983, + "ewc_loss": 0.008629236370325089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.629236253909767e-05, + "grad_norm": 4.296128749847412, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8744841814041138, + "num_tokens": 849639499.0, + "step": 22266 + }, + { + "epoch": 2.832591273374889, + "ewc_loss": 0.008612912148237228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612911915406585e-05, + "grad_norm": 4.265382766723633, + "learning_rate": 1e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8932301998138428, + "num_tokens": 849677232.0, + "step": 22267 + }, + { + "epoch": 2.8327184836534793, + "ewc_loss": 0.008592372760176659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592372614657506e-05, + "grad_norm": 
4.2569966316223145, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8882999420166016, + "num_tokens": 849717187.0, + "step": 22268 + }, + { + "epoch": 2.83284569393207, + "ewc_loss": 0.008604347705841064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604348113294691e-05, + "grad_norm": 4.27323579788208, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8925665616989136, + "num_tokens": 849756545.0, + "step": 22269 + }, + { + "epoch": 2.83297290421066, + "ewc_loss": 0.008598578162491322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598578278906643e-05, + "grad_norm": 4.3770060539245605, + "learning_rate": 1e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8877922296524048, + "num_tokens": 849788670.0, + "step": 22270 + }, + { + "epoch": 2.833100114489251, + "ewc_loss": 0.008657765574753284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.65776528371498e-05, + "grad_norm": 4.298127174377441, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8842661380767822, + "num_tokens": 849825715.0, + "step": 22271 + }, + { + "epoch": 2.833227324767841, + "ewc_loss": 0.00856025144457817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560251444578171e-05, + "grad_norm": 4.353246688842773, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8903928995132446, + "num_tokens": 849856435.0, + "step": 22272 + }, + { + "epoch": 2.833354535046432, + "ewc_loss": 0.008643655106425285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.643655019113794e-05, + "grad_norm": 4.2939066886901855, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8974897861480713, + "num_tokens": 849895634.0, + "step": 22273 + }, + { + "epoch": 2.833481745325022, + "ewc_loss": 0.008589953184127808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58995335875079e-05, + "grad_norm": 4.281283378601074, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8638414144515991, + "num_tokens": 
849938646.0, + "step": 22274 + }, + { + "epoch": 2.833608955603613, + "ewc_loss": 0.008617321029305458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61732114572078e-05, + "grad_norm": 4.290863037109375, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8797868490219116, + "num_tokens": 849978642.0, + "step": 22275 + }, + { + "epoch": 2.833736165882203, + "ewc_loss": 0.00861753337085247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.617533603683114e-05, + "grad_norm": 4.277904987335205, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8837152719497681, + "num_tokens": 850017820.0, + "step": 22276 + }, + { + "epoch": 2.8338633761607936, + "ewc_loss": 0.00860095489770174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600955334259197e-05, + "grad_norm": 4.347106456756592, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8645583391189575, + "num_tokens": 850056619.0, + "step": 22277 + }, + { + "epoch": 2.833990586439384, + "ewc_loss": 0.008632016368210316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.632016397314146e-05, + "grad_norm": 4.2896809577941895, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8822382688522339, + "num_tokens": 850092456.0, + "step": 22278 + }, + { + "epoch": 2.8341177967179747, + "ewc_loss": 0.008597408421337605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597408304922283e-05, + "grad_norm": 4.286413192749023, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8794413208961487, + "num_tokens": 850131598.0, + "step": 22279 + }, + { + "epoch": 2.8342450069965652, + "ewc_loss": 0.008597001433372498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59700157889165e-05, + "grad_norm": 4.289491176605225, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.874590277671814, + "num_tokens": 850170884.0, + "step": 22280 + }, + { + "epoch": 2.8343722172751558, + "ewc_loss": 0.008623252622783184, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 8.623252506367862e-05, + "grad_norm": 4.293509006500244, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.882745087146759, + "num_tokens": 850206989.0, + "step": 22281 + }, + { + "epoch": 2.8344994275537463, + "ewc_loss": 0.008616914041340351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616913692094386e-05, + "grad_norm": 4.238236427307129, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8801013231277466, + "num_tokens": 850248602.0, + "step": 22282 + }, + { + "epoch": 2.834626637832337, + "ewc_loss": 0.008583351038396358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58335115481168e-05, + "grad_norm": 4.3330888748168945, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8761371374130249, + "num_tokens": 850289790.0, + "step": 22283 + }, + { + "epoch": 2.8347538481109273, + "ewc_loss": 0.00865238532423973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.652385440655053e-05, + "grad_norm": 4.253047943115234, + "learning_rate": 1e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8909372091293335, + "num_tokens": 850326688.0, + "step": 22284 + }, + { + "epoch": 2.834881058389518, + "ewc_loss": 0.008579671382904053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579671703046188e-05, + "grad_norm": 4.2593793869018555, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8931592702865601, + "num_tokens": 850366460.0, + "step": 22285 + }, + { + "epoch": 2.8350082686681084, + "ewc_loss": 0.008600026369094849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600026194471866e-05, + "grad_norm": 4.297914028167725, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8888006210327148, + "num_tokens": 850400046.0, + "step": 22286 + }, + { + "epoch": 2.835135478946699, + "ewc_loss": 0.008624959737062454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624960173619911e-05, + "grad_norm": 4.359441757202148, + "learning_rate": 1e-06, + 
"loss": 0.3736, + "mean_token_accuracy": 0.875645637512207, + "num_tokens": 850434374.0, + "step": 22287 + }, + { + "epoch": 2.8352626892252895, + "ewc_loss": 0.008647685870528221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647685899632052e-05, + "grad_norm": 4.307921409606934, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.884059727191925, + "num_tokens": 850469056.0, + "step": 22288 + }, + { + "epoch": 2.83538989950388, + "ewc_loss": 0.00857451930642128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574518869863823e-05, + "grad_norm": 4.3025102615356445, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8809645175933838, + "num_tokens": 850505805.0, + "step": 22289 + }, + { + "epoch": 2.8355171097824705, + "ewc_loss": 0.008612430654466152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612430247012526e-05, + "grad_norm": 4.27512788772583, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.873001754283905, + "num_tokens": 850550056.0, + "step": 22290 + }, + { + "epoch": 2.835644320061061, + "ewc_loss": 0.008590570650994778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590570359956473e-05, + "grad_norm": 4.306253910064697, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8896411657333374, + "num_tokens": 850585075.0, + "step": 22291 + }, + { + "epoch": 2.8357715303396516, + "ewc_loss": 0.008614452555775642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614452235633507e-05, + "grad_norm": 4.276315212249756, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8922311067581177, + "num_tokens": 850625758.0, + "step": 22292 + }, + { + "epoch": 2.835898740618242, + "ewc_loss": 0.008580110967159271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580111170886084e-05, + "grad_norm": 4.249390602111816, + "learning_rate": 1e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.8952302932739258, + "num_tokens": 850664608.0, + "step": 22293 + }, + { + 
"epoch": 2.8360259508968326, + "ewc_loss": 0.008576408959925175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576409163651988e-05, + "grad_norm": 4.261517524719238, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8895605802536011, + "num_tokens": 850705897.0, + "step": 22294 + }, + { + "epoch": 2.8361531611754227, + "ewc_loss": 0.008586948737502098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586949115851894e-05, + "grad_norm": 4.2918291091918945, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8780404329299927, + "num_tokens": 850746195.0, + "step": 22295 + }, + { + "epoch": 2.8362803714540137, + "ewc_loss": 0.008592051453888416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59205101733096e-05, + "grad_norm": 4.294937610626221, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8912662267684937, + "num_tokens": 850785183.0, + "step": 22296 + }, + { + "epoch": 2.8364075817326038, + "ewc_loss": 0.00857933983206749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579339919378981e-05, + "grad_norm": 4.287735939025879, + "learning_rate": 1e-06, + "loss": 0.3166, + "mean_token_accuracy": 0.8895914554595947, + "num_tokens": 850819558.0, + "step": 22297 + }, + { + "epoch": 2.8365347920111947, + "ewc_loss": 0.008573297411203384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573297236580402e-05, + "grad_norm": 4.320887565612793, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8736693859100342, + "num_tokens": 850857385.0, + "step": 22298 + }, + { + "epoch": 2.836662002289785, + "ewc_loss": 0.008594026789069176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59402643982321e-05, + "grad_norm": 4.259261608123779, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8786487579345703, + "num_tokens": 850895297.0, + "step": 22299 + }, + { + "epoch": 2.836789212568376, + "ewc_loss": 0.008562169969081879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.562170114601031e-05, + "grad_norm": 4.346552848815918, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8832035660743713, + "num_tokens": 850930982.0, + "step": 22300 + }, + { + "epoch": 2.836916422846966, + "ewc_loss": 0.008612962439656258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612962847109884e-05, + "grad_norm": 4.272139549255371, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8781253695487976, + "num_tokens": 850969032.0, + "step": 22301 + }, + { + "epoch": 2.8370436331255564, + "ewc_loss": 0.008564743213355541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.564742893213406e-05, + "grad_norm": 4.284445762634277, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8856406211853027, + "num_tokens": 851008325.0, + "step": 22302 + }, + { + "epoch": 2.837170843404147, + "ewc_loss": 0.008615207858383656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61520748003386e-05, + "grad_norm": 4.30318546295166, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.883533239364624, + "num_tokens": 851044289.0, + "step": 22303 + }, + { + "epoch": 2.8372980536827375, + "ewc_loss": 0.008600926958024502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600926958024502e-05, + "grad_norm": 4.2939372062683105, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8907436728477478, + "num_tokens": 851079887.0, + "step": 22304 + }, + { + "epoch": 2.837425263961328, + "ewc_loss": 0.008598500862717628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598500426160172e-05, + "grad_norm": 4.32683801651001, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8864262700080872, + "num_tokens": 851124535.0, + "step": 22305 + }, + { + "epoch": 2.8375524742399185, + "ewc_loss": 0.00861336663365364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613366662757471e-05, + "grad_norm": 4.250578880310059, + "learning_rate": 1e-06, + "loss": 0.3052, + 
"mean_token_accuracy": 0.8915799856185913, + "num_tokens": 851162982.0, + "step": 22306 + }, + { + "epoch": 2.837679684518509, + "ewc_loss": 0.008553761057555676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553761290386319e-05, + "grad_norm": 4.249416828155518, + "learning_rate": 1e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.890476405620575, + "num_tokens": 851201864.0, + "step": 22307 + }, + { + "epoch": 2.8378068947970996, + "ewc_loss": 0.008584505878388882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.584505849285051e-05, + "grad_norm": 4.282341003417969, + "learning_rate": 1e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8880676031112671, + "num_tokens": 851241659.0, + "step": 22308 + }, + { + "epoch": 2.83793410507569, + "ewc_loss": 0.008600469678640366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60046930029057e-05, + "grad_norm": 4.325986862182617, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8757204413414001, + "num_tokens": 851278861.0, + "step": 22309 + }, + { + "epoch": 2.8380613153542806, + "ewc_loss": 0.008591831661760807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.591832011006773e-05, + "grad_norm": 4.338955879211426, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8854808807373047, + "num_tokens": 851314442.0, + "step": 22310 + }, + { + "epoch": 2.838188525632871, + "ewc_loss": 0.008585194125771523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585194154875353e-05, + "grad_norm": 4.262485027313232, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8795490264892578, + "num_tokens": 851356322.0, + "step": 22311 + }, + { + "epoch": 2.8383157359114617, + "ewc_loss": 0.00854065828025341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540658745914698e-05, + "grad_norm": 4.30888557434082, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8723332285881042, + "num_tokens": 851395374.0, + "step": 22312 + }, + { + "epoch": 
2.838442946190052, + "ewc_loss": 0.008586376905441284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586377225583419e-05, + "grad_norm": 4.28700590133667, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8814117908477783, + "num_tokens": 851430957.0, + "step": 22313 + }, + { + "epoch": 2.8385701564686427, + "ewc_loss": 0.00855260994285345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552610233891755e-05, + "grad_norm": 4.232210159301758, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8778388500213623, + "num_tokens": 851474357.0, + "step": 22314 + }, + { + "epoch": 2.8386973667472333, + "ewc_loss": 0.008545976132154465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545976015739143e-05, + "grad_norm": 4.252924919128418, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8954461812973022, + "num_tokens": 851510346.0, + "step": 22315 + }, + { + "epoch": 2.838824577025824, + "ewc_loss": 0.008556708693504333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.556708780815825e-05, + "grad_norm": 4.309330940246582, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8852155208587646, + "num_tokens": 851546798.0, + "step": 22316 + }, + { + "epoch": 2.8389517873044143, + "ewc_loss": 0.008583721704781055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583721501054242e-05, + "grad_norm": 4.276785373687744, + "learning_rate": 1e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8878834247589111, + "num_tokens": 851587819.0, + "step": 22317 + }, + { + "epoch": 2.839078997583005, + "ewc_loss": 0.008545991033315659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545990567654371e-05, + "grad_norm": 4.27268648147583, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8884893655776978, + "num_tokens": 851628764.0, + "step": 22318 + }, + { + "epoch": 2.8392062078615954, + "ewc_loss": 0.008571313694119453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.571313810534775e-05, + "grad_norm": 4.293670654296875, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.871444821357727, + "num_tokens": 851672206.0, + "step": 22319 + }, + { + "epoch": 2.8393334181401855, + "ewc_loss": 0.008546674624085426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.546674507670105e-05, + "grad_norm": 4.284265518188477, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8777276277542114, + "num_tokens": 851709969.0, + "step": 22320 + }, + { + "epoch": 2.8394606284187764, + "ewc_loss": 0.008534645661711693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.534645894542336e-05, + "grad_norm": 4.296482563018799, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8768283724784851, + "num_tokens": 851749015.0, + "step": 22321 + }, + { + "epoch": 2.8395878386973665, + "ewc_loss": 0.008555984124541283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555984095437452e-05, + "grad_norm": 4.278724670410156, + "learning_rate": 1e-06, + "loss": 0.3219, + "mean_token_accuracy": 0.8860089778900146, + "num_tokens": 851787210.0, + "step": 22322 + }, + { + "epoch": 2.8397150489759575, + "ewc_loss": 0.008550005033612251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.550004713470116e-05, + "grad_norm": 4.306966781616211, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8780167102813721, + "num_tokens": 851824677.0, + "step": 22323 + }, + { + "epoch": 2.8398422592545476, + "ewc_loss": 0.008570187725126743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5701874922961e-05, + "grad_norm": 4.271642684936523, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8898302316665649, + "num_tokens": 851863614.0, + "step": 22324 + }, + { + "epoch": 2.839969469533138, + "ewc_loss": 0.008532443083822727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.532443462172523e-05, + "grad_norm": 4.252117156982422, + "learning_rate": 1e-06, + "loss": 0.3511, + 
"mean_token_accuracy": 0.8765510320663452, + "num_tokens": 851908153.0, + "step": 22325 + }, + { + "epoch": 2.8400966798117286, + "ewc_loss": 0.008539644069969654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539643749827519e-05, + "grad_norm": 4.29366397857666, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8789656162261963, + "num_tokens": 851946473.0, + "step": 22326 + }, + { + "epoch": 2.840223890090319, + "ewc_loss": 0.008577974513173103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577974222134799e-05, + "grad_norm": 4.245718479156494, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8739842772483826, + "num_tokens": 851991600.0, + "step": 22327 + }, + { + "epoch": 2.8403511003689097, + "ewc_loss": 0.008508136495947838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.50813667057082e-05, + "grad_norm": 4.283753871917725, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8825222849845886, + "num_tokens": 852029468.0, + "step": 22328 + }, + { + "epoch": 2.8404783106475002, + "ewc_loss": 0.008562102913856506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562102448195219e-05, + "grad_norm": 4.3248291015625, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8885728120803833, + "num_tokens": 852061400.0, + "step": 22329 + }, + { + "epoch": 2.8406055209260908, + "ewc_loss": 0.008553214371204376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553214138373733e-05, + "grad_norm": 4.273594379425049, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8820177912712097, + "num_tokens": 852100596.0, + "step": 22330 + }, + { + "epoch": 2.8407327312046813, + "ewc_loss": 0.008526010438799858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.526010788045824e-05, + "grad_norm": 4.257086753845215, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8854284286499023, + "num_tokens": 852140320.0, + "step": 22331 + }, + { + "epoch": 
2.840859941483272, + "ewc_loss": 0.008527395315468311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.527395402779803e-05, + "grad_norm": 4.319810390472412, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8828481435775757, + "num_tokens": 852175969.0, + "step": 22332 + }, + { + "epoch": 2.8409871517618623, + "ewc_loss": 0.008562733419239521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562733273720369e-05, + "grad_norm": 4.306338787078857, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8704577684402466, + "num_tokens": 852210816.0, + "step": 22333 + }, + { + "epoch": 2.841114362040453, + "ewc_loss": 0.00853690691292286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.536906534573063e-05, + "grad_norm": 4.310642242431641, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8838906288146973, + "num_tokens": 852246684.0, + "step": 22334 + }, + { + "epoch": 2.8412415723190434, + "ewc_loss": 0.008559278212487698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.559278649045154e-05, + "grad_norm": 4.258080959320068, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8853574395179749, + "num_tokens": 852285283.0, + "step": 22335 + }, + { + "epoch": 2.841368782597634, + "ewc_loss": 0.008550849743187428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.550849452149123e-05, + "grad_norm": 4.296197414398193, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8903630971908569, + "num_tokens": 852323310.0, + "step": 22336 + }, + { + "epoch": 2.8414959928762245, + "ewc_loss": 0.008575749583542347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575749234296381e-05, + "grad_norm": 4.292356967926025, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8879234790802002, + "num_tokens": 852360850.0, + "step": 22337 + }, + { + "epoch": 2.841623203154815, + "ewc_loss": 0.008566111326217651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.566111500840634e-05, + "grad_norm": 4.328695297241211, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.877849280834198, + "num_tokens": 852399320.0, + "step": 22338 + }, + { + "epoch": 2.8417504134334055, + "ewc_loss": 0.008588122203946114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5881220002193e-05, + "grad_norm": 4.337017059326172, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8802425265312195, + "num_tokens": 852430980.0, + "step": 22339 + }, + { + "epoch": 2.841877623711996, + "ewc_loss": 0.008593698032200336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59369829413481e-05, + "grad_norm": 4.294785976409912, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8937174677848816, + "num_tokens": 852465688.0, + "step": 22340 + }, + { + "epoch": 2.8420048339905866, + "ewc_loss": 0.008587277494370937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.587277261540294e-05, + "grad_norm": 4.302241802215576, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8812188506126404, + "num_tokens": 852504372.0, + "step": 22341 + }, + { + "epoch": 2.842132044269177, + "ewc_loss": 0.008587743155658245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.587742922827601e-05, + "grad_norm": 4.243651390075684, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.887283444404602, + "num_tokens": 852542232.0, + "step": 22342 + }, + { + "epoch": 2.8422592545477676, + "ewc_loss": 0.00855253729969263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.552537474315614e-05, + "grad_norm": 4.323106288909912, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8744602203369141, + "num_tokens": 852580005.0, + "step": 22343 + }, + { + "epoch": 2.842386464826358, + "ewc_loss": 0.008606585673987865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.606585470261052e-05, + "grad_norm": 4.220218181610107, + "learning_rate": 1e-06, + "loss": 0.3017, + 
"mean_token_accuracy": 0.8936382532119751, + "num_tokens": 852619912.0, + "step": 22344 + }, + { + "epoch": 2.8425136751049482, + "ewc_loss": 0.008547923527657986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54792378959246e-05, + "grad_norm": 4.359646797180176, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8870887756347656, + "num_tokens": 852654349.0, + "step": 22345 + }, + { + "epoch": 2.842640885383539, + "ewc_loss": 0.008643305860459805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.643305773148313e-05, + "grad_norm": 4.296065330505371, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8765627145767212, + "num_tokens": 852691300.0, + "step": 22346 + }, + { + "epoch": 2.8427680956621293, + "ewc_loss": 0.0085903936997056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590393554186448e-05, + "grad_norm": 4.307559013366699, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8840105533599854, + "num_tokens": 852731228.0, + "step": 22347 + }, + { + "epoch": 2.8428953059407203, + "ewc_loss": 0.008586885407567024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586885087424889e-05, + "grad_norm": 4.284855842590332, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8808848857879639, + "num_tokens": 852768333.0, + "step": 22348 + }, + { + "epoch": 2.8430225162193103, + "ewc_loss": 0.008586080744862556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58608036651276e-05, + "grad_norm": 4.320529937744141, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8536706566810608, + "num_tokens": 852810962.0, + "step": 22349 + }, + { + "epoch": 2.843149726497901, + "ewc_loss": 0.008609120734035969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609120413893834e-05, + "grad_norm": 4.3037004470825195, + "learning_rate": 1e-06, + "loss": 0.2942, + "mean_token_accuracy": 0.8993645906448364, + "num_tokens": 852844326.0, + "step": 22350 + }, + { + "epoch": 
2.8432769367764914, + "ewc_loss": 0.00858518946915865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585189061705023e-05, + "grad_norm": 4.33980655670166, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8788102865219116, + "num_tokens": 852877737.0, + "step": 22351 + }, + { + "epoch": 2.843404147055082, + "ewc_loss": 0.008628475479781628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.628475188743323e-05, + "grad_norm": 4.211449146270752, + "learning_rate": 1e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8897277116775513, + "num_tokens": 852922085.0, + "step": 22352 + }, + { + "epoch": 2.8435313573336725, + "ewc_loss": 0.00855826586484909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55826583574526e-05, + "grad_norm": 4.264004230499268, + "learning_rate": 1e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.8988831043243408, + "num_tokens": 852960742.0, + "step": 22353 + }, + { + "epoch": 2.843658567612263, + "ewc_loss": 0.008618587628006935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618587162345648e-05, + "grad_norm": 4.2992682456970215, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8878602981567383, + "num_tokens": 852993502.0, + "step": 22354 + }, + { + "epoch": 2.8437857778908535, + "ewc_loss": 0.008614861406385899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614861144451424e-05, + "grad_norm": 4.275503635406494, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8867281675338745, + "num_tokens": 853036844.0, + "step": 22355 + }, + { + "epoch": 2.843912988169444, + "ewc_loss": 0.008578288368880749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578288543503731e-05, + "grad_norm": 4.319054126739502, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8832719922065735, + "num_tokens": 853070851.0, + "step": 22356 + }, + { + "epoch": 2.8440401984480346, + "ewc_loss": 0.008618396706879139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.618396532256156e-05, + "grad_norm": 4.301489353179932, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8825663328170776, + "num_tokens": 853106116.0, + "step": 22357 + }, + { + "epoch": 2.844167408726625, + "ewc_loss": 0.008576353080570698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576353138778359e-05, + "grad_norm": 4.194452285766602, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.881164014339447, + "num_tokens": 853152776.0, + "step": 22358 + }, + { + "epoch": 2.8442946190052156, + "ewc_loss": 0.008540348149836063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540348062524572e-05, + "grad_norm": 4.288132190704346, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8804029226303101, + "num_tokens": 853194176.0, + "step": 22359 + }, + { + "epoch": 2.844421829283806, + "ewc_loss": 0.008610524237155914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61052394611761e-05, + "grad_norm": 4.2646613121032715, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8807722330093384, + "num_tokens": 853236834.0, + "step": 22360 + }, + { + "epoch": 2.8445490395623967, + "ewc_loss": 0.008556141518056393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.556141256121919e-05, + "grad_norm": 4.3538289070129395, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8756726980209351, + "num_tokens": 853274145.0, + "step": 22361 + }, + { + "epoch": 2.844676249840987, + "ewc_loss": 0.008607068099081516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607067866250873e-05, + "grad_norm": 4.282269477844238, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8863213658332825, + "num_tokens": 853312569.0, + "step": 22362 + }, + { + "epoch": 2.8448034601195777, + "ewc_loss": 0.008523128926753998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.523128781234846e-05, + "grad_norm": 4.303796768188477, + "learning_rate": 1e-06, + "loss": 0.3155, + 
"mean_token_accuracy": 0.889276385307312, + "num_tokens": 853351742.0, + "step": 22363 + }, + { + "epoch": 2.8449306703981683, + "ewc_loss": 0.00857172254472971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.571722719352692e-05, + "grad_norm": 4.277245998382568, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8866283893585205, + "num_tokens": 853389607.0, + "step": 22364 + }, + { + "epoch": 2.845057880676759, + "ewc_loss": 0.008540532551705837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540532144252211e-05, + "grad_norm": 4.306933879852295, + "learning_rate": 1e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8901822566986084, + "num_tokens": 853427218.0, + "step": 22365 + }, + { + "epoch": 2.8451850909553493, + "ewc_loss": 0.008551987819373608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551987411919981e-05, + "grad_norm": 4.299525260925293, + "learning_rate": 1e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8882877826690674, + "num_tokens": 853464773.0, + "step": 22366 + }, + { + "epoch": 2.84531230123394, + "ewc_loss": 0.008533080108463764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533080108463764e-05, + "grad_norm": 4.246801853179932, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8837156295776367, + "num_tokens": 853507250.0, + "step": 22367 + }, + { + "epoch": 2.84543951151253, + "ewc_loss": 0.008512930013239384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.512930071447045e-05, + "grad_norm": 4.268948078155518, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8860488533973694, + "num_tokens": 853551947.0, + "step": 22368 + }, + { + "epoch": 2.845566721791121, + "ewc_loss": 0.008544543758034706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54454337968491e-05, + "grad_norm": 4.266555309295654, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8757345676422119, + "num_tokens": 853590557.0, + "step": 22369 + }, + { + "epoch": 
2.845693932069711, + "ewc_loss": 0.008539726957678795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539726695744321e-05, + "grad_norm": 4.333499431610107, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8769760131835938, + "num_tokens": 853630870.0, + "step": 22370 + }, + { + "epoch": 2.845821142348302, + "ewc_loss": 0.008573167957365513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573167724534869e-05, + "grad_norm": 4.415584087371826, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8828003406524658, + "num_tokens": 853662360.0, + "step": 22371 + }, + { + "epoch": 2.845948352626892, + "ewc_loss": 0.008577532134950161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577532571507618e-05, + "grad_norm": 4.269529342651367, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8780074119567871, + "num_tokens": 853703661.0, + "step": 22372 + }, + { + "epoch": 2.846075562905483, + "ewc_loss": 0.00849209912121296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.492099004797637e-05, + "grad_norm": 4.293745040893555, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8729376792907715, + "num_tokens": 853742398.0, + "step": 22373 + }, + { + "epoch": 2.846202773184073, + "ewc_loss": 0.00855324137955904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.553241787012666e-05, + "grad_norm": 4.298992156982422, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8899969458580017, + "num_tokens": 853781479.0, + "step": 22374 + }, + { + "epoch": 2.8463299834626636, + "ewc_loss": 0.008551026694476604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551026257919148e-05, + "grad_norm": 4.296270847320557, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8596698045730591, + "num_tokens": 853822330.0, + "step": 22375 + }, + { + "epoch": 2.846457193741254, + "ewc_loss": 0.00856068730354309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.56068727443926e-05, + 
"grad_norm": 4.289705276489258, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8707503080368042, + "num_tokens": 853862747.0, + "step": 22376 + }, + { + "epoch": 2.8465844040198447, + "ewc_loss": 0.008569223806262016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569224155507982e-05, + "grad_norm": 4.260560035705566, + "learning_rate": 1e-06, + "loss": 0.2729, + "mean_token_accuracy": 0.905726969242096, + "num_tokens": 853897124.0, + "step": 22377 + }, + { + "epoch": 2.8467116142984352, + "ewc_loss": 0.008574002422392368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574002276873216e-05, + "grad_norm": 4.346147537231445, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8710079193115234, + "num_tokens": 853930516.0, + "step": 22378 + }, + { + "epoch": 2.8468388245770258, + "ewc_loss": 0.00862399023026228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62399028846994e-05, + "grad_norm": 4.264824390411377, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8883578181266785, + "num_tokens": 853967016.0, + "step": 22379 + }, + { + "epoch": 2.8469660348556163, + "ewc_loss": 0.008540087379515171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540087583241984e-05, + "grad_norm": 4.269336700439453, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8671392798423767, + "num_tokens": 854008804.0, + "step": 22380 + }, + { + "epoch": 2.847093245134207, + "ewc_loss": 0.00859708059579134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597080159233883e-05, + "grad_norm": 4.3293137550354, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8722613453865051, + "num_tokens": 854048393.0, + "step": 22381 + }, + { + "epoch": 2.8472204554127973, + "ewc_loss": 0.00862451083958149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624510519439355e-05, + "grad_norm": 4.246094703674316, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.887554943561554, + 
"num_tokens": 854089480.0, + "step": 22382 + }, + { + "epoch": 2.847347665691388, + "ewc_loss": 0.0085491007193923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549101039534435e-05, + "grad_norm": 4.276531219482422, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8945406675338745, + "num_tokens": 854125677.0, + "step": 22383 + }, + { + "epoch": 2.8474748759699784, + "ewc_loss": 0.008593445643782616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.593445818405598e-05, + "grad_norm": 4.312870502471924, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8780369758605957, + "num_tokens": 854160485.0, + "step": 22384 + }, + { + "epoch": 2.847602086248569, + "ewc_loss": 0.008607856929302216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60785658005625e-05, + "grad_norm": 4.352145671844482, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8766785264015198, + "num_tokens": 854200635.0, + "step": 22385 + }, + { + "epoch": 2.8477292965271594, + "ewc_loss": 0.00862880889326334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.628809155197814e-05, + "grad_norm": 4.2617645263671875, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8881864547729492, + "num_tokens": 854240046.0, + "step": 22386 + }, + { + "epoch": 2.84785650680575, + "ewc_loss": 0.008570452220737934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570452337153256e-05, + "grad_norm": 4.311082363128662, + "learning_rate": 1e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.8939140439033508, + "num_tokens": 854275880.0, + "step": 22387 + }, + { + "epoch": 2.8479837170843405, + "ewc_loss": 0.00862176064401865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.621760207461193e-05, + "grad_norm": 4.301713466644287, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8823674917221069, + "num_tokens": 854311204.0, + "step": 22388 + }, + { + "epoch": 2.848110927362931, + "ewc_loss": 0.008608718402683735, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60871805343777e-05, + "grad_norm": 4.342600345611572, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8805429935455322, + "num_tokens": 854346076.0, + "step": 22389 + }, + { + "epoch": 2.8482381376415216, + "ewc_loss": 0.008624833077192307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624832844361663e-05, + "grad_norm": 4.273843288421631, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8909988403320312, + "num_tokens": 854383075.0, + "step": 22390 + }, + { + "epoch": 2.848365347920112, + "ewc_loss": 0.008598524145781994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598524436820298e-05, + "grad_norm": 4.350188732147217, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8779404163360596, + "num_tokens": 854416011.0, + "step": 22391 + }, + { + "epoch": 2.8484925581987026, + "ewc_loss": 0.008665097877383232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.665097993798554e-05, + "grad_norm": 4.31537389755249, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8857674598693848, + "num_tokens": 854449520.0, + "step": 22392 + }, + { + "epoch": 2.8486197684772927, + "ewc_loss": 0.00863065104931593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.630651427665725e-05, + "grad_norm": 4.308961868286133, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8798418045043945, + "num_tokens": 854484355.0, + "step": 22393 + }, + { + "epoch": 2.8487469787558837, + "ewc_loss": 0.008639917708933353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.639917359687388e-05, + "grad_norm": 4.285449028015137, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8694970607757568, + "num_tokens": 854526238.0, + "step": 22394 + }, + { + "epoch": 2.8488741890344738, + "ewc_loss": 0.008640715852379799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.640716259833425e-05, + "grad_norm": 4.2750091552734375, + "learning_rate": 
1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8842023611068726, + "num_tokens": 854562619.0, + "step": 22395 + }, + { + "epoch": 2.8490013993130647, + "ewc_loss": 0.008645245805382729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.645245543448254e-05, + "grad_norm": 4.250461101531982, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8878408670425415, + "num_tokens": 854604974.0, + "step": 22396 + }, + { + "epoch": 2.849128609591655, + "ewc_loss": 0.008648392744362354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.648392395116389e-05, + "grad_norm": 4.255350589752197, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8870526552200317, + "num_tokens": 854644066.0, + "step": 22397 + }, + { + "epoch": 2.849255819870246, + "ewc_loss": 0.008658035658299923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.658035221742466e-05, + "grad_norm": 4.296147346496582, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8860107064247131, + "num_tokens": 854682723.0, + "step": 22398 + }, + { + "epoch": 2.849383030148836, + "ewc_loss": 0.008649148046970367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.649147639516741e-05, + "grad_norm": 4.2300214767456055, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8918893337249756, + "num_tokens": 854720292.0, + "step": 22399 + }, + { + "epoch": 2.8495102404274264, + "ewc_loss": 0.0086101358756423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610136137576774e-05, + "grad_norm": 4.330451011657715, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.879098653793335, + "num_tokens": 854756917.0, + "step": 22400 + }, + { + "epoch": 2.849637450706017, + "ewc_loss": 0.008687806315720081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.68780625751242e-05, + "grad_norm": 4.269151210784912, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8854334354400635, + "num_tokens": 854798771.0, + "step": 22401 + }, + { 
+ "epoch": 2.8497646609846075, + "ewc_loss": 0.00861627422273159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616274135420099e-05, + "grad_norm": 4.282599925994873, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8810142278671265, + "num_tokens": 854838193.0, + "step": 22402 + }, + { + "epoch": 2.849891871263198, + "ewc_loss": 0.008621981367468834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.621981396572664e-05, + "grad_norm": 4.3176774978637695, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8773291110992432, + "num_tokens": 854871650.0, + "step": 22403 + }, + { + "epoch": 2.8500190815417885, + "ewc_loss": 0.008667903020977974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667902875458822e-05, + "grad_norm": 4.304614067077637, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.881354570388794, + "num_tokens": 854910757.0, + "step": 22404 + }, + { + "epoch": 2.850146291820379, + "ewc_loss": 0.008632521145045757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63252134877257e-05, + "grad_norm": 4.383887767791748, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8724250793457031, + "num_tokens": 854945018.0, + "step": 22405 + }, + { + "epoch": 2.8502735020989696, + "ewc_loss": 0.008670289069414139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.670288661960512e-05, + "grad_norm": 4.255284309387207, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.876128077507019, + "num_tokens": 854988369.0, + "step": 22406 + }, + { + "epoch": 2.85040071237756, + "ewc_loss": 0.008573158644139767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573158265789971e-05, + "grad_norm": 4.3067450523376465, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8854623436927795, + "num_tokens": 855023935.0, + "step": 22407 + }, + { + "epoch": 2.8505279226561506, + "ewc_loss": 0.008626971393823624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.626971248304471e-05, + "grad_norm": 4.279465198516846, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8814389705657959, + "num_tokens": 855062002.0, + "step": 22408 + }, + { + "epoch": 2.850655132934741, + "ewc_loss": 0.008602719753980637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602719753980637e-05, + "grad_norm": 4.28712272644043, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8869457244873047, + "num_tokens": 855101257.0, + "step": 22409 + }, + { + "epoch": 2.8507823432133317, + "ewc_loss": 0.008606321178376675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.606321352999657e-05, + "grad_norm": 4.324979305267334, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8846678733825684, + "num_tokens": 855135834.0, + "step": 22410 + }, + { + "epoch": 2.850909553491922, + "ewc_loss": 0.008631286211311817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631285891169682e-05, + "grad_norm": 4.269328594207764, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8852028846740723, + "num_tokens": 855174156.0, + "step": 22411 + }, + { + "epoch": 2.8510367637705127, + "ewc_loss": 0.008595251478254795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595251711085439e-05, + "grad_norm": 4.297452926635742, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8913257718086243, + "num_tokens": 855207698.0, + "step": 22412 + }, + { + "epoch": 2.8511639740491033, + "ewc_loss": 0.00863681547343731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636815618956462e-05, + "grad_norm": 4.331578254699707, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8837226629257202, + "num_tokens": 855240636.0, + "step": 22413 + }, + { + "epoch": 2.851291184327694, + "ewc_loss": 0.008635773323476315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63577370182611e-05, + "grad_norm": 4.245686054229736, + "learning_rate": 1e-06, + "loss": 0.324, + 
"mean_token_accuracy": 0.8850732445716858, + "num_tokens": 855279923.0, + "step": 22414 + }, + { + "epoch": 2.8514183946062843, + "ewc_loss": 0.008592430502176285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59243082231842e-05, + "grad_norm": 4.316744327545166, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8723530769348145, + "num_tokens": 855315460.0, + "step": 22415 + }, + { + "epoch": 2.851545604884875, + "ewc_loss": 0.008655766025185585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.655765850562602e-05, + "grad_norm": 4.27078914642334, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8896541595458984, + "num_tokens": 855351548.0, + "step": 22416 + }, + { + "epoch": 2.8516728151634654, + "ewc_loss": 0.008608538657426834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.608538337284699e-05, + "grad_norm": 4.392788410186768, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8659974336624146, + "num_tokens": 855390715.0, + "step": 22417 + }, + { + "epoch": 2.8518000254420555, + "ewc_loss": 0.00869447086006403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.694471034687012e-05, + "grad_norm": 4.260279178619385, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8699734210968018, + "num_tokens": 855431997.0, + "step": 22418 + }, + { + "epoch": 2.8519272357206464, + "ewc_loss": 0.008574182167649269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.574181993026286e-05, + "grad_norm": 4.283324718475342, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8790407776832581, + "num_tokens": 855470242.0, + "step": 22419 + }, + { + "epoch": 2.8520544459992365, + "ewc_loss": 0.008664057590067387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.664057531859726e-05, + "grad_norm": 4.284844398498535, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.872019350528717, + "num_tokens": 855508901.0, + "step": 22420 + }, + { + "epoch": 
2.8521816562778275, + "ewc_loss": 0.008635972626507282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.635972335468978e-05, + "grad_norm": 4.310683250427246, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8784143924713135, + "num_tokens": 855545290.0, + "step": 22421 + }, + { + "epoch": 2.8523088665564176, + "ewc_loss": 0.008652175776660442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.652175893075764e-05, + "grad_norm": 4.269949913024902, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8801138401031494, + "num_tokens": 855579567.0, + "step": 22422 + }, + { + "epoch": 2.852436076835008, + "ewc_loss": 0.008646121248602867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.646121568745002e-05, + "grad_norm": 4.267222881317139, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8741247057914734, + "num_tokens": 855621117.0, + "step": 22423 + }, + { + "epoch": 2.8525632871135986, + "ewc_loss": 0.0086541548371315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.654154953546822e-05, + "grad_norm": 4.261166572570801, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8839694261550903, + "num_tokens": 855660494.0, + "step": 22424 + }, + { + "epoch": 2.852690497392189, + "ewc_loss": 0.008634356781840324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.634357072878629e-05, + "grad_norm": 4.318230152130127, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8820167183876038, + "num_tokens": 855694043.0, + "step": 22425 + }, + { + "epoch": 2.8528177076707797, + "ewc_loss": 0.008699762634932995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.699762838659808e-05, + "grad_norm": 4.309377193450928, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.869472324848175, + "num_tokens": 855734615.0, + "step": 22426 + }, + { + "epoch": 2.85294491794937, + "ewc_loss": 0.00865646731108427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.65646725287661e-05, + "grad_norm": 4.283140659332275, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.8917618989944458, + "num_tokens": 855771141.0, + "step": 22427 + }, + { + "epoch": 2.8530721282279607, + "ewc_loss": 0.00866754911839962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667549263918772e-05, + "grad_norm": 4.346853256225586, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8816075325012207, + "num_tokens": 855806877.0, + "step": 22428 + }, + { + "epoch": 2.8531993385065513, + "ewc_loss": 0.00871516764163971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.715167496120557e-05, + "grad_norm": 4.3171467781066895, + "learning_rate": 1e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.8959846496582031, + "num_tokens": 855840076.0, + "step": 22429 + }, + { + "epoch": 2.853326548785142, + "ewc_loss": 0.008656877093017101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.656876889290288e-05, + "grad_norm": 4.3043131828308105, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8921480178833008, + "num_tokens": 855877972.0, + "step": 22430 + }, + { + "epoch": 2.8534537590637323, + "ewc_loss": 0.008683118969202042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.683119085617363e-05, + "grad_norm": 4.285203456878662, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8772855401039124, + "num_tokens": 855917424.0, + "step": 22431 + }, + { + "epoch": 2.853580969342323, + "ewc_loss": 0.008679122664034367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.679122402099892e-05, + "grad_norm": 4.353949546813965, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8737103343009949, + "num_tokens": 855954853.0, + "step": 22432 + }, + { + "epoch": 2.8537081796209134, + "ewc_loss": 0.008701791986823082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.701792103238404e-05, + "grad_norm": 4.247110843658447, + "learning_rate": 1e-06, + "loss": 0.3327, + 
"mean_token_accuracy": 0.8839623928070068, + "num_tokens": 855993300.0, + "step": 22433 + }, + { + "epoch": 2.853835389899504, + "ewc_loss": 0.008628392592072487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.628392242826521e-05, + "grad_norm": 4.285347938537598, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8865103125572205, + "num_tokens": 856035307.0, + "step": 22434 + }, + { + "epoch": 2.8539626001780944, + "ewc_loss": 0.00869055837392807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.690558024682105e-05, + "grad_norm": 4.354898929595947, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8809001445770264, + "num_tokens": 856071801.0, + "step": 22435 + }, + { + "epoch": 2.854089810456685, + "ewc_loss": 0.008683478459715843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.683478517923504e-05, + "grad_norm": 4.274956226348877, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8710654973983765, + "num_tokens": 856112035.0, + "step": 22436 + }, + { + "epoch": 2.8542170207352755, + "ewc_loss": 0.00862028356641531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.620283915661275e-05, + "grad_norm": 4.2780256271362305, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.888640284538269, + "num_tokens": 856148275.0, + "step": 22437 + }, + { + "epoch": 2.854344231013866, + "ewc_loss": 0.008659101091325283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.659101149532944e-05, + "grad_norm": 4.370303630828857, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8861002922058105, + "num_tokens": 856180330.0, + "step": 22438 + }, + { + "epoch": 2.8544714412924566, + "ewc_loss": 0.008685692213475704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6856925918255e-05, + "grad_norm": 4.297191619873047, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8719538450241089, + "num_tokens": 856219354.0, + "step": 22439 + }, + { + "epoch": 
2.854598651571047, + "ewc_loss": 0.008624588139355183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624588372185826e-05, + "grad_norm": 4.253341197967529, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8925150036811829, + "num_tokens": 856257583.0, + "step": 22440 + }, + { + "epoch": 2.8547258618496376, + "ewc_loss": 0.008608223870396614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.608224015915766e-05, + "grad_norm": 4.313446521759033, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8653526306152344, + "num_tokens": 856298056.0, + "step": 22441 + }, + { + "epoch": 2.854853072128228, + "ewc_loss": 0.00866679660975933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.666796929901466e-05, + "grad_norm": 4.331152439117432, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8816525936126709, + "num_tokens": 856334109.0, + "step": 22442 + }, + { + "epoch": 2.8549802824068182, + "ewc_loss": 0.008642103523015976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64210378495045e-05, + "grad_norm": 4.246464252471924, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8858292102813721, + "num_tokens": 856376655.0, + "step": 22443 + }, + { + "epoch": 2.855107492685409, + "ewc_loss": 0.008578531444072723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578531560488045e-05, + "grad_norm": 4.277639865875244, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.8984007835388184, + "num_tokens": 856409926.0, + "step": 22444 + }, + { + "epoch": 2.8552347029639993, + "ewc_loss": 0.008641219697892666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.641219756100327e-05, + "grad_norm": 4.3087992668151855, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8749841451644897, + "num_tokens": 856451471.0, + "step": 22445 + }, + { + "epoch": 2.8553619132425903, + "ewc_loss": 0.008615368977189064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.615369006292894e-05, + "grad_norm": 4.360541343688965, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8900848627090454, + "num_tokens": 856483237.0, + "step": 22446 + }, + { + "epoch": 2.8554891235211803, + "ewc_loss": 0.00863697286695242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636972779640928e-05, + "grad_norm": 4.267185211181641, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8763234615325928, + "num_tokens": 856521278.0, + "step": 22447 + }, + { + "epoch": 2.855616333799771, + "ewc_loss": 0.00856308825314045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.563088340451941e-05, + "grad_norm": 4.265371322631836, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.878733217716217, + "num_tokens": 856565048.0, + "step": 22448 + }, + { + "epoch": 2.8557435440783614, + "ewc_loss": 0.00858280062675476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582801092416048e-05, + "grad_norm": 4.236278533935547, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8847919702529907, + "num_tokens": 856603756.0, + "step": 22449 + }, + { + "epoch": 2.855870754356952, + "ewc_loss": 0.008607846684753895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607847121311352e-05, + "grad_norm": 4.295487880706787, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8882671594619751, + "num_tokens": 856640677.0, + "step": 22450 + }, + { + "epoch": 2.8559979646355425, + "ewc_loss": 0.008627627044916153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627626812085509e-05, + "grad_norm": 4.2691779136657715, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8831206560134888, + "num_tokens": 856679982.0, + "step": 22451 + }, + { + "epoch": 2.856125174914133, + "ewc_loss": 0.00858384370803833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58384373714216e-05, + "grad_norm": 4.3313422203063965, + "learning_rate": 1e-06, + "loss": 0.3652, + 
"mean_token_accuracy": 0.8721474409103394, + "num_tokens": 856720621.0, + "step": 22452 + }, + { + "epoch": 2.8562523851927235, + "ewc_loss": 0.008626005612313747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.626005728729069e-05, + "grad_norm": 4.286828994750977, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8897238373756409, + "num_tokens": 856757172.0, + "step": 22453 + }, + { + "epoch": 2.856379595471314, + "ewc_loss": 0.008601120673120022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601120498497039e-05, + "grad_norm": 4.497932434082031, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8808817267417908, + "num_tokens": 856796876.0, + "step": 22454 + }, + { + "epoch": 2.8565068057499046, + "ewc_loss": 0.008705214597284794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.705214713700116e-05, + "grad_norm": 4.303492546081543, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8830729722976685, + "num_tokens": 856831148.0, + "step": 22455 + }, + { + "epoch": 2.856634016028495, + "ewc_loss": 0.008538151159882545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53815145092085e-05, + "grad_norm": 4.294067859649658, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8784959316253662, + "num_tokens": 856867091.0, + "step": 22456 + }, + { + "epoch": 2.8567612263070856, + "ewc_loss": 0.008603816851973534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603816968388855e-05, + "grad_norm": 4.304725170135498, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8731593489646912, + "num_tokens": 856906118.0, + "step": 22457 + }, + { + "epoch": 2.856888436585676, + "ewc_loss": 0.008610489778220654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610489749116823e-05, + "grad_norm": 4.25006628036499, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8586478233337402, + "num_tokens": 856942694.0, + "step": 22458 + }, + { + "epoch": 
2.8570156468642667, + "ewc_loss": 0.008592706173658371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592706581111997e-05, + "grad_norm": 4.33355712890625, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8738785982131958, + "num_tokens": 856977530.0, + "step": 22459 + }, + { + "epoch": 2.857142857142857, + "ewc_loss": 0.008679033257067204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.679032907821238e-05, + "grad_norm": 4.268556118011475, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8866522312164307, + "num_tokens": 857012417.0, + "step": 22460 + }, + { + "epoch": 2.8572700674214477, + "ewc_loss": 0.008631136268377304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63113600644283e-05, + "grad_norm": 4.318431854248047, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8769384026527405, + "num_tokens": 857053556.0, + "step": 22461 + }, + { + "epoch": 2.8573972777000383, + "ewc_loss": 0.008687181398272514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.687181252753362e-05, + "grad_norm": 4.2665228843688965, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8859161138534546, + "num_tokens": 857094514.0, + "step": 22462 + }, + { + "epoch": 2.857524487978629, + "ewc_loss": 0.008659278973937035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.659279410494491e-05, + "grad_norm": 4.281351089477539, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8832963705062866, + "num_tokens": 857133670.0, + "step": 22463 + }, + { + "epoch": 2.8576516982572193, + "ewc_loss": 0.008667281828820705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.66728150867857e-05, + "grad_norm": 4.330920219421387, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8775320053100586, + "num_tokens": 857167934.0, + "step": 22464 + }, + { + "epoch": 2.85777890853581, + "ewc_loss": 0.008692028932273388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.692028495715931e-05, + "grad_norm": 4.316009998321533, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8657377362251282, + "num_tokens": 857208263.0, + "step": 22465 + }, + { + "epoch": 2.8579061188144, + "ewc_loss": 0.008675816468894482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.675816206960008e-05, + "grad_norm": 4.262423992156982, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8660991191864014, + "num_tokens": 857255107.0, + "step": 22466 + }, + { + "epoch": 2.858033329092991, + "ewc_loss": 0.008647535927593708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6475360149052e-05, + "grad_norm": 4.287323474884033, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.880501389503479, + "num_tokens": 857298368.0, + "step": 22467 + }, + { + "epoch": 2.858160539371581, + "ewc_loss": 0.008694506250321865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.694506686879322e-05, + "grad_norm": 4.308923244476318, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8833154439926147, + "num_tokens": 857336899.0, + "step": 22468 + }, + { + "epoch": 2.858287749650172, + "ewc_loss": 0.00867933128029108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67933122208342e-05, + "grad_norm": 4.274895191192627, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.887126624584198, + "num_tokens": 857378482.0, + "step": 22469 + }, + { + "epoch": 2.858414959928762, + "ewc_loss": 0.008644233457744122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.644233457744122e-05, + "grad_norm": 4.306185722351074, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.890781581401825, + "num_tokens": 857413203.0, + "step": 22470 + }, + { + "epoch": 2.858542170207353, + "ewc_loss": 0.008680891245603561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6808911873959e-05, + "grad_norm": 4.376110076904297, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 
0.8801240921020508, + "num_tokens": 857446706.0, + "step": 22471 + }, + { + "epoch": 2.858669380485943, + "ewc_loss": 0.008694373071193695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.694372809259221e-05, + "grad_norm": 4.309252738952637, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8865221738815308, + "num_tokens": 857483415.0, + "step": 22472 + }, + { + "epoch": 2.8587965907645336, + "ewc_loss": 0.00863875262439251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.638752478873357e-05, + "grad_norm": 4.2419047355651855, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.888688862323761, + "num_tokens": 857525323.0, + "step": 22473 + }, + { + "epoch": 2.858923801043124, + "ewc_loss": 0.008618650957942009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618651190772653e-05, + "grad_norm": 4.260282039642334, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8983362913131714, + "num_tokens": 857564400.0, + "step": 22474 + }, + { + "epoch": 2.8590510113217147, + "ewc_loss": 0.008662382140755653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.662381878821179e-05, + "grad_norm": 4.323282241821289, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8812268376350403, + "num_tokens": 857602756.0, + "step": 22475 + }, + { + "epoch": 2.859178221600305, + "ewc_loss": 0.008678673766553402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.678674203110859e-05, + "grad_norm": 4.313140392303467, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8831851482391357, + "num_tokens": 857636875.0, + "step": 22476 + }, + { + "epoch": 2.8593054318788957, + "ewc_loss": 0.00865307915955782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.653079567011446e-05, + "grad_norm": 4.296261787414551, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8977436423301697, + "num_tokens": 857671950.0, + "step": 22477 + }, + { + "epoch": 2.8594326421574863, + "ewc_loss": 
0.008662693202495575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.662693289807066e-05, + "grad_norm": 4.303153991699219, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8952597379684448, + "num_tokens": 857705274.0, + "step": 22478 + }, + { + "epoch": 2.859559852436077, + "ewc_loss": 0.008650843985378742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.650844392832369e-05, + "grad_norm": 4.3029069900512695, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8767417669296265, + "num_tokens": 857743730.0, + "step": 22479 + }, + { + "epoch": 2.8596870627146673, + "ewc_loss": 0.008658979088068008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.658978913445026e-05, + "grad_norm": 4.289700984954834, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8817993402481079, + "num_tokens": 857784055.0, + "step": 22480 + }, + { + "epoch": 2.859814272993258, + "ewc_loss": 0.00865622702986002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.656227146275342e-05, + "grad_norm": 4.3046417236328125, + "learning_rate": 1e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8897163271903992, + "num_tokens": 857822431.0, + "step": 22481 + }, + { + "epoch": 2.8599414832718484, + "ewc_loss": 0.008648675866425037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64867542986758e-05, + "grad_norm": 4.293330192565918, + "learning_rate": 1e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8949384093284607, + "num_tokens": 857859258.0, + "step": 22482 + }, + { + "epoch": 2.860068693550439, + "ewc_loss": 0.008632278069853783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.632277604192495e-05, + "grad_norm": 4.367815017700195, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.876089334487915, + "num_tokens": 857901209.0, + "step": 22483 + }, + { + "epoch": 2.8601959038290294, + "ewc_loss": 0.008667458780109882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667459042044356e-05, + "grad_norm": 
4.317466735839844, + "learning_rate": 1e-06, + "loss": 0.2998, + "mean_token_accuracy": 0.8954503536224365, + "num_tokens": 857935563.0, + "step": 22484 + }, + { + "epoch": 2.86032311410762, + "ewc_loss": 0.008615599945187569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615600381745026e-05, + "grad_norm": 4.39596700668335, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8741410970687866, + "num_tokens": 857966777.0, + "step": 22485 + }, + { + "epoch": 2.8604503243862105, + "ewc_loss": 0.008673478849232197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67347844177857e-05, + "grad_norm": 4.303288459777832, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8877650499343872, + "num_tokens": 858005078.0, + "step": 22486 + }, + { + "epoch": 2.860577534664801, + "ewc_loss": 0.008599364198744297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.599364082328975e-05, + "grad_norm": 4.272469520568848, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8716557621955872, + "num_tokens": 858049288.0, + "step": 22487 + }, + { + "epoch": 2.8607047449433916, + "ewc_loss": 0.008616911247372627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616911509307101e-05, + "grad_norm": 4.255585193634033, + "learning_rate": 1e-06, + "loss": 0.2845, + "mean_token_accuracy": 0.9004684686660767, + "num_tokens": 858089834.0, + "step": 22488 + }, + { + "epoch": 2.860831955221982, + "ewc_loss": 0.008601292967796326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601292938692495e-05, + "grad_norm": 4.3227715492248535, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8875539302825928, + "num_tokens": 858124152.0, + "step": 22489 + }, + { + "epoch": 2.8609591655005726, + "ewc_loss": 0.008647146634757519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64714675117284e-05, + "grad_norm": 4.303925037384033, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.887662947177887, + 
"num_tokens": 858163202.0, + "step": 22490 + }, + { + "epoch": 2.8610863757791627, + "ewc_loss": 0.008605793118476868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.605793118476868e-05, + "grad_norm": 4.2669196128845215, + "learning_rate": 1e-06, + "loss": 0.2806, + "mean_token_accuracy": 0.9014567136764526, + "num_tokens": 858199173.0, + "step": 22491 + }, + { + "epoch": 2.8612135860577537, + "ewc_loss": 0.00860118493437767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601185254519805e-05, + "grad_norm": 4.3105268478393555, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8796491026878357, + "num_tokens": 858239693.0, + "step": 22492 + }, + { + "epoch": 2.8613407963363438, + "ewc_loss": 0.008625784888863564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625784539617598e-05, + "grad_norm": 4.28175687789917, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8889622688293457, + "num_tokens": 858280720.0, + "step": 22493 + }, + { + "epoch": 2.8614680066149347, + "ewc_loss": 0.008595218881964684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595218969276175e-05, + "grad_norm": 4.325870037078857, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8950486779212952, + "num_tokens": 858314145.0, + "step": 22494 + }, + { + "epoch": 2.861595216893525, + "ewc_loss": 0.008617233484983444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61723383422941e-05, + "grad_norm": 4.283533573150635, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8855615854263306, + "num_tokens": 858355177.0, + "step": 22495 + }, + { + "epoch": 2.861722427172116, + "ewc_loss": 0.00857135746628046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57135746628046e-05, + "grad_norm": 4.260342121124268, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8860533237457275, + "num_tokens": 858393070.0, + "step": 22496 + }, + { + "epoch": 2.861849637450706, + "ewc_loss": 0.008576012216508389, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576012623962015e-05, + "grad_norm": 4.325677871704102, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8832484483718872, + "num_tokens": 858431827.0, + "step": 22497 + }, + { + "epoch": 2.8619768477292964, + "ewc_loss": 0.008616967126727104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616966806584969e-05, + "grad_norm": 4.240283012390137, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8809874057769775, + "num_tokens": 858474990.0, + "step": 22498 + }, + { + "epoch": 2.862104058007887, + "ewc_loss": 0.008538703434169292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.538703696103767e-05, + "grad_norm": 4.2868852615356445, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8955802917480469, + "num_tokens": 858514698.0, + "step": 22499 + }, + { + "epoch": 2.8622312682864774, + "ewc_loss": 0.00859315600246191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.593156235292554e-05, + "grad_norm": 4.258387565612793, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8843898177146912, + "num_tokens": 858557601.0, + "step": 22500 + }, + { + "epoch": 2.862358478565068, + "ewc_loss": 0.008542424067854881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542423893231899e-05, + "grad_norm": 4.272915363311768, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.890389621257782, + "num_tokens": 858596830.0, + "step": 22501 + }, + { + "epoch": 2.8624856888436585, + "ewc_loss": 0.008562972769141197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562972652725875e-05, + "grad_norm": 4.294228553771973, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8725712895393372, + "num_tokens": 858637555.0, + "step": 22502 + }, + { + "epoch": 2.862612899122249, + "ewc_loss": 0.008565414696931839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565414464101195e-05, + "grad_norm": 4.306105613708496, + 
"learning_rate": 1e-06, + "loss": 0.2791, + "mean_token_accuracy": 0.900593638420105, + "num_tokens": 858672813.0, + "step": 22503 + }, + { + "epoch": 2.8627401094008396, + "ewc_loss": 0.008554046042263508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.554045780329034e-05, + "grad_norm": 4.30729341506958, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8881849646568298, + "num_tokens": 858710743.0, + "step": 22504 + }, + { + "epoch": 2.86286731967943, + "ewc_loss": 0.00854070857167244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540708222426474e-05, + "grad_norm": 4.402597427368164, + "learning_rate": 1e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.8949199914932251, + "num_tokens": 858739486.0, + "step": 22505 + }, + { + "epoch": 2.8629945299580206, + "ewc_loss": 0.008586783893406391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586783951614052e-05, + "grad_norm": 4.269347190856934, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8638032674789429, + "num_tokens": 858780678.0, + "step": 22506 + }, + { + "epoch": 2.863121740236611, + "ewc_loss": 0.00848402176052332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.484021964250132e-05, + "grad_norm": 4.227145195007324, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8802413940429688, + "num_tokens": 858826112.0, + "step": 22507 + }, + { + "epoch": 2.8632489505152017, + "ewc_loss": 0.008531864732503891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531864295946434e-05, + "grad_norm": 4.28952169418335, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8957429528236389, + "num_tokens": 858862158.0, + "step": 22508 + }, + { + "epoch": 2.863376160793792, + "ewc_loss": 0.008569363504648209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569363853894174e-05, + "grad_norm": 4.309096336364746, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8873689770698547, + "num_tokens": 858905270.0, + "step": 
22509 + }, + { + "epoch": 2.8635033710723827, + "ewc_loss": 0.008549575693905354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.54957543197088e-05, + "grad_norm": 4.295308589935303, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8778014183044434, + "num_tokens": 858941802.0, + "step": 22510 + }, + { + "epoch": 2.8636305813509733, + "ewc_loss": 0.008524470031261444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.52446974022314e-05, + "grad_norm": 4.3414692878723145, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8869097232818604, + "num_tokens": 858974672.0, + "step": 22511 + }, + { + "epoch": 2.863757791629564, + "ewc_loss": 0.008578959852457047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57896011439152e-05, + "grad_norm": 4.274784088134766, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8823481202125549, + "num_tokens": 859015305.0, + "step": 22512 + }, + { + "epoch": 2.8638850019081543, + "ewc_loss": 0.008533100597560406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533100481145084e-05, + "grad_norm": 4.325159549713135, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.876815915107727, + "num_tokens": 859050646.0, + "step": 22513 + }, + { + "epoch": 2.864012212186745, + "ewc_loss": 0.008584653958678246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58465427882038e-05, + "grad_norm": 4.257686614990234, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8931153416633606, + "num_tokens": 859090139.0, + "step": 22514 + }, + { + "epoch": 2.8641394224653354, + "ewc_loss": 0.008545733988285065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545734453946352e-05, + "grad_norm": 4.278173923492432, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8884721398353577, + "num_tokens": 859130661.0, + "step": 22515 + }, + { + "epoch": 2.8642666327439255, + "ewc_loss": 0.008567001670598984, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.567001350456849e-05, + "grad_norm": 4.298764705657959, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8829504251480103, + "num_tokens": 859166399.0, + "step": 22516 + }, + { + "epoch": 2.8643938430225164, + "ewc_loss": 0.008561011403799057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.561011054553092e-05, + "grad_norm": 4.276541709899902, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.883023738861084, + "num_tokens": 859208349.0, + "step": 22517 + }, + { + "epoch": 2.8645210533011065, + "ewc_loss": 0.00854804553091526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548045298084617e-05, + "grad_norm": 4.323690891265869, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8854585886001587, + "num_tokens": 859243784.0, + "step": 22518 + }, + { + "epoch": 2.8646482635796975, + "ewc_loss": 0.008560486137866974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560486458009109e-05, + "grad_norm": 4.351980686187744, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8698107004165649, + "num_tokens": 859278345.0, + "step": 22519 + }, + { + "epoch": 2.8647754738582876, + "ewc_loss": 0.008596831001341343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.596831321483478e-05, + "grad_norm": 4.30731725692749, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8725928068161011, + "num_tokens": 859317454.0, + "step": 22520 + }, + { + "epoch": 2.864902684136878, + "ewc_loss": 0.008572968654334545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572969090892002e-05, + "grad_norm": 4.3282647132873535, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.879802942276001, + "num_tokens": 859352076.0, + "step": 22521 + }, + { + "epoch": 2.8650298944154686, + "ewc_loss": 0.00858333334326744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583333692513406e-05, + "grad_norm": 4.277156829833984, + "learning_rate": 1e-06, + "loss": 
0.3102, + "mean_token_accuracy": 0.8923165798187256, + "num_tokens": 859389957.0, + "step": 22522 + }, + { + "epoch": 2.865157104694059, + "ewc_loss": 0.008590889163315296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590889046899974e-05, + "grad_norm": 4.303452014923096, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8835057020187378, + "num_tokens": 859425167.0, + "step": 22523 + }, + { + "epoch": 2.8652843149726497, + "ewc_loss": 0.008618813008069992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618812717031687e-05, + "grad_norm": 4.269120216369629, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8815374374389648, + "num_tokens": 859470999.0, + "step": 22524 + }, + { + "epoch": 2.86541152525124, + "ewc_loss": 0.008598537184298038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598536805948243e-05, + "grad_norm": 4.322220325469971, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8881447315216064, + "num_tokens": 859503763.0, + "step": 22525 + }, + { + "epoch": 2.8655387355298307, + "ewc_loss": 0.008613680489361286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613680256530643e-05, + "grad_norm": 4.296250820159912, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8908153176307678, + "num_tokens": 859537739.0, + "step": 22526 + }, + { + "epoch": 2.8656659458084213, + "ewc_loss": 0.008590794168412685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59079445945099e-05, + "grad_norm": 4.27970027923584, + "learning_rate": 1e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.892569363117218, + "num_tokens": 859573497.0, + "step": 22527 + }, + { + "epoch": 2.865793156087012, + "ewc_loss": 0.008589010685682297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589010394643992e-05, + "grad_norm": 4.36058235168457, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8800230622291565, + "num_tokens": 859606997.0, + "step": 22528 + }, + { + "epoch": 
2.8659203663656023, + "ewc_loss": 0.008650797419250011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.650797826703638e-05, + "grad_norm": 4.347550392150879, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.873272180557251, + "num_tokens": 859642278.0, + "step": 22529 + }, + { + "epoch": 2.866047576644193, + "ewc_loss": 0.008608582429587841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.608582720626146e-05, + "grad_norm": 4.216226577758789, + "learning_rate": 1e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8933911919593811, + "num_tokens": 859684728.0, + "step": 22530 + }, + { + "epoch": 2.8661747869227834, + "ewc_loss": 0.008547214791178703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.547215111320838e-05, + "grad_norm": 4.31005859375, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8780815601348877, + "num_tokens": 859720946.0, + "step": 22531 + }, + { + "epoch": 2.866301997201374, + "ewc_loss": 0.008637242019176483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.637241990072653e-05, + "grad_norm": 4.33856201171875, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8831837773323059, + "num_tokens": 859752605.0, + "step": 22532 + }, + { + "epoch": 2.8664292074799644, + "ewc_loss": 0.008629180490970612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.629180956631899e-05, + "grad_norm": 4.304346084594727, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.896550178527832, + "num_tokens": 859784206.0, + "step": 22533 + }, + { + "epoch": 2.866556417758555, + "ewc_loss": 0.008597123436629772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597123087383807e-05, + "grad_norm": 4.267176628112793, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.880129337310791, + "num_tokens": 859828142.0, + "step": 22534 + }, + { + "epoch": 2.8666836280371455, + "ewc_loss": 0.008603845722973347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603845344623551e-05, + 
"grad_norm": 4.27511739730835, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8799697160720825, + "num_tokens": 859870032.0, + "step": 22535 + }, + { + "epoch": 2.866810838315736, + "ewc_loss": 0.008615094237029552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615093975095078e-05, + "grad_norm": 4.289846420288086, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8735480308532715, + "num_tokens": 859911417.0, + "step": 22536 + }, + { + "epoch": 2.8669380485943265, + "ewc_loss": 0.008593295700848103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.593295933678746e-05, + "grad_norm": 4.229625225067139, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.8946225643157959, + "num_tokens": 859953929.0, + "step": 22537 + }, + { + "epoch": 2.867065258872917, + "ewc_loss": 0.008566000498831272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566000178689137e-05, + "grad_norm": 4.371661186218262, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8873701095581055, + "num_tokens": 859985402.0, + "step": 22538 + }, + { + "epoch": 2.8671924691515076, + "ewc_loss": 0.008666790090501308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.666789653943852e-05, + "grad_norm": 4.39391565322876, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8808247447013855, + "num_tokens": 860017374.0, + "step": 22539 + }, + { + "epoch": 2.867319679430098, + "ewc_loss": 0.008619830943644047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619831351097673e-05, + "grad_norm": 4.243168830871582, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8933533430099487, + "num_tokens": 860058217.0, + "step": 22540 + }, + { + "epoch": 2.867446889708688, + "ewc_loss": 0.00854284968227148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.542849536752328e-05, + "grad_norm": 4.357636451721191, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8685727119445801, + 
"num_tokens": 860092317.0, + "step": 22541 + }, + { + "epoch": 2.867574099987279, + "ewc_loss": 0.008647246286273003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647246431792155e-05, + "grad_norm": 4.3342156410217285, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8805949687957764, + "num_tokens": 860125067.0, + "step": 22542 + }, + { + "epoch": 2.8677013102658693, + "ewc_loss": 0.008602402172982693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602401794632897e-05, + "grad_norm": 4.287283897399902, + "learning_rate": 1e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8949830532073975, + "num_tokens": 860163805.0, + "step": 22543 + }, + { + "epoch": 2.8678285205444602, + "ewc_loss": 0.008592604659497738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592604717705399e-05, + "grad_norm": 4.283477306365967, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8726787567138672, + "num_tokens": 860200178.0, + "step": 22544 + }, + { + "epoch": 2.8679557308230503, + "ewc_loss": 0.008625432848930359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625433110864833e-05, + "grad_norm": 4.278128623962402, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8833521604537964, + "num_tokens": 860235283.0, + "step": 22545 + }, + { + "epoch": 2.868082941101641, + "ewc_loss": 0.008625010028481483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625010377727449e-05, + "grad_norm": 4.218500137329102, + "learning_rate": 1e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.899281919002533, + "num_tokens": 860276776.0, + "step": 22546 + }, + { + "epoch": 2.8682101513802314, + "ewc_loss": 0.008591827005147934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.591826917836443e-05, + "grad_norm": 4.247364044189453, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.887863278388977, + "num_tokens": 860318492.0, + "step": 22547 + }, + { + "epoch": 2.868337361658822, + "ewc_loss": 0.008629197254776955, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.629197691334412e-05, + "grad_norm": 4.3185553550720215, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8704087734222412, + "num_tokens": 860358556.0, + "step": 22548 + }, + { + "epoch": 2.8684645719374124, + "ewc_loss": 0.008644195273518562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.644194895168766e-05, + "grad_norm": 4.358004570007324, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8770197629928589, + "num_tokens": 860391114.0, + "step": 22549 + }, + { + "epoch": 2.868591782216003, + "ewc_loss": 0.00866853166371584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.668531518196687e-05, + "grad_norm": 4.321617126464844, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8789659142494202, + "num_tokens": 860430696.0, + "step": 22550 + }, + { + "epoch": 2.8687189924945935, + "ewc_loss": 0.008618224412202835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618224819656461e-05, + "grad_norm": 4.247729301452637, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8902634382247925, + "num_tokens": 860471055.0, + "step": 22551 + }, + { + "epoch": 2.868846202773184, + "ewc_loss": 0.008589803241193295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589803474023938e-05, + "grad_norm": 4.286984920501709, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8760034441947937, + "num_tokens": 860512472.0, + "step": 22552 + }, + { + "epoch": 2.8689734130517746, + "ewc_loss": 0.008632870391011238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.632870594738051e-05, + "grad_norm": 4.411840915679932, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8792415857315063, + "num_tokens": 860541014.0, + "step": 22553 + }, + { + "epoch": 2.869100623330365, + "ewc_loss": 0.008691905066370964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.691905532032251e-05, + "grad_norm": 4.239767074584961, + 
"learning_rate": 1e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.8964321613311768, + "num_tokens": 860585025.0, + "step": 22554 + }, + { + "epoch": 2.8692278336089556, + "ewc_loss": 0.00857079029083252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570789941586554e-05, + "grad_norm": 4.350516319274902, + "learning_rate": 1e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8911893367767334, + "num_tokens": 860616361.0, + "step": 22555 + }, + { + "epoch": 2.869355043887546, + "ewc_loss": 0.008673432283103466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67343187564984e-05, + "grad_norm": 4.32088041305542, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8764456510543823, + "num_tokens": 860655976.0, + "step": 22556 + }, + { + "epoch": 2.8694822541661367, + "ewc_loss": 0.008601376786828041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601376612205058e-05, + "grad_norm": 4.249632358551025, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8709825873374939, + "num_tokens": 860703949.0, + "step": 22557 + }, + { + "epoch": 2.869609464444727, + "ewc_loss": 0.00858495943248272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.584959869040176e-05, + "grad_norm": 4.309405326843262, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8830424547195435, + "num_tokens": 860739935.0, + "step": 22558 + }, + { + "epoch": 2.8697366747233177, + "ewc_loss": 0.008646874688565731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64687463035807e-05, + "grad_norm": 4.274303913116455, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8861562013626099, + "num_tokens": 860782483.0, + "step": 22559 + }, + { + "epoch": 2.8698638850019083, + "ewc_loss": 0.00860203430056572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60203435877338e-05, + "grad_norm": 4.322216510772705, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8734779357910156, + "num_tokens": 860816821.0, + "step": 
22560 + }, + { + "epoch": 2.869991095280499, + "ewc_loss": 0.008619621396064758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619621075922623e-05, + "grad_norm": 4.35107946395874, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8719635009765625, + "num_tokens": 860851313.0, + "step": 22561 + }, + { + "epoch": 2.8701183055590893, + "ewc_loss": 0.008633757941424847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.633758261566982e-05, + "grad_norm": 4.272244453430176, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8919999599456787, + "num_tokens": 860890540.0, + "step": 22562 + }, + { + "epoch": 2.87024551583768, + "ewc_loss": 0.008571164682507515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.571164653403684e-05, + "grad_norm": 4.315640449523926, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.877413272857666, + "num_tokens": 860929686.0, + "step": 22563 + }, + { + "epoch": 2.87037272611627, + "ewc_loss": 0.00864909403026104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.649093797430396e-05, + "grad_norm": 4.28268575668335, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8868526816368103, + "num_tokens": 860968186.0, + "step": 22564 + }, + { + "epoch": 2.870499936394861, + "ewc_loss": 0.00862281583249569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622815948911011e-05, + "grad_norm": 4.271399974822998, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8791333436965942, + "num_tokens": 861007252.0, + "step": 22565 + }, + { + "epoch": 2.870627146673451, + "ewc_loss": 0.00861592497676611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615924889454618e-05, + "grad_norm": 4.241174697875977, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.884005069732666, + "num_tokens": 861049898.0, + "step": 22566 + }, + { + "epoch": 2.870754356952042, + "ewc_loss": 0.008618750609457493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.618750871391967e-05, + "grad_norm": 4.29298734664917, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.889439582824707, + "num_tokens": 861087711.0, + "step": 22567 + }, + { + "epoch": 2.870881567230632, + "ewc_loss": 0.008643560111522675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64356043166481e-05, + "grad_norm": 4.308886528015137, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8639754056930542, + "num_tokens": 861128566.0, + "step": 22568 + }, + { + "epoch": 2.871008777509223, + "ewc_loss": 0.008621549233794212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.621549204690382e-05, + "grad_norm": 4.400003910064697, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8883645534515381, + "num_tokens": 861156299.0, + "step": 22569 + }, + { + "epoch": 2.871135987787813, + "ewc_loss": 0.008667683228850365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667683141538873e-05, + "grad_norm": 4.271899700164795, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8942577242851257, + "num_tokens": 861189376.0, + "step": 22570 + }, + { + "epoch": 2.8712631980664036, + "ewc_loss": 0.0085695069283247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569506462663412e-05, + "grad_norm": 4.294891834259033, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8799307942390442, + "num_tokens": 861227156.0, + "step": 22571 + }, + { + "epoch": 2.871390408344994, + "ewc_loss": 0.008632764220237732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.632764365756884e-05, + "grad_norm": 4.277678489685059, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8835564851760864, + "num_tokens": 861268088.0, + "step": 22572 + }, + { + "epoch": 2.8715176186235847, + "ewc_loss": 0.008614947088062763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614947000751272e-05, + "grad_norm": 4.249463081359863, + "learning_rate": 1e-06, + "loss": 0.3261, + 
"mean_token_accuracy": 0.8854055404663086, + "num_tokens": 861308892.0, + "step": 22573 + }, + { + "epoch": 2.871644828902175, + "ewc_loss": 0.008594262413680553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59426218084991e-05, + "grad_norm": 4.249237060546875, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8850631713867188, + "num_tokens": 861350108.0, + "step": 22574 + }, + { + "epoch": 2.8717720391807657, + "ewc_loss": 0.008616399951279163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616400009486824e-05, + "grad_norm": 4.251030921936035, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8919001817703247, + "num_tokens": 861390553.0, + "step": 22575 + }, + { + "epoch": 2.8718992494593563, + "ewc_loss": 0.008615788072347641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615788101451471e-05, + "grad_norm": 4.2668914794921875, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8849396109580994, + "num_tokens": 861425878.0, + "step": 22576 + }, + { + "epoch": 2.872026459737947, + "ewc_loss": 0.008603672496974468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603672904428095e-05, + "grad_norm": 4.2691969871521, + "learning_rate": 1e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.8977820873260498, + "num_tokens": 861460935.0, + "step": 22577 + }, + { + "epoch": 2.8721536700165373, + "ewc_loss": 0.008600141853094101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600141882197931e-05, + "grad_norm": 4.310279369354248, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8706223964691162, + "num_tokens": 861500446.0, + "step": 22578 + }, + { + "epoch": 2.872280880295128, + "ewc_loss": 0.008626483380794525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62648303154856e-05, + "grad_norm": 4.28398323059082, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8729413747787476, + "num_tokens": 861538000.0, + "step": 22579 + }, + { + "epoch": 
2.8724080905737184, + "ewc_loss": 0.008587823249399662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.587822958361357e-05, + "grad_norm": 4.315810203552246, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8873077034950256, + "num_tokens": 861578052.0, + "step": 22580 + }, + { + "epoch": 2.872535300852309, + "ewc_loss": 0.0086184311658144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618431456852704e-05, + "grad_norm": 4.2802581787109375, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8802529573440552, + "num_tokens": 861619065.0, + "step": 22581 + }, + { + "epoch": 2.8726625111308994, + "ewc_loss": 0.00856345146894455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.563451410736889e-05, + "grad_norm": 4.2645673751831055, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8860864043235779, + "num_tokens": 861660112.0, + "step": 22582 + }, + { + "epoch": 2.87278972140949, + "ewc_loss": 0.00857273954898119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.572739898227155e-05, + "grad_norm": 4.262057304382324, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8855026960372925, + "num_tokens": 861699312.0, + "step": 22583 + }, + { + "epoch": 2.8729169316880805, + "ewc_loss": 0.008560970425605774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.560970309190452e-05, + "grad_norm": 4.2867913246154785, + "learning_rate": 1e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.9006099700927734, + "num_tokens": 861735167.0, + "step": 22584 + }, + { + "epoch": 2.873044141966671, + "ewc_loss": 0.008595248684287071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595248800702393e-05, + "grad_norm": 4.279332637786865, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8772710561752319, + "num_tokens": 861778594.0, + "step": 22585 + }, + { + "epoch": 2.8731713522452615, + "ewc_loss": 0.008588701486587524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.588701894041151e-05, + "grad_norm": 4.32843017578125, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8828525543212891, + "num_tokens": 861814754.0, + "step": 22586 + }, + { + "epoch": 2.873298562523852, + "ewc_loss": 0.008595356717705727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595356484875083e-05, + "grad_norm": 4.3605265617370605, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8687245845794678, + "num_tokens": 861849019.0, + "step": 22587 + }, + { + "epoch": 2.8734257728024426, + "ewc_loss": 0.008601143024861813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601143053965643e-05, + "grad_norm": 4.26871919631958, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8808965682983398, + "num_tokens": 861889471.0, + "step": 22588 + }, + { + "epoch": 2.8735529830810327, + "ewc_loss": 0.008555140346288681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.555140811949968e-05, + "grad_norm": 4.297730922698975, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8746007680892944, + "num_tokens": 861926811.0, + "step": 22589 + }, + { + "epoch": 2.8736801933596237, + "ewc_loss": 0.008602036163210869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602036541560665e-05, + "grad_norm": 4.29002046585083, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8801531791687012, + "num_tokens": 861963628.0, + "step": 22590 + }, + { + "epoch": 2.8738074036382137, + "ewc_loss": 0.008603247813880444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603247988503426e-05, + "grad_norm": 4.3141770362854, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8743239045143127, + "num_tokens": 862002915.0, + "step": 22591 + }, + { + "epoch": 2.8739346139168047, + "ewc_loss": 0.008617711253464222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6177111370489e-05, + "grad_norm": 4.2303290367126465, + "learning_rate": 1e-06, + "loss": 0.3185, + 
"mean_token_accuracy": 0.8901908993721008, + "num_tokens": 862047276.0, + "step": 22592 + }, + { + "epoch": 2.874061824195395, + "ewc_loss": 0.008573179133236408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573179366067052e-05, + "grad_norm": 4.208252906799316, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8904216885566711, + "num_tokens": 862091241.0, + "step": 22593 + }, + { + "epoch": 2.8741890344739858, + "ewc_loss": 0.008587118238210678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.587117918068543e-05, + "grad_norm": 4.270809650421143, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8861051797866821, + "num_tokens": 862126025.0, + "step": 22594 + }, + { + "epoch": 2.874316244752576, + "ewc_loss": 0.008632679469883442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.632679237052798e-05, + "grad_norm": 4.3308939933776855, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8761957287788391, + "num_tokens": 862160822.0, + "step": 22595 + }, + { + "epoch": 2.8744434550311664, + "ewc_loss": 0.008640413172543049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.640413579996675e-05, + "grad_norm": 4.288964748382568, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8873814344406128, + "num_tokens": 862196808.0, + "step": 22596 + }, + { + "epoch": 2.874570665309757, + "ewc_loss": 0.008601097390055656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601097215432674e-05, + "grad_norm": 4.300119400024414, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8689231872558594, + "num_tokens": 862233722.0, + "step": 22597 + }, + { + "epoch": 2.8746978755883474, + "ewc_loss": 0.008627841249108315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627841452835128e-05, + "grad_norm": 4.230801105499268, + "learning_rate": 1e-06, + "loss": 0.2896, + "mean_token_accuracy": 0.8984150290489197, + "num_tokens": 862271914.0, + "step": 22598 + }, + { + "epoch": 
2.874825085866938, + "ewc_loss": 0.008580261841416359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580261783208698e-05, + "grad_norm": 4.240154266357422, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8795230388641357, + "num_tokens": 862313696.0, + "step": 22599 + }, + { + "epoch": 2.8749522961455285, + "ewc_loss": 0.008633838966488838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.633839024696499e-05, + "grad_norm": 4.333861351013184, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.877833366394043, + "num_tokens": 862355366.0, + "step": 22600 + }, + { + "epoch": 2.875079506424119, + "ewc_loss": 0.008674412034451962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67441194714047e-05, + "grad_norm": 4.307718276977539, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8753993511199951, + "num_tokens": 862395630.0, + "step": 22601 + }, + { + "epoch": 2.8752067167027096, + "ewc_loss": 0.008611495606601238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611495286459103e-05, + "grad_norm": 4.309040546417236, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8834756016731262, + "num_tokens": 862429650.0, + "step": 22602 + }, + { + "epoch": 2.8753339269813, + "ewc_loss": 0.008611548691987991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611548400949687e-05, + "grad_norm": 4.268584251403809, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8801586627960205, + "num_tokens": 862470921.0, + "step": 22603 + }, + { + "epoch": 2.8754611372598906, + "ewc_loss": 0.008599678054451942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.599677676102147e-05, + "grad_norm": 4.32455587387085, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8899087905883789, + "num_tokens": 862505204.0, + "step": 22604 + }, + { + "epoch": 2.875588347538481, + "ewc_loss": 0.008639618754386902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.639618317829445e-05, + "grad_norm": 4.261914253234863, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8819486498832703, + "num_tokens": 862548686.0, + "step": 22605 + }, + { + "epoch": 2.8757155578170717, + "ewc_loss": 0.008585399016737938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585398609284312e-05, + "grad_norm": 4.259916305541992, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8944931030273438, + "num_tokens": 862585920.0, + "step": 22606 + }, + { + "epoch": 2.875842768095662, + "ewc_loss": 0.008605385199189186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.605384937254712e-05, + "grad_norm": 4.35425329208374, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8793109655380249, + "num_tokens": 862619364.0, + "step": 22607 + }, + { + "epoch": 2.8759699783742527, + "ewc_loss": 0.008644700981676579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.644701301818714e-05, + "grad_norm": 4.237427234649658, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8788881301879883, + "num_tokens": 862661317.0, + "step": 22608 + }, + { + "epoch": 2.8760971886528433, + "ewc_loss": 0.008557144552469254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.557144610676914e-05, + "grad_norm": 4.274120330810547, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8866254091262817, + "num_tokens": 862698236.0, + "step": 22609 + }, + { + "epoch": 2.876224398931434, + "ewc_loss": 0.008622868917882442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622869063401595e-05, + "grad_norm": 4.233965873718262, + "learning_rate": 1e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8951247930526733, + "num_tokens": 862739224.0, + "step": 22610 + }, + { + "epoch": 2.8763516092100243, + "ewc_loss": 0.00857645832002163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576457912568003e-05, + "grad_norm": 4.2705841064453125, + "learning_rate": 1e-06, + "loss": 0.3027, + 
"mean_token_accuracy": 0.8966717720031738, + "num_tokens": 862776534.0, + "step": 22611 + }, + { + "epoch": 2.876478819488615, + "ewc_loss": 0.008620087057352066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.620087464805692e-05, + "grad_norm": 4.29599666595459, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8761938214302063, + "num_tokens": 862815690.0, + "step": 22612 + }, + { + "epoch": 2.8766060297672054, + "ewc_loss": 0.008606866002082825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60686632222496e-05, + "grad_norm": 4.305568695068359, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8817226886749268, + "num_tokens": 862851021.0, + "step": 22613 + }, + { + "epoch": 2.8767332400457954, + "ewc_loss": 0.008607444353401661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607444033259526e-05, + "grad_norm": 4.296812534332275, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8904711008071899, + "num_tokens": 862885122.0, + "step": 22614 + }, + { + "epoch": 2.8768604503243864, + "ewc_loss": 0.008595372550189495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595372491981834e-05, + "grad_norm": 4.263211727142334, + "learning_rate": 1e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8894919157028198, + "num_tokens": 862922233.0, + "step": 22615 + }, + { + "epoch": 2.8769876606029765, + "ewc_loss": 0.008616743609309196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616743434686214e-05, + "grad_norm": 4.276261329650879, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8913159966468811, + "num_tokens": 862961355.0, + "step": 22616 + }, + { + "epoch": 2.8771148708815675, + "ewc_loss": 0.00861311238259077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613112731836736e-05, + "grad_norm": 4.277312278747559, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.890953779220581, + "num_tokens": 862998523.0, + "step": 22617 + }, + { + "epoch": 
2.8772420811601576, + "ewc_loss": 0.008616708219051361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616707782493904e-05, + "grad_norm": 4.262898921966553, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8848429918289185, + "num_tokens": 863039684.0, + "step": 22618 + }, + { + "epoch": 2.877369291438748, + "ewc_loss": 0.008603570982813835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603571041021496e-05, + "grad_norm": 4.27782678604126, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8878753185272217, + "num_tokens": 863077649.0, + "step": 22619 + }, + { + "epoch": 2.8774965017173386, + "ewc_loss": 0.008605239912867546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60524014569819e-05, + "grad_norm": 4.312707424163818, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8803581595420837, + "num_tokens": 863116816.0, + "step": 22620 + }, + { + "epoch": 2.877623711995929, + "ewc_loss": 0.008624333888292313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62433371366933e-05, + "grad_norm": 4.269996643066406, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.893805205821991, + "num_tokens": 863155306.0, + "step": 22621 + }, + { + "epoch": 2.8777509222745197, + "ewc_loss": 0.008577692322432995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577691914979368e-05, + "grad_norm": 4.296782493591309, + "learning_rate": 1e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8911134004592896, + "num_tokens": 863194381.0, + "step": 22622 + }, + { + "epoch": 2.87787813255311, + "ewc_loss": 0.008610857650637627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61085718497634e-05, + "grad_norm": 4.318926811218262, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8821284174919128, + "num_tokens": 863231470.0, + "step": 22623 + }, + { + "epoch": 2.8780053428317007, + "ewc_loss": 0.008598052896559238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5980529547669e-05, 
+ "grad_norm": 4.3340911865234375, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8780097365379333, + "num_tokens": 863265617.0, + "step": 22624 + }, + { + "epoch": 2.8781325531102913, + "ewc_loss": 0.008618725463747978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618725405540317e-05, + "grad_norm": 4.361099720001221, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8737476468086243, + "num_tokens": 863300111.0, + "step": 22625 + }, + { + "epoch": 2.878259763388882, + "ewc_loss": 0.008622869849205017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622869790997356e-05, + "grad_norm": 4.299031734466553, + "learning_rate": 1e-06, + "loss": 0.3084, + "mean_token_accuracy": 0.8909603357315063, + "num_tokens": 863338626.0, + "step": 22626 + }, + { + "epoch": 2.8783869736674723, + "ewc_loss": 0.008595102466642857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595102553954348e-05, + "grad_norm": 4.375144958496094, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8777093887329102, + "num_tokens": 863374672.0, + "step": 22627 + }, + { + "epoch": 2.878514183946063, + "ewc_loss": 0.008639562875032425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.639563020551577e-05, + "grad_norm": 4.278530597686768, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8801056146621704, + "num_tokens": 863412717.0, + "step": 22628 + }, + { + "epoch": 2.8786413942246534, + "ewc_loss": 0.008581024594604969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581025031162426e-05, + "grad_norm": 4.327401638031006, + "learning_rate": 1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8915391564369202, + "num_tokens": 863447776.0, + "step": 22629 + }, + { + "epoch": 2.878768604503244, + "ewc_loss": 0.008633469231426716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.633469406049699e-05, + "grad_norm": 4.256067752838135, + "learning_rate": 1e-06, + "loss": 0.2928, + "mean_token_accuracy": 
0.8990191221237183, + "num_tokens": 863485697.0, + "step": 22630 + }, + { + "epoch": 2.8788958147818344, + "ewc_loss": 0.008573330007493496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573329978389665e-05, + "grad_norm": 4.270842552185059, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8783707022666931, + "num_tokens": 863528706.0, + "step": 22631 + }, + { + "epoch": 2.879023025060425, + "ewc_loss": 0.008635826408863068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.635826816316694e-05, + "grad_norm": 4.310054302215576, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8650679588317871, + "num_tokens": 863569181.0, + "step": 22632 + }, + { + "epoch": 2.8791502353390155, + "ewc_loss": 0.00862904917448759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.629049261799082e-05, + "grad_norm": 4.341319561004639, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8706136345863342, + "num_tokens": 863602985.0, + "step": 22633 + }, + { + "epoch": 2.879277445617606, + "ewc_loss": 0.008647122420370579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647122012916952e-05, + "grad_norm": 4.386890888214111, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8859984874725342, + "num_tokens": 863634590.0, + "step": 22634 + }, + { + "epoch": 2.8794046558961965, + "ewc_loss": 0.008662723004817963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.662723121233284e-05, + "grad_norm": 4.2245588302612305, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8812893033027649, + "num_tokens": 863677671.0, + "step": 22635 + }, + { + "epoch": 2.879531866174787, + "ewc_loss": 0.008565066382288933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.565066673327237e-05, + "grad_norm": 4.271205425262451, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8846153020858765, + "num_tokens": 863716102.0, + "step": 22636 + }, + { + "epoch": 2.8796590764533776, + "ewc_loss": 
0.008644240908324718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.644240733701736e-05, + "grad_norm": 4.29832649230957, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8751623034477234, + "num_tokens": 863751560.0, + "step": 22637 + }, + { + "epoch": 2.879786286731968, + "ewc_loss": 0.008637589402496815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63758978084661e-05, + "grad_norm": 4.244297027587891, + "learning_rate": 1e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.8970238566398621, + "num_tokens": 863792373.0, + "step": 22638 + }, + { + "epoch": 2.879913497010558, + "ewc_loss": 0.008597500622272491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597500709583983e-05, + "grad_norm": 4.302523612976074, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8795039057731628, + "num_tokens": 863829969.0, + "step": 22639 + }, + { + "epoch": 2.880040707289149, + "ewc_loss": 0.008649949915707111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.649950177641585e-05, + "grad_norm": 4.270946502685547, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8892951011657715, + "num_tokens": 863867080.0, + "step": 22640 + }, + { + "epoch": 2.8801679175677393, + "ewc_loss": 0.008612880483269691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612880628788844e-05, + "grad_norm": 4.273711681365967, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8873951435089111, + "num_tokens": 863904230.0, + "step": 22641 + }, + { + "epoch": 2.8802951278463302, + "ewc_loss": 0.008620715700089931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.620716107543558e-05, + "grad_norm": 4.252267360687256, + "learning_rate": 1e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8873810768127441, + "num_tokens": 863947309.0, + "step": 22642 + }, + { + "epoch": 2.8804223381249203, + "ewc_loss": 0.008599692955613136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.599692955613136e-05, + "grad_norm": 
4.31588077545166, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8881409168243408, + "num_tokens": 863985557.0, + "step": 22643 + }, + { + "epoch": 2.880549548403511, + "ewc_loss": 0.008625865913927555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625866030342877e-05, + "grad_norm": 4.305509567260742, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8724696636199951, + "num_tokens": 864021842.0, + "step": 22644 + }, + { + "epoch": 2.8806767586821014, + "ewc_loss": 0.008593256585299969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59325664350763e-05, + "grad_norm": 4.300713539123535, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8762545585632324, + "num_tokens": 864060765.0, + "step": 22645 + }, + { + "epoch": 2.880803968960692, + "ewc_loss": 0.008591734804213047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.591735240770504e-05, + "grad_norm": 4.277845859527588, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.885542631149292, + "num_tokens": 864100624.0, + "step": 22646 + }, + { + "epoch": 2.8809311792392824, + "ewc_loss": 0.00860271230340004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602712478023022e-05, + "grad_norm": 4.338931560516357, + "learning_rate": 1e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.8945035934448242, + "num_tokens": 864136599.0, + "step": 22647 + }, + { + "epoch": 2.881058389517873, + "ewc_loss": 0.00861009769141674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610097575001419e-05, + "grad_norm": 4.32684850692749, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8869249820709229, + "num_tokens": 864169876.0, + "step": 22648 + }, + { + "epoch": 2.8811855997964635, + "ewc_loss": 0.008585206232964993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585206524003297e-05, + "grad_norm": 4.2504048347473145, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8826223611831665, + "num_tokens": 
864213709.0, + "step": 22649 + }, + { + "epoch": 2.881312810075054, + "ewc_loss": 0.008554618805646896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55461839819327e-05, + "grad_norm": 4.416199684143066, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8874660730361938, + "num_tokens": 864245581.0, + "step": 22650 + }, + { + "epoch": 2.8814400203536445, + "ewc_loss": 0.008656344376504421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.65634428919293e-05, + "grad_norm": 4.289549350738525, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8717851042747498, + "num_tokens": 864287081.0, + "step": 22651 + }, + { + "epoch": 2.881567230632235, + "ewc_loss": 0.008527150377631187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.527150203008205e-05, + "grad_norm": 4.368622779846191, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8696547746658325, + "num_tokens": 864321480.0, + "step": 22652 + }, + { + "epoch": 2.8816944409108256, + "ewc_loss": 0.008631834760308266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631834498373792e-05, + "grad_norm": 4.261480331420898, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8897402882575989, + "num_tokens": 864358580.0, + "step": 22653 + }, + { + "epoch": 2.881821651189416, + "ewc_loss": 0.008564498275518417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.56449842103757e-05, + "grad_norm": 4.33831262588501, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8765279054641724, + "num_tokens": 864396277.0, + "step": 22654 + }, + { + "epoch": 2.8819488614680067, + "ewc_loss": 0.008639037609100342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.639037696411833e-05, + "grad_norm": 4.335723400115967, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.880221962928772, + "num_tokens": 864433282.0, + "step": 22655 + }, + { + "epoch": 2.882076071746597, + "ewc_loss": 0.0086170993745327, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 8.617099229013547e-05, + "grad_norm": 4.345088958740234, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8768436908721924, + "num_tokens": 864469305.0, + "step": 22656 + }, + { + "epoch": 2.8822032820251877, + "ewc_loss": 0.008621909655630589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.621910092188045e-05, + "grad_norm": 4.292242050170898, + "learning_rate": 1e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8849366903305054, + "num_tokens": 864507246.0, + "step": 22657 + }, + { + "epoch": 2.8823304923037782, + "ewc_loss": 0.008595968596637249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595968392910436e-05, + "grad_norm": 4.282797336578369, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8786793351173401, + "num_tokens": 864547315.0, + "step": 22658 + }, + { + "epoch": 2.8824577025823688, + "ewc_loss": 0.00860955286771059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609552605776116e-05, + "grad_norm": 4.275012016296387, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8778170347213745, + "num_tokens": 864590813.0, + "step": 22659 + }, + { + "epoch": 2.8825849128609593, + "ewc_loss": 0.008623672649264336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6236723291222e-05, + "grad_norm": 4.372578144073486, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8746684789657593, + "num_tokens": 864625078.0, + "step": 22660 + }, + { + "epoch": 2.88271212313955, + "ewc_loss": 0.008679608814418316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.679609163664281e-05, + "grad_norm": 4.2779154777526855, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.891060471534729, + "num_tokens": 864663714.0, + "step": 22661 + }, + { + "epoch": 2.88283933341814, + "ewc_loss": 0.008575713261961937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575713582104072e-05, + "grad_norm": 4.340720176696777, + "learning_rate": 1e-06, + "loss": 
0.3456, + "mean_token_accuracy": 0.8817028403282166, + "num_tokens": 864697026.0, + "step": 22662 + }, + { + "epoch": 2.882966543696731, + "ewc_loss": 0.008668215945363045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.668215741636232e-05, + "grad_norm": 4.315367698669434, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8921580910682678, + "num_tokens": 864732337.0, + "step": 22663 + }, + { + "epoch": 2.883093753975321, + "ewc_loss": 0.008647177368402481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64717731019482e-05, + "grad_norm": 4.284262180328369, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8773056268692017, + "num_tokens": 864776598.0, + "step": 22664 + }, + { + "epoch": 2.883220964253912, + "ewc_loss": 0.008642705157399178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.642704779049382e-05, + "grad_norm": 4.319685459136963, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.871042788028717, + "num_tokens": 864817279.0, + "step": 22665 + }, + { + "epoch": 2.883348174532502, + "ewc_loss": 0.008673174306750298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.673174306750298e-05, + "grad_norm": 4.276366233825684, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.890772819519043, + "num_tokens": 864854914.0, + "step": 22666 + }, + { + "epoch": 2.883475384811093, + "ewc_loss": 0.008625759743154049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62575980136171e-05, + "grad_norm": 4.321975231170654, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8869829177856445, + "num_tokens": 864892914.0, + "step": 22667 + }, + { + "epoch": 2.883602595089683, + "ewc_loss": 0.008648574352264404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.648574294056743e-05, + "grad_norm": 4.278470039367676, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8805930614471436, + "num_tokens": 864934189.0, + "step": 22668 + }, + { + "epoch": 
2.8837298053682736, + "ewc_loss": 0.008631398901343346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631398668512702e-05, + "grad_norm": 4.309274196624756, + "learning_rate": 1e-06, + "loss": 0.314, + "mean_token_accuracy": 0.8913644552230835, + "num_tokens": 864972085.0, + "step": 22669 + }, + { + "epoch": 2.883857015646864, + "ewc_loss": 0.008633054792881012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63305467646569e-05, + "grad_norm": 4.416053295135498, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8672171831130981, + "num_tokens": 865008674.0, + "step": 22670 + }, + { + "epoch": 2.8839842259254547, + "ewc_loss": 0.008674194104969501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.674193668412045e-05, + "grad_norm": 4.2811808586120605, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8748952150344849, + "num_tokens": 865046004.0, + "step": 22671 + }, + { + "epoch": 2.884111436204045, + "ewc_loss": 0.008570455014705658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570455247536302e-05, + "grad_norm": 4.265047073364258, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8870375156402588, + "num_tokens": 865082409.0, + "step": 22672 + }, + { + "epoch": 2.8842386464826357, + "ewc_loss": 0.008621291257441044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62129163579084e-05, + "grad_norm": 4.265599727630615, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8834373950958252, + "num_tokens": 865124457.0, + "step": 22673 + }, + { + "epoch": 2.8843658567612263, + "ewc_loss": 0.008598503656685352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598504064138979e-05, + "grad_norm": 4.303615093231201, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8943372964859009, + "num_tokens": 865162290.0, + "step": 22674 + }, + { + "epoch": 2.884493067039817, + "ewc_loss": 0.008627372793853283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.627372881164774e-05, + "grad_norm": 4.303850173950195, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8819851875305176, + "num_tokens": 865201686.0, + "step": 22675 + }, + { + "epoch": 2.8846202773184073, + "ewc_loss": 0.008615449070930481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61544904182665e-05, + "grad_norm": 4.364619255065918, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8765045404434204, + "num_tokens": 865236876.0, + "step": 22676 + }, + { + "epoch": 2.884747487596998, + "ewc_loss": 0.008650966919958591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.650966628920287e-05, + "grad_norm": 4.255843639373779, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8855574131011963, + "num_tokens": 865276994.0, + "step": 22677 + }, + { + "epoch": 2.8848746978755884, + "ewc_loss": 0.00854556541889906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.545565651729703e-05, + "grad_norm": 4.289429187774658, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8839210271835327, + "num_tokens": 865311990.0, + "step": 22678 + }, + { + "epoch": 2.885001908154179, + "ewc_loss": 0.008607652969658375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607652853243053e-05, + "grad_norm": 4.262263774871826, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8893750905990601, + "num_tokens": 865352324.0, + "step": 22679 + }, + { + "epoch": 2.8851291184327694, + "ewc_loss": 0.008592098951339722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592099038651213e-05, + "grad_norm": 4.291380405426025, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8834881782531738, + "num_tokens": 865391428.0, + "step": 22680 + }, + { + "epoch": 2.88525632871136, + "ewc_loss": 0.008615335449576378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615335536887869e-05, + "grad_norm": 4.319517135620117, + "learning_rate": 1e-06, + "loss": 0.3386, + 
"mean_token_accuracy": 0.8817614316940308, + "num_tokens": 865428478.0, + "step": 22681 + }, + { + "epoch": 2.8853835389899505, + "ewc_loss": 0.008621949702501297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.621950109954923e-05, + "grad_norm": 4.324080467224121, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8833253383636475, + "num_tokens": 865466531.0, + "step": 22682 + }, + { + "epoch": 2.885510749268541, + "ewc_loss": 0.008625328540802002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625328337075189e-05, + "grad_norm": 4.281237602233887, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8896348476409912, + "num_tokens": 865503177.0, + "step": 22683 + }, + { + "epoch": 2.8856379595471315, + "ewc_loss": 0.008599690161645412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59969004523009e-05, + "grad_norm": 4.333776473999023, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8760498762130737, + "num_tokens": 865539454.0, + "step": 22684 + }, + { + "epoch": 2.885765169825722, + "ewc_loss": 0.008649048395454884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.649048686493188e-05, + "grad_norm": 4.288646221160889, + "learning_rate": 1e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.8999664783477783, + "num_tokens": 865571937.0, + "step": 22685 + }, + { + "epoch": 2.8858923801043126, + "ewc_loss": 0.008602185174822807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602184971095994e-05, + "grad_norm": 4.3153839111328125, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8860205411911011, + "num_tokens": 865609457.0, + "step": 22686 + }, + { + "epoch": 2.8860195903829027, + "ewc_loss": 0.008659044280648232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.659044397063553e-05, + "grad_norm": 4.341894626617432, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8713775277137756, + "num_tokens": 865647391.0, + "step": 22687 + }, + { + "epoch": 
2.8861468006614936, + "ewc_loss": 0.008672332391142845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.672332478454337e-05, + "grad_norm": 4.352468967437744, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8836701512336731, + "num_tokens": 865680578.0, + "step": 22688 + }, + { + "epoch": 2.8862740109400837, + "ewc_loss": 0.008665741421282291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.665741188451648e-05, + "grad_norm": 4.302523136138916, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8809352517127991, + "num_tokens": 865714610.0, + "step": 22689 + }, + { + "epoch": 2.8864012212186747, + "ewc_loss": 0.008637889288365841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.637889550300315e-05, + "grad_norm": 4.319637775421143, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8799651861190796, + "num_tokens": 865755672.0, + "step": 22690 + }, + { + "epoch": 2.886528431497265, + "ewc_loss": 0.008673902601003647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.673902630107477e-05, + "grad_norm": 4.283924102783203, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8902225494384766, + "num_tokens": 865790111.0, + "step": 22691 + }, + { + "epoch": 2.8866556417758558, + "ewc_loss": 0.008663649670779705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.66364935063757e-05, + "grad_norm": 4.364604473114014, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8841243386268616, + "num_tokens": 865825580.0, + "step": 22692 + }, + { + "epoch": 2.886782852054446, + "ewc_loss": 0.008712118491530418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.712118142284453e-05, + "grad_norm": 4.2266316413879395, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.886692225933075, + "num_tokens": 865869099.0, + "step": 22693 + }, + { + "epoch": 2.8869100623330364, + "ewc_loss": 0.008632097393274307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.632097160443664e-05, + "grad_norm": 4.360653877258301, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8737034797668457, + "num_tokens": 865906957.0, + "step": 22694 + }, + { + "epoch": 2.887037272611627, + "ewc_loss": 0.00875517912209034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.755179442232475e-05, + "grad_norm": 4.259143829345703, + "learning_rate": 1e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8929386138916016, + "num_tokens": 865945712.0, + "step": 22695 + }, + { + "epoch": 2.8871644828902174, + "ewc_loss": 0.008640959858894348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6409600044135e-05, + "grad_norm": 4.318824291229248, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8686076402664185, + "num_tokens": 865984469.0, + "step": 22696 + }, + { + "epoch": 2.887291693168808, + "ewc_loss": 0.008719495497643948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.719495235709473e-05, + "grad_norm": 4.254385948181152, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8844519853591919, + "num_tokens": 866024624.0, + "step": 22697 + }, + { + "epoch": 2.8874189034473985, + "ewc_loss": 0.0086548812687397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.654881094116718e-05, + "grad_norm": 4.274179935455322, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8820960521697998, + "num_tokens": 866062446.0, + "step": 22698 + }, + { + "epoch": 2.887546113725989, + "ewc_loss": 0.008685770444571972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.685770444571972e-05, + "grad_norm": 4.333807468414307, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8925632238388062, + "num_tokens": 866100896.0, + "step": 22699 + }, + { + "epoch": 2.8876733240045795, + "ewc_loss": 0.00872179213911295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.721792255528271e-05, + "grad_norm": 4.378098487854004, + "learning_rate": 1e-06, + "loss": 0.3687, + 
"mean_token_accuracy": 0.8716925978660583, + "num_tokens": 866134668.0, + "step": 22700 + }, + { + "epoch": 2.88780053428317, + "ewc_loss": 0.008707495406270027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.707494998816401e-05, + "grad_norm": 4.297391891479492, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8757047653198242, + "num_tokens": 866173834.0, + "step": 22701 + }, + { + "epoch": 2.8879277445617606, + "ewc_loss": 0.008647143840789795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647143840789795e-05, + "grad_norm": 4.305196762084961, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.869120180606842, + "num_tokens": 866211691.0, + "step": 22702 + }, + { + "epoch": 2.888054954840351, + "ewc_loss": 0.008682943880558014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.682943735038862e-05, + "grad_norm": 4.345091342926025, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8834007978439331, + "num_tokens": 866240878.0, + "step": 22703 + }, + { + "epoch": 2.8881821651189417, + "ewc_loss": 0.008728589862585068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.728589455131441e-05, + "grad_norm": 4.263360023498535, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8705815076828003, + "num_tokens": 866289552.0, + "step": 22704 + }, + { + "epoch": 2.888309375397532, + "ewc_loss": 0.008650627918541431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.650628296891227e-05, + "grad_norm": 4.287914752960205, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8753212690353394, + "num_tokens": 866330361.0, + "step": 22705 + }, + { + "epoch": 2.8884365856761227, + "ewc_loss": 0.008710416033864021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.710415568202734e-05, + "grad_norm": 4.283740997314453, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8850029110908508, + "num_tokens": 866371557.0, + "step": 22706 + }, + { + "epoch": 
2.8885637959547132, + "ewc_loss": 0.008698586374521255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.698586316313595e-05, + "grad_norm": 4.2958478927612305, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8841139078140259, + "num_tokens": 866410627.0, + "step": 22707 + }, + { + "epoch": 2.8886910062333038, + "ewc_loss": 0.008660692721605301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.660692401463166e-05, + "grad_norm": 4.320173740386963, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8805926442146301, + "num_tokens": 866446036.0, + "step": 22708 + }, + { + "epoch": 2.8888182165118943, + "ewc_loss": 0.00869839545339346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.698395686224103e-05, + "grad_norm": 4.2622880935668945, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.873717188835144, + "num_tokens": 866489655.0, + "step": 22709 + }, + { + "epoch": 2.888945426790485, + "ewc_loss": 0.008642908185720444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.642907778266817e-05, + "grad_norm": 4.271476745605469, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8764106035232544, + "num_tokens": 866532750.0, + "step": 22710 + }, + { + "epoch": 2.8890726370690754, + "ewc_loss": 0.008685685694217682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.685685315867886e-05, + "grad_norm": 4.323072910308838, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8873165249824524, + "num_tokens": 866570748.0, + "step": 22711 + }, + { + "epoch": 2.8891998473476654, + "ewc_loss": 0.008689800277352333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.689800597494468e-05, + "grad_norm": 4.315303802490234, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8759931325912476, + "num_tokens": 866609422.0, + "step": 22712 + }, + { + "epoch": 2.8893270576262564, + "ewc_loss": 0.008655528537929058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.65552865434438e-05, + "grad_norm": 4.290748596191406, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8842138648033142, + "num_tokens": 866650865.0, + "step": 22713 + }, + { + "epoch": 2.8894542679048465, + "ewc_loss": 0.00863777007907629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.637770224595442e-05, + "grad_norm": 4.29184627532959, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.881234884262085, + "num_tokens": 866687605.0, + "step": 22714 + }, + { + "epoch": 2.8895814781834375, + "ewc_loss": 0.00865514948964119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.655149576952681e-05, + "grad_norm": 4.343160629272461, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8745702505111694, + "num_tokens": 866722974.0, + "step": 22715 + }, + { + "epoch": 2.8897086884620276, + "ewc_loss": 0.00867689773440361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.676898141857237e-05, + "grad_norm": 4.322670936584473, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8810106515884399, + "num_tokens": 866760694.0, + "step": 22716 + }, + { + "epoch": 2.889835898740618, + "ewc_loss": 0.008657220751047134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.657220314489678e-05, + "grad_norm": 4.350889205932617, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8664528131484985, + "num_tokens": 866795102.0, + "step": 22717 + }, + { + "epoch": 2.8899631090192086, + "ewc_loss": 0.008683412335813046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.683412306709215e-05, + "grad_norm": 4.265766620635986, + "learning_rate": 1e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8945952653884888, + "num_tokens": 866831282.0, + "step": 22718 + }, + { + "epoch": 2.890090319297799, + "ewc_loss": 0.00863291323184967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.632912795292214e-05, + "grad_norm": 4.307155609130859, + "learning_rate": 1e-06, + "loss": 0.3598, + 
"mean_token_accuracy": 0.87518310546875, + "num_tokens": 866872661.0, + "step": 22719 + }, + { + "epoch": 2.8902175295763897, + "ewc_loss": 0.00867085438221693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.670854003867134e-05, + "grad_norm": 4.271463394165039, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8734798431396484, + "num_tokens": 866917576.0, + "step": 22720 + }, + { + "epoch": 2.89034473985498, + "ewc_loss": 0.008646671660244465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.646671631140634e-05, + "grad_norm": 4.298490047454834, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8856459856033325, + "num_tokens": 866957245.0, + "step": 22721 + }, + { + "epoch": 2.8904719501335707, + "ewc_loss": 0.008669175207614899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.669175440445542e-05, + "grad_norm": 4.324489116668701, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8548974990844727, + "num_tokens": 866998695.0, + "step": 22722 + }, + { + "epoch": 2.8905991604121613, + "ewc_loss": 0.008664549328386784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.664549386594445e-05, + "grad_norm": 4.364377975463867, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8709502220153809, + "num_tokens": 867033502.0, + "step": 22723 + }, + { + "epoch": 2.890726370690752, + "ewc_loss": 0.008668312802910805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.668313239468262e-05, + "grad_norm": 4.3556365966796875, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8642758131027222, + "num_tokens": 867076032.0, + "step": 22724 + }, + { + "epoch": 2.8908535809693423, + "ewc_loss": 0.008647850714623928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647851063869894e-05, + "grad_norm": 4.380169868469238, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8826953768730164, + "num_tokens": 867107429.0, + "step": 22725 + }, + { + "epoch": 
2.890980791247933, + "ewc_loss": 0.008657492697238922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.65749316290021e-05, + "grad_norm": 4.366333484649658, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.885718822479248, + "num_tokens": 867139432.0, + "step": 22726 + }, + { + "epoch": 2.8911080015265234, + "ewc_loss": 0.008659320883452892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.659320883452892e-05, + "grad_norm": 4.270534038543701, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8799247145652771, + "num_tokens": 867180588.0, + "step": 22727 + }, + { + "epoch": 2.891235211805114, + "ewc_loss": 0.008579080924391747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579080895287916e-05, + "grad_norm": 4.3108696937561035, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8817052245140076, + "num_tokens": 867215170.0, + "step": 22728 + }, + { + "epoch": 2.8913624220837044, + "ewc_loss": 0.00867176428437233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.671764226164669e-05, + "grad_norm": 4.290611743927002, + "learning_rate": 1e-06, + "loss": 0.2994, + "mean_token_accuracy": 0.8962323665618896, + "num_tokens": 867254958.0, + "step": 22729 + }, + { + "epoch": 2.891489632362295, + "ewc_loss": 0.00863068737089634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.630687079858035e-05, + "grad_norm": 4.273173809051514, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8823002576828003, + "num_tokens": 867291135.0, + "step": 22730 + }, + { + "epoch": 2.8916168426408855, + "ewc_loss": 0.008636420592665672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636420534458011e-05, + "grad_norm": 4.341578960418701, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8895775079727173, + "num_tokens": 867326926.0, + "step": 22731 + }, + { + "epoch": 2.891744052919476, + "ewc_loss": 0.00868561677634716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.685616921866313e-05, + "grad_norm": 4.343669414520264, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8750752210617065, + "num_tokens": 867361142.0, + "step": 22732 + }, + { + "epoch": 2.8918712631980665, + "ewc_loss": 0.008639538660645485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.639538282295689e-05, + "grad_norm": 4.24845027923584, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8987977504730225, + "num_tokens": 867398344.0, + "step": 22733 + }, + { + "epoch": 2.891998473476657, + "ewc_loss": 0.008607964031398296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60796426422894e-05, + "grad_norm": 4.297676086425781, + "learning_rate": 1e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8910130262374878, + "num_tokens": 867437642.0, + "step": 22734 + }, + { + "epoch": 2.8921256837552476, + "ewc_loss": 0.008662404492497444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.662404434289783e-05, + "grad_norm": 4.322120666503906, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8735121488571167, + "num_tokens": 867471646.0, + "step": 22735 + }, + { + "epoch": 2.892252894033838, + "ewc_loss": 0.008665981702506542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.665982022648677e-05, + "grad_norm": 4.331541538238525, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8833832144737244, + "num_tokens": 867509682.0, + "step": 22736 + }, + { + "epoch": 2.892380104312428, + "ewc_loss": 0.008667105808854103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667105430504307e-05, + "grad_norm": 4.322426795959473, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8770948648452759, + "num_tokens": 867544425.0, + "step": 22737 + }, + { + "epoch": 2.892507314591019, + "ewc_loss": 0.008656593970954418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.656593854539096e-05, + "grad_norm": 4.3118367195129395, + "learning_rate": 1e-06, + "loss": 0.3574, + 
"mean_token_accuracy": 0.8745448589324951, + "num_tokens": 867584978.0, + "step": 22738 + }, + { + "epoch": 2.8926345248696093, + "ewc_loss": 0.008652947843074799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.652947872178629e-05, + "grad_norm": 4.300391674041748, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.882932186126709, + "num_tokens": 867620360.0, + "step": 22739 + }, + { + "epoch": 2.8927617351482002, + "ewc_loss": 0.008655210956931114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.655211422592402e-05, + "grad_norm": 4.319188594818115, + "learning_rate": 1e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8908750414848328, + "num_tokens": 867655246.0, + "step": 22740 + }, + { + "epoch": 2.8928889454267903, + "ewc_loss": 0.00869698915630579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.696989243617281e-05, + "grad_norm": 4.310352802276611, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8749673366546631, + "num_tokens": 867692792.0, + "step": 22741 + }, + { + "epoch": 2.893016155705381, + "ewc_loss": 0.0086855823174119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.685582724865526e-05, + "grad_norm": 4.32916259765625, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.885661244392395, + "num_tokens": 867732365.0, + "step": 22742 + }, + { + "epoch": 2.8931433659839714, + "ewc_loss": 0.008710040710866451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.710040856385604e-05, + "grad_norm": 4.347828388214111, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8696929216384888, + "num_tokens": 867764800.0, + "step": 22743 + }, + { + "epoch": 2.893270576262562, + "ewc_loss": 0.008691223338246346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.691223774803802e-05, + "grad_norm": 4.289882659912109, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8758656978607178, + "num_tokens": 867807107.0, + "step": 22744 + }, + { + "epoch": 
2.8933977865411524, + "ewc_loss": 0.008664047345519066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.664047345519066e-05, + "grad_norm": 4.2814555168151855, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8750507235527039, + "num_tokens": 867850225.0, + "step": 22745 + }, + { + "epoch": 2.893524996819743, + "ewc_loss": 0.008684445172548294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.684444765094668e-05, + "grad_norm": 4.27582311630249, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.877888560295105, + "num_tokens": 867890457.0, + "step": 22746 + }, + { + "epoch": 2.8936522070983335, + "ewc_loss": 0.008695149794220924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.695149881532416e-05, + "grad_norm": 4.351373672485352, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8899997472763062, + "num_tokens": 867925880.0, + "step": 22747 + }, + { + "epoch": 2.893779417376924, + "ewc_loss": 0.008719072677195072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.71907250257209e-05, + "grad_norm": 4.260219097137451, + "learning_rate": 1e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8937586545944214, + "num_tokens": 867963923.0, + "step": 22748 + }, + { + "epoch": 2.8939066276555145, + "ewc_loss": 0.008666491135954857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.66649133968167e-05, + "grad_norm": 4.344139099121094, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8852540850639343, + "num_tokens": 868000489.0, + "step": 22749 + }, + { + "epoch": 2.894033837934105, + "ewc_loss": 0.008719407953321934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.719407924218103e-05, + "grad_norm": 4.337458610534668, + "learning_rate": 1e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8869283199310303, + "num_tokens": 868034001.0, + "step": 22750 + }, + { + "epoch": 2.8941610482126956, + "ewc_loss": 0.008685767650604248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.685767534188926e-05, + "grad_norm": 4.283825874328613, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8863397836685181, + "num_tokens": 868074956.0, + "step": 22751 + }, + { + "epoch": 2.894288258491286, + "ewc_loss": 0.00864358153194189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.643581531941891e-05, + "grad_norm": 4.29666805267334, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8901385068893433, + "num_tokens": 868110727.0, + "step": 22752 + }, + { + "epoch": 2.8944154687698767, + "ewc_loss": 0.008663995191454887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.663994958624244e-05, + "grad_norm": 4.326141834259033, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8673253655433655, + "num_tokens": 868151480.0, + "step": 22753 + }, + { + "epoch": 2.894542679048467, + "ewc_loss": 0.00868181698024273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.681816689204425e-05, + "grad_norm": 4.278046607971191, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8741523623466492, + "num_tokens": 868190620.0, + "step": 22754 + }, + { + "epoch": 2.8946698893270577, + "ewc_loss": 0.008642904460430145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.642904867883772e-05, + "grad_norm": 4.281386375427246, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8800446391105652, + "num_tokens": 868230598.0, + "step": 22755 + }, + { + "epoch": 2.8947970996056482, + "ewc_loss": 0.008668726310133934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.668726513860747e-05, + "grad_norm": 4.3632893562316895, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8757703900337219, + "num_tokens": 868265933.0, + "step": 22756 + }, + { + "epoch": 2.8949243098842388, + "ewc_loss": 0.008716943673789501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.71694355737418e-05, + "grad_norm": 4.358695030212402, + "learning_rate": 1e-06, + "loss": 0.3231, + 
"mean_token_accuracy": 0.8844307065010071, + "num_tokens": 868300282.0, + "step": 22757 + }, + { + "epoch": 2.8950515201628293, + "ewc_loss": 0.00865031685680151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.65031688590534e-05, + "grad_norm": 4.30804443359375, + "learning_rate": 1e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.8980407118797302, + "num_tokens": 868331031.0, + "step": 22758 + }, + { + "epoch": 2.89517873044142, + "ewc_loss": 0.008674709126353264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67470953380689e-05, + "grad_norm": 4.284628868103027, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8849632740020752, + "num_tokens": 868368268.0, + "step": 22759 + }, + { + "epoch": 2.89530594072001, + "ewc_loss": 0.008662911131978035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.662911568535492e-05, + "grad_norm": 4.332440376281738, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8692175149917603, + "num_tokens": 868407551.0, + "step": 22760 + }, + { + "epoch": 2.895433150998601, + "ewc_loss": 0.00871206820011139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.712068665772676e-05, + "grad_norm": 4.267186641693115, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8800349235534668, + "num_tokens": 868449301.0, + "step": 22761 + }, + { + "epoch": 2.895560361277191, + "ewc_loss": 0.008640839718282223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.640839951112866e-05, + "grad_norm": 4.319808006286621, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.879442572593689, + "num_tokens": 868482853.0, + "step": 22762 + }, + { + "epoch": 2.895687571555782, + "ewc_loss": 0.008727996610105038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.727996464585885e-05, + "grad_norm": 4.321492671966553, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8796223998069763, + "num_tokens": 868523960.0, + "step": 22763 + }, + { + "epoch": 2.895814781834372, + 
"ewc_loss": 0.00867010373622179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.670103852637112e-05, + "grad_norm": 4.295014381408691, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8758167028427124, + "num_tokens": 868566447.0, + "step": 22764 + }, + { + "epoch": 2.895941992112963, + "ewc_loss": 0.008684828877449036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.684828935656697e-05, + "grad_norm": 4.276702404022217, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8923133611679077, + "num_tokens": 868608342.0, + "step": 22765 + }, + { + "epoch": 2.896069202391553, + "ewc_loss": 0.008670962415635586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.670962415635586e-05, + "grad_norm": 4.288059234619141, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8939117193222046, + "num_tokens": 868647354.0, + "step": 22766 + }, + { + "epoch": 2.8961964126701436, + "ewc_loss": 0.00865325890481472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.653259283164516e-05, + "grad_norm": 4.323960304260254, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8647983074188232, + "num_tokens": 868690716.0, + "step": 22767 + }, + { + "epoch": 2.896323622948734, + "ewc_loss": 0.008663151413202286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.663150947540998e-05, + "grad_norm": 4.284238815307617, + "learning_rate": 1e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8905697464942932, + "num_tokens": 868728318.0, + "step": 22768 + }, + { + "epoch": 2.8964508332273247, + "ewc_loss": 0.008620962500572205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.620962034910917e-05, + "grad_norm": 4.303121089935303, + "learning_rate": 1e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.8990435600280762, + "num_tokens": 868764323.0, + "step": 22769 + }, + { + "epoch": 2.896578043505915, + "ewc_loss": 0.008661005645990372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.661005267640576e-05, + "grad_norm": 
4.3492512702941895, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8671471476554871, + "num_tokens": 868801371.0, + "step": 22770 + }, + { + "epoch": 2.8967052537845057, + "ewc_loss": 0.008661894127726555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.661894389661029e-05, + "grad_norm": 4.327817916870117, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8879029750823975, + "num_tokens": 868846679.0, + "step": 22771 + }, + { + "epoch": 2.8968324640630962, + "ewc_loss": 0.008612141013145447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612141391495243e-05, + "grad_norm": 4.374836444854736, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8702256679534912, + "num_tokens": 868880076.0, + "step": 22772 + }, + { + "epoch": 2.8969596743416868, + "ewc_loss": 0.00864453800022602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.644537592772394e-05, + "grad_norm": 4.3093085289001465, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8829726576805115, + "num_tokens": 868917086.0, + "step": 22773 + }, + { + "epoch": 2.8970868846202773, + "ewc_loss": 0.008582957088947296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582957525504753e-05, + "grad_norm": 4.282171726226807, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8889005184173584, + "num_tokens": 868954952.0, + "step": 22774 + }, + { + "epoch": 2.897214094898868, + "ewc_loss": 0.00859807524830103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598075510235503e-05, + "grad_norm": 4.328446388244629, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8770099878311157, + "num_tokens": 868990911.0, + "step": 22775 + }, + { + "epoch": 2.8973413051774584, + "ewc_loss": 0.008646898902952671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.646898641018197e-05, + "grad_norm": 4.251573085784912, + "learning_rate": 1e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8926421999931335, + 
"num_tokens": 869033398.0, + "step": 22776 + }, + { + "epoch": 2.897468515456049, + "ewc_loss": 0.008580366149544716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580366556998342e-05, + "grad_norm": 4.311064720153809, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8709075450897217, + "num_tokens": 869070513.0, + "step": 22777 + }, + { + "epoch": 2.8975957257346394, + "ewc_loss": 0.008654402568936348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.654402336105704e-05, + "grad_norm": 4.293671607971191, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8895998001098633, + "num_tokens": 869113494.0, + "step": 22778 + }, + { + "epoch": 2.89772293601323, + "ewc_loss": 0.008607168681919575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607168274465948e-05, + "grad_norm": 4.341403007507324, + "learning_rate": 1e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.8929111957550049, + "num_tokens": 869147433.0, + "step": 22779 + }, + { + "epoch": 2.8978501462918205, + "ewc_loss": 0.0086365956813097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636595885036513e-05, + "grad_norm": 4.356088638305664, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8777649402618408, + "num_tokens": 869186763.0, + "step": 22780 + }, + { + "epoch": 2.897977356570411, + "ewc_loss": 0.008641446940600872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.641446765977889e-05, + "grad_norm": 4.347554683685303, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8938912749290466, + "num_tokens": 869222624.0, + "step": 22781 + }, + { + "epoch": 2.8981045668490015, + "ewc_loss": 0.008623935282230377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623934991192073e-05, + "grad_norm": 4.2744317054748535, + "learning_rate": 1e-06, + "loss": 0.282, + "mean_token_accuracy": 0.9016499519348145, + "num_tokens": 869263027.0, + "step": 22782 + }, + { + "epoch": 2.898231777127592, + "ewc_loss": 0.008595769293606281, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595769031671807e-05, + "grad_norm": 4.310652732849121, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8776622414588928, + "num_tokens": 869304232.0, + "step": 22783 + }, + { + "epoch": 2.8983589874061826, + "ewc_loss": 0.00862959772348404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.629597869003192e-05, + "grad_norm": 4.32349967956543, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8880795240402222, + "num_tokens": 869340141.0, + "step": 22784 + }, + { + "epoch": 2.8984861976847727, + "ewc_loss": 0.008599030785262585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.599030843470246e-05, + "grad_norm": 4.281026840209961, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8758295774459839, + "num_tokens": 869382428.0, + "step": 22785 + }, + { + "epoch": 2.8986134079633636, + "ewc_loss": 0.008567387238144875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.567387703806162e-05, + "grad_norm": 4.326929569244385, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8907182216644287, + "num_tokens": 869418890.0, + "step": 22786 + }, + { + "epoch": 2.8987406182419537, + "ewc_loss": 0.008620060048997402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.620059816166759e-05, + "grad_norm": 4.314917087554932, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8844365477561951, + "num_tokens": 869453704.0, + "step": 22787 + }, + { + "epoch": 2.8988678285205447, + "ewc_loss": 0.00858638808131218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58638813951984e-05, + "grad_norm": 4.302433967590332, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8820054531097412, + "num_tokens": 869491845.0, + "step": 22788 + }, + { + "epoch": 2.898995038799135, + "ewc_loss": 0.008581843227148056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581843576394022e-05, + "grad_norm": 4.370707988739014, + "learning_rate": 
1e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8928639888763428, + "num_tokens": 869523393.0, + "step": 22789 + }, + { + "epoch": 2.8991222490777258, + "ewc_loss": 0.008621524088084698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.621524466434494e-05, + "grad_norm": 4.288321495056152, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8813496828079224, + "num_tokens": 869561927.0, + "step": 22790 + }, + { + "epoch": 2.899249459356316, + "ewc_loss": 0.008558014407753944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.558014815207571e-05, + "grad_norm": 4.324665546417236, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8826305866241455, + "num_tokens": 869599568.0, + "step": 22791 + }, + { + "epoch": 2.8993766696349064, + "ewc_loss": 0.008600008673965931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600008732173592e-05, + "grad_norm": 4.260274887084961, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8848203420639038, + "num_tokens": 869636329.0, + "step": 22792 + }, + { + "epoch": 2.899503879913497, + "ewc_loss": 0.008554025553166866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.554025407647714e-05, + "grad_norm": 4.307455539703369, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8789069056510925, + "num_tokens": 869679720.0, + "step": 22793 + }, + { + "epoch": 2.8996310901920874, + "ewc_loss": 0.008632132783532143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.632132812635973e-05, + "grad_norm": 4.291077613830566, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8863930106163025, + "num_tokens": 869719527.0, + "step": 22794 + }, + { + "epoch": 2.899758300470678, + "ewc_loss": 0.008583130314946175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.583129965700209e-05, + "grad_norm": 4.273054599761963, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8875526189804077, + "num_tokens": 869755139.0, + "step": 22795 + }, 
+ { + "epoch": 2.8998855107492685, + "ewc_loss": 0.008581859059631824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581858855905011e-05, + "grad_norm": 4.327557563781738, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8770710229873657, + "num_tokens": 869791860.0, + "step": 22796 + }, + { + "epoch": 2.900012721027859, + "ewc_loss": 0.008635553531348705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.635553240310401e-05, + "grad_norm": 4.294064998626709, + "learning_rate": 1e-06, + "loss": 0.2807, + "mean_token_accuracy": 0.9032622575759888, + "num_tokens": 869827294.0, + "step": 22797 + }, + { + "epoch": 2.9001399313064495, + "ewc_loss": 0.008598501794040203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598501881351694e-05, + "grad_norm": 4.364288806915283, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8776276111602783, + "num_tokens": 869859398.0, + "step": 22798 + }, + { + "epoch": 2.90026714158504, + "ewc_loss": 0.008648189716041088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.648189395898953e-05, + "grad_norm": 4.283404350280762, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8826289176940918, + "num_tokens": 869898215.0, + "step": 22799 + }, + { + "epoch": 2.9003943518636306, + "ewc_loss": 0.008579738438129425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579738641856238e-05, + "grad_norm": 4.250940322875977, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8737310767173767, + "num_tokens": 869939787.0, + "step": 22800 + }, + { + "epoch": 2.900521562142221, + "ewc_loss": 0.008616043254733086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616043487563729e-05, + "grad_norm": 4.338482856750488, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8744722604751587, + "num_tokens": 869976790.0, + "step": 22801 + }, + { + "epoch": 2.9006487724208116, + "ewc_loss": 0.008660497143864632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.660496678203344e-05, + "grad_norm": 4.273170471191406, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8864858150482178, + "num_tokens": 870013547.0, + "step": 22802 + }, + { + "epoch": 2.900775982699402, + "ewc_loss": 0.008604445494711399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604445611126721e-05, + "grad_norm": 4.272949695587158, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.894094705581665, + "num_tokens": 870049540.0, + "step": 22803 + }, + { + "epoch": 2.9009031929779927, + "ewc_loss": 0.008639657869935036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.639657608000562e-05, + "grad_norm": 4.340996742248535, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8883163332939148, + "num_tokens": 870088247.0, + "step": 22804 + }, + { + "epoch": 2.9010304032565832, + "ewc_loss": 0.00866636075079441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.666361100040376e-05, + "grad_norm": 4.2684760093688965, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8817984461784363, + "num_tokens": 870129112.0, + "step": 22805 + }, + { + "epoch": 2.9011576135351738, + "ewc_loss": 0.00860607996582985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.606079791206867e-05, + "grad_norm": 4.335987567901611, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8845999240875244, + "num_tokens": 870162082.0, + "step": 22806 + }, + { + "epoch": 2.9012848238137643, + "ewc_loss": 0.008674736134707928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.674736454850063e-05, + "grad_norm": 4.2696919441223145, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8924372792243958, + "num_tokens": 870199868.0, + "step": 22807 + }, + { + "epoch": 2.901412034092355, + "ewc_loss": 0.008603799156844616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603799506090581e-05, + "grad_norm": 4.342695713043213, + "learning_rate": 1e-06, + "loss": 0.3335, + 
"mean_token_accuracy": 0.8856382369995117, + "num_tokens": 870238121.0, + "step": 22808 + }, + { + "epoch": 2.9015392443709453, + "ewc_loss": 0.008659912273287773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.659912418806925e-05, + "grad_norm": 4.267716884613037, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8730643391609192, + "num_tokens": 870278008.0, + "step": 22809 + }, + { + "epoch": 2.9016664546495354, + "ewc_loss": 0.00859763938933611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597638952778652e-05, + "grad_norm": 4.270035266876221, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8807201385498047, + "num_tokens": 870317671.0, + "step": 22810 + }, + { + "epoch": 2.9017936649281264, + "ewc_loss": 0.008641570806503296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64157045725733e-05, + "grad_norm": 4.343099594116211, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8912976980209351, + "num_tokens": 870352893.0, + "step": 22811 + }, + { + "epoch": 2.9019208752067165, + "ewc_loss": 0.008657876402139664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.657876605866477e-05, + "grad_norm": 4.305947780609131, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8907644152641296, + "num_tokens": 870387513.0, + "step": 22812 + }, + { + "epoch": 2.9020480854853075, + "ewc_loss": 0.008619801141321659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619800792075694e-05, + "grad_norm": 4.280101299285889, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8890560269355774, + "num_tokens": 870428649.0, + "step": 22813 + }, + { + "epoch": 2.9021752957638975, + "ewc_loss": 0.008618338033556938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618338324595243e-05, + "grad_norm": 4.275790691375732, + "learning_rate": 1e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.895573616027832, + "num_tokens": 870468329.0, + "step": 22814 + }, + { + "epoch": 
2.902302506042488, + "ewc_loss": 0.008616545237600803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616544801043347e-05, + "grad_norm": 4.304214000701904, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8812555074691772, + "num_tokens": 870509273.0, + "step": 22815 + }, + { + "epoch": 2.9024297163210786, + "ewc_loss": 0.0086271483451128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627148054074496e-05, + "grad_norm": 4.3390655517578125, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8777939081192017, + "num_tokens": 870544206.0, + "step": 22816 + }, + { + "epoch": 2.902556926599669, + "ewc_loss": 0.00863595213741064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.635951962787658e-05, + "grad_norm": 4.2963547706604, + "learning_rate": 1e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.8992871642112732, + "num_tokens": 870578708.0, + "step": 22817 + }, + { + "epoch": 2.9026841368782597, + "ewc_loss": 0.008611209690570831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611210068920627e-05, + "grad_norm": 4.279760360717773, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8893529772758484, + "num_tokens": 870620599.0, + "step": 22818 + }, + { + "epoch": 2.90281134715685, + "ewc_loss": 0.008609239012002945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609239012002945e-05, + "grad_norm": 4.330160140991211, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8975045680999756, + "num_tokens": 870657828.0, + "step": 22819 + }, + { + "epoch": 2.9029385574354407, + "ewc_loss": 0.008636836893856525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636836719233543e-05, + "grad_norm": 4.32362699508667, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8782105445861816, + "num_tokens": 870694786.0, + "step": 22820 + }, + { + "epoch": 2.9030657677140312, + "ewc_loss": 0.008621662855148315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.621662709629163e-05, + "grad_norm": 4.2711052894592285, + "learning_rate": 1e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.8994483351707458, + "num_tokens": 870733850.0, + "step": 22821 + }, + { + "epoch": 2.9031929779926218, + "ewc_loss": 0.008577621541917324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577621338190511e-05, + "grad_norm": 4.333593845367432, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8826693892478943, + "num_tokens": 870771556.0, + "step": 22822 + }, + { + "epoch": 2.9033201882712123, + "ewc_loss": 0.008631966076791286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631966193206608e-05, + "grad_norm": 4.2855634689331055, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8945809602737427, + "num_tokens": 870809335.0, + "step": 22823 + }, + { + "epoch": 2.903447398549803, + "ewc_loss": 0.008573122322559357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.5731218860019e-05, + "grad_norm": 4.3336944580078125, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8814567923545837, + "num_tokens": 870844239.0, + "step": 22824 + }, + { + "epoch": 2.9035746088283934, + "ewc_loss": 0.008608329109847546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.608329517301172e-05, + "grad_norm": 4.317641735076904, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.881829023361206, + "num_tokens": 870879540.0, + "step": 22825 + }, + { + "epoch": 2.903701819106984, + "ewc_loss": 0.008603885769844055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60388608998619e-05, + "grad_norm": 4.364024639129639, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8825640678405762, + "num_tokens": 870917219.0, + "step": 22826 + }, + { + "epoch": 2.9038290293855744, + "ewc_loss": 0.008623266592621803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623266330687329e-05, + "grad_norm": 4.348941326141357, + "learning_rate": 1e-06, + "loss": 0.3333, + 
"mean_token_accuracy": 0.8850483298301697, + "num_tokens": 870952332.0, + "step": 22827 + }, + { + "epoch": 2.903956239664165, + "ewc_loss": 0.008599579334259033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.599579450674355e-05, + "grad_norm": 4.300745010375977, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.873971700668335, + "num_tokens": 870991609.0, + "step": 22828 + }, + { + "epoch": 2.9040834499427555, + "ewc_loss": 0.008596187457442284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.596187399234623e-05, + "grad_norm": 4.315936088562012, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8872889280319214, + "num_tokens": 871027456.0, + "step": 22829 + }, + { + "epoch": 2.904210660221346, + "ewc_loss": 0.008618867956101894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618868014309555e-05, + "grad_norm": 4.279537677764893, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8757918477058411, + "num_tokens": 871071833.0, + "step": 22830 + }, + { + "epoch": 2.9043378704999365, + "ewc_loss": 0.008604142814874649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604142931289971e-05, + "grad_norm": 4.355060577392578, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8813985586166382, + "num_tokens": 871105446.0, + "step": 22831 + }, + { + "epoch": 2.904465080778527, + "ewc_loss": 0.008663320913910866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.66332120494917e-05, + "grad_norm": 4.298888683319092, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8812615871429443, + "num_tokens": 871141408.0, + "step": 22832 + }, + { + "epoch": 2.9045922910571176, + "ewc_loss": 0.008606153540313244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60615327837877e-05, + "grad_norm": 4.34329080581665, + "learning_rate": 1e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.887848436832428, + "num_tokens": 871173296.0, + "step": 22833 + }, + { + "epoch": 
2.904719501335708, + "ewc_loss": 0.00866793654859066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667936344863847e-05, + "grad_norm": 4.3214430809021, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8805654048919678, + "num_tokens": 871210711.0, + "step": 22834 + }, + { + "epoch": 2.904846711614298, + "ewc_loss": 0.008651824668049812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.651824464322999e-05, + "grad_norm": 4.280618667602539, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8810668587684631, + "num_tokens": 871251179.0, + "step": 22835 + }, + { + "epoch": 2.904973921892889, + "ewc_loss": 0.008634255267679691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63425520947203e-05, + "grad_norm": 4.21975564956665, + "learning_rate": 1e-06, + "loss": 0.2802, + "mean_token_accuracy": 0.9011653661727905, + "num_tokens": 871295758.0, + "step": 22836 + }, + { + "epoch": 2.9051011321714793, + "ewc_loss": 0.008610059507191181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610059740021825e-05, + "grad_norm": 4.377584934234619, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8691628575325012, + "num_tokens": 871331145.0, + "step": 22837 + }, + { + "epoch": 2.9052283424500702, + "ewc_loss": 0.008725726045668125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.72572636581026e-05, + "grad_norm": 4.327125072479248, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8603556156158447, + "num_tokens": 871372819.0, + "step": 22838 + }, + { + "epoch": 2.9053555527286603, + "ewc_loss": 0.00865465309470892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.654653356643394e-05, + "grad_norm": 4.270230770111084, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8774399757385254, + "num_tokens": 871414223.0, + "step": 22839 + }, + { + "epoch": 2.905482763007251, + "ewc_loss": 0.008636973798274994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63697350723669e-05, + 
"grad_norm": 4.287652015686035, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8882135152816772, + "num_tokens": 871455013.0, + "step": 22840 + }, + { + "epoch": 2.9056099732858414, + "ewc_loss": 0.008671123534440994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67112394189462e-05, + "grad_norm": 4.385905742645264, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8818881511688232, + "num_tokens": 871495462.0, + "step": 22841 + }, + { + "epoch": 2.905737183564432, + "ewc_loss": 0.008691977709531784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.691977564012632e-05, + "grad_norm": 4.310616493225098, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8854761123657227, + "num_tokens": 871534999.0, + "step": 22842 + }, + { + "epoch": 2.9058643938430224, + "ewc_loss": 0.00861162506043911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611624798504636e-05, + "grad_norm": 4.340031147003174, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.883907675743103, + "num_tokens": 871569816.0, + "step": 22843 + }, + { + "epoch": 2.905991604121613, + "ewc_loss": 0.008668797090649605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.668797090649605e-05, + "grad_norm": 4.291904449462891, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8878891468048096, + "num_tokens": 871610441.0, + "step": 22844 + }, + { + "epoch": 2.9061188144002035, + "ewc_loss": 0.008628747425973415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.628747309558094e-05, + "grad_norm": 4.368088245391846, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8716133236885071, + "num_tokens": 871649005.0, + "step": 22845 + }, + { + "epoch": 2.906246024678794, + "ewc_loss": 0.00868017878383398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.680178871145472e-05, + "grad_norm": 4.311079502105713, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8794689178466797, + 
"num_tokens": 871689263.0, + "step": 22846 + }, + { + "epoch": 2.9063732349573845, + "ewc_loss": 0.008631841279566288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631841046735644e-05, + "grad_norm": 4.331147193908691, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.889998733997345, + "num_tokens": 871725440.0, + "step": 22847 + }, + { + "epoch": 2.906500445235975, + "ewc_loss": 0.008650227449834347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.650227391626686e-05, + "grad_norm": 4.338788032531738, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8792001008987427, + "num_tokens": 871767839.0, + "step": 22848 + }, + { + "epoch": 2.9066276555145656, + "ewc_loss": 0.008612468838691711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612468809587881e-05, + "grad_norm": 4.316498756408691, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8881184458732605, + "num_tokens": 871801516.0, + "step": 22849 + }, + { + "epoch": 2.906754865793156, + "ewc_loss": 0.008615306578576565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615306433057413e-05, + "grad_norm": 4.287432670593262, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8791137933731079, + "num_tokens": 871840359.0, + "step": 22850 + }, + { + "epoch": 2.9068820760717466, + "ewc_loss": 0.00861759576946497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.617595449322835e-05, + "grad_norm": 4.3102312088012695, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8862869739532471, + "num_tokens": 871879396.0, + "step": 22851 + }, + { + "epoch": 2.907009286350337, + "ewc_loss": 0.008615382947027683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615382830612361e-05, + "grad_norm": 4.3089680671691895, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8756926655769348, + "num_tokens": 871915043.0, + "step": 22852 + }, + { + "epoch": 2.9071364966289277, + "ewc_loss": 0.008630766533315182, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63076638779603e-05, + "grad_norm": 4.3237624168396, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8782145380973816, + "num_tokens": 871953391.0, + "step": 22853 + }, + { + "epoch": 2.9072637069075182, + "ewc_loss": 0.008623428642749786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623428584542125e-05, + "grad_norm": 4.293524742126465, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.886813759803772, + "num_tokens": 871989879.0, + "step": 22854 + }, + { + "epoch": 2.9073909171861088, + "ewc_loss": 0.00860794261097908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607942436356097e-05, + "grad_norm": 4.362424850463867, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.885696291923523, + "num_tokens": 872022428.0, + "step": 22855 + }, + { + "epoch": 2.9075181274646993, + "ewc_loss": 0.008668492548167706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.668492955621332e-05, + "grad_norm": 4.274699687957764, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8892259001731873, + "num_tokens": 872060644.0, + "step": 22856 + }, + { + "epoch": 2.90764533774329, + "ewc_loss": 0.008598251268267632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598250860814005e-05, + "grad_norm": 4.329441070556641, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8807369470596313, + "num_tokens": 872098230.0, + "step": 22857 + }, + { + "epoch": 2.90777254802188, + "ewc_loss": 0.008662732318043709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.662732579978183e-05, + "grad_norm": 4.343163967132568, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8726672530174255, + "num_tokens": 872131217.0, + "step": 22858 + }, + { + "epoch": 2.907899758300471, + "ewc_loss": 0.008644690737128258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.644691115478054e-05, + "grad_norm": 4.350208759307861, + "learning_rate": 
1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8920352458953857, + "num_tokens": 872163912.0, + "step": 22859 + }, + { + "epoch": 2.908026968579061, + "ewc_loss": 0.008626711554825306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.626711496617645e-05, + "grad_norm": 4.354964256286621, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8599832057952881, + "num_tokens": 872200694.0, + "step": 22860 + }, + { + "epoch": 2.908154178857652, + "ewc_loss": 0.00866286363452673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.662863547215238e-05, + "grad_norm": 4.292355060577393, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8846905827522278, + "num_tokens": 872241156.0, + "step": 22861 + }, + { + "epoch": 2.908281389136242, + "ewc_loss": 0.008602834306657314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602834714110941e-05, + "grad_norm": 4.373897552490234, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8858238458633423, + "num_tokens": 872273048.0, + "step": 22862 + }, + { + "epoch": 2.908408599414833, + "ewc_loss": 0.008685177192091942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.685176726430655e-05, + "grad_norm": 4.2715277671813965, + "learning_rate": 1e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.887718677520752, + "num_tokens": 872312165.0, + "step": 22863 + }, + { + "epoch": 2.908535809693423, + "ewc_loss": 0.008594084531068802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594084647484124e-05, + "grad_norm": 4.287688255310059, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.884757399559021, + "num_tokens": 872352740.0, + "step": 22864 + }, + { + "epoch": 2.9086630199720136, + "ewc_loss": 0.00864990521222353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.649905066704378e-05, + "grad_norm": 4.2969536781311035, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8704781532287598, + "num_tokens": 872397800.0, + "step": 22865 + }, + { 
+ "epoch": 2.908790230250604, + "ewc_loss": 0.008654160425066948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.654160774312913e-05, + "grad_norm": 4.347640037536621, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8931629657745361, + "num_tokens": 872429307.0, + "step": 22866 + }, + { + "epoch": 2.9089174405291947, + "ewc_loss": 0.00865752063691616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.657520811539143e-05, + "grad_norm": 4.3000006675720215, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.88097083568573, + "num_tokens": 872465897.0, + "step": 22867 + }, + { + "epoch": 2.909044650807785, + "ewc_loss": 0.008601699955761433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601699664723128e-05, + "grad_norm": 4.282626628875732, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.874279260635376, + "num_tokens": 872503252.0, + "step": 22868 + }, + { + "epoch": 2.9091718610863757, + "ewc_loss": 0.008656179532408714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.656179124955088e-05, + "grad_norm": 4.289488792419434, + "learning_rate": 1e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8869104385375977, + "num_tokens": 872546276.0, + "step": 22869 + }, + { + "epoch": 2.9092990713649662, + "ewc_loss": 0.008646402508020401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64640242070891e-05, + "grad_norm": 4.321545600891113, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8818549513816833, + "num_tokens": 872585690.0, + "step": 22870 + }, + { + "epoch": 2.9094262816435568, + "ewc_loss": 0.008650308474898338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.650308882351965e-05, + "grad_norm": 4.356181621551514, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8719279766082764, + "num_tokens": 872621718.0, + "step": 22871 + }, + { + "epoch": 2.9095534919221473, + "ewc_loss": 0.008672142401337624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.672142575960606e-05, + "grad_norm": 4.276366710662842, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8866524696350098, + "num_tokens": 872659349.0, + "step": 22872 + }, + { + "epoch": 2.909680702200738, + "ewc_loss": 0.008606649935245514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.606650226283818e-05, + "grad_norm": 4.2485761642456055, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8781636357307434, + "num_tokens": 872704468.0, + "step": 22873 + }, + { + "epoch": 2.9098079124793284, + "ewc_loss": 0.008643223904073238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.643223554827273e-05, + "grad_norm": 4.341277599334717, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8825806379318237, + "num_tokens": 872740003.0, + "step": 22874 + }, + { + "epoch": 2.909935122757919, + "ewc_loss": 0.0086698979139328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.669897943036631e-05, + "grad_norm": 4.327916145324707, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8622355461120605, + "num_tokens": 872780362.0, + "step": 22875 + }, + { + "epoch": 2.9100623330365094, + "ewc_loss": 0.008638820610940456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63882087287493e-05, + "grad_norm": 4.265514850616455, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8905010223388672, + "num_tokens": 872823719.0, + "step": 22876 + }, + { + "epoch": 2.9101895433151, + "ewc_loss": 0.00860883854329586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.608838106738403e-05, + "grad_norm": 4.336216449737549, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8876968026161194, + "num_tokens": 872856276.0, + "step": 22877 + }, + { + "epoch": 2.9103167535936905, + "ewc_loss": 0.00868086889386177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.680869359523058e-05, + "grad_norm": 4.290933132171631, + "learning_rate": 1e-06, + "loss": 0.3528, + 
"mean_token_accuracy": 0.8767591714859009, + "num_tokens": 872893688.0, + "step": 22878 + }, + { + "epoch": 2.910443963872281, + "ewc_loss": 0.008622128516435623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622128370916471e-05, + "grad_norm": 4.250970363616943, + "learning_rate": 1e-06, + "loss": 0.285, + "mean_token_accuracy": 0.9000279307365417, + "num_tokens": 872931169.0, + "step": 22879 + }, + { + "epoch": 2.9105711741508715, + "ewc_loss": 0.008606135845184326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.606135816080496e-05, + "grad_norm": 4.279464244842529, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8788032531738281, + "num_tokens": 872970836.0, + "step": 22880 + }, + { + "epoch": 2.910698384429462, + "ewc_loss": 0.008650494739413261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.650495146866888e-05, + "grad_norm": 4.285689353942871, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8956775069236755, + "num_tokens": 873010879.0, + "step": 22881 + }, + { + "epoch": 2.9108255947080526, + "ewc_loss": 0.00865220557898283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.652205724501982e-05, + "grad_norm": 4.28026008605957, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8842296600341797, + "num_tokens": 873058018.0, + "step": 22882 + }, + { + "epoch": 2.9109528049866427, + "ewc_loss": 0.008628438226878643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.628438081359491e-05, + "grad_norm": 4.363658428192139, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8736968636512756, + "num_tokens": 873096259.0, + "step": 22883 + }, + { + "epoch": 2.9110800152652336, + "ewc_loss": 0.008655444718897343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.655444980831817e-05, + "grad_norm": 4.362448215484619, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.873907744884491, + "num_tokens": 873131955.0, + "step": 22884 + }, + { + "epoch": 
2.9112072255438237, + "ewc_loss": 0.008636645041406155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636644633952528e-05, + "grad_norm": 4.327365875244141, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8778791427612305, + "num_tokens": 873171024.0, + "step": 22885 + }, + { + "epoch": 2.9113344358224147, + "ewc_loss": 0.008620605804026127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.620606240583584e-05, + "grad_norm": 4.302474498748779, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8852239847183228, + "num_tokens": 873210568.0, + "step": 22886 + }, + { + "epoch": 2.9114616461010048, + "ewc_loss": 0.008611435070633888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611434896010906e-05, + "grad_norm": 4.319182872772217, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8846806883811951, + "num_tokens": 873250025.0, + "step": 22887 + }, + { + "epoch": 2.9115888563795957, + "ewc_loss": 0.008617321960628033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.617321873316541e-05, + "grad_norm": 4.325746536254883, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8742837905883789, + "num_tokens": 873285284.0, + "step": 22888 + }, + { + "epoch": 2.911716066658186, + "ewc_loss": 0.008646830916404724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.646830974612385e-05, + "grad_norm": 4.404017925262451, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8863779306411743, + "num_tokens": 873320102.0, + "step": 22889 + }, + { + "epoch": 2.9118432769367764, + "ewc_loss": 0.008668475784361362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.668476220918819e-05, + "grad_norm": 4.309784412384033, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8876293897628784, + "num_tokens": 873354668.0, + "step": 22890 + }, + { + "epoch": 2.911970487215367, + "ewc_loss": 0.008606402203440666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.606402116129175e-05, + "grad_norm": 4.431946754455566, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8798408508300781, + "num_tokens": 873387415.0, + "step": 22891 + }, + { + "epoch": 2.9120976974939574, + "ewc_loss": 0.008721359074115753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.721358608454466e-05, + "grad_norm": 4.2307915687561035, + "learning_rate": 1e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.9001458883285522, + "num_tokens": 873428679.0, + "step": 22892 + }, + { + "epoch": 2.912224907772548, + "ewc_loss": 0.008554378524422646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.554378291592002e-05, + "grad_norm": 4.326775074005127, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.883243203163147, + "num_tokens": 873466973.0, + "step": 22893 + }, + { + "epoch": 2.9123521180511385, + "ewc_loss": 0.008694762364029884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.69476207299158e-05, + "grad_norm": 4.264840602874756, + "learning_rate": 1e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.8971023559570312, + "num_tokens": 873510957.0, + "step": 22894 + }, + { + "epoch": 2.912479328329729, + "ewc_loss": 0.008604091592133045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604091999586672e-05, + "grad_norm": 4.396676063537598, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8669232130050659, + "num_tokens": 873546456.0, + "step": 22895 + }, + { + "epoch": 2.9126065386083195, + "ewc_loss": 0.008689395152032375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.689395326655358e-05, + "grad_norm": 4.28019905090332, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.871991753578186, + "num_tokens": 873591972.0, + "step": 22896 + }, + { + "epoch": 2.91273374888691, + "ewc_loss": 0.00859170500189066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.591705409344286e-05, + "grad_norm": 4.397172451019287, + "learning_rate": 1e-06, + "loss": 0.3364, + 
"mean_token_accuracy": 0.8834193348884583, + "num_tokens": 873629790.0, + "step": 22897 + }, + { + "epoch": 2.9128609591655006, + "ewc_loss": 0.008677170611917973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.677170262672007e-05, + "grad_norm": 4.2729926109313965, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8814183473587036, + "num_tokens": 873667419.0, + "step": 22898 + }, + { + "epoch": 2.912988169444091, + "ewc_loss": 0.008577938191592693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577938569942489e-05, + "grad_norm": 4.327995300292969, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8912909030914307, + "num_tokens": 873705834.0, + "step": 22899 + }, + { + "epoch": 2.9131153797226816, + "ewc_loss": 0.008665459230542183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.665459608891979e-05, + "grad_norm": 4.28243350982666, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8745477199554443, + "num_tokens": 873748859.0, + "step": 22900 + }, + { + "epoch": 2.913242590001272, + "ewc_loss": 0.008613375015556812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613375393906608e-05, + "grad_norm": 4.353865623474121, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8755836486816406, + "num_tokens": 873788068.0, + "step": 22901 + }, + { + "epoch": 2.9133698002798627, + "ewc_loss": 0.00866403803229332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.664037886774167e-05, + "grad_norm": 4.335278034210205, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8895751237869263, + "num_tokens": 873827820.0, + "step": 22902 + }, + { + "epoch": 2.9134970105584532, + "ewc_loss": 0.008631779812276363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631779928691685e-05, + "grad_norm": 4.321875095367432, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8799603581428528, + "num_tokens": 873866271.0, + "step": 22903 + }, + { + "epoch": 
2.9136242208370438, + "ewc_loss": 0.008619715459644794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619715663371608e-05, + "grad_norm": 4.282899856567383, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8793169260025024, + "num_tokens": 873907409.0, + "step": 22904 + }, + { + "epoch": 2.9137514311156343, + "ewc_loss": 0.008590321987867355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59032224980183e-05, + "grad_norm": 4.296074390411377, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8884791135787964, + "num_tokens": 873947940.0, + "step": 22905 + }, + { + "epoch": 2.913878641394225, + "ewc_loss": 0.008597392588853836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597393025411293e-05, + "grad_norm": 4.287322521209717, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8687434196472168, + "num_tokens": 873990062.0, + "step": 22906 + }, + { + "epoch": 2.9140058516728153, + "ewc_loss": 0.0085853710770607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585370960645378e-05, + "grad_norm": 4.327909469604492, + "learning_rate": 1e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.895245373249054, + "num_tokens": 874026660.0, + "step": 22907 + }, + { + "epoch": 2.9141330619514054, + "ewc_loss": 0.008602024987339973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602024900028482e-05, + "grad_norm": 4.302962303161621, + "learning_rate": 1e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8889408111572266, + "num_tokens": 874064935.0, + "step": 22908 + }, + { + "epoch": 2.9142602722299964, + "ewc_loss": 0.00855179037898779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.551790233468637e-05, + "grad_norm": 4.344114780426025, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8824719786643982, + "num_tokens": 874102807.0, + "step": 22909 + }, + { + "epoch": 2.9143874825085865, + "ewc_loss": 0.008580510504543781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.580510620959103e-05, + "grad_norm": 4.297224998474121, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8847764730453491, + "num_tokens": 874139688.0, + "step": 22910 + }, + { + "epoch": 2.9145146927871775, + "ewc_loss": 0.008554494008421898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.554493979318067e-05, + "grad_norm": 4.261487007141113, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8870195150375366, + "num_tokens": 874184505.0, + "step": 22911 + }, + { + "epoch": 2.9146419030657675, + "ewc_loss": 0.00853973999619484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.539739792468026e-05, + "grad_norm": 4.291470050811768, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8824535012245178, + "num_tokens": 874223210.0, + "step": 22912 + }, + { + "epoch": 2.914769113344358, + "ewc_loss": 0.00856214202940464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562142465962097e-05, + "grad_norm": 4.318185806274414, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.872933566570282, + "num_tokens": 874263939.0, + "step": 22913 + }, + { + "epoch": 2.9148963236229486, + "ewc_loss": 0.008566619828343391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566620090277866e-05, + "grad_norm": 4.309681415557861, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8893375396728516, + "num_tokens": 874303946.0, + "step": 22914 + }, + { + "epoch": 2.915023533901539, + "ewc_loss": 0.008548918180167675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548918412998319e-05, + "grad_norm": 4.27855920791626, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8838900923728943, + "num_tokens": 874343701.0, + "step": 22915 + }, + { + "epoch": 2.9151507441801296, + "ewc_loss": 0.008528891019523144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.528891339665279e-05, + "grad_norm": 4.30885648727417, + "learning_rate": 1e-06, + "loss": 0.3332, + 
"mean_token_accuracy": 0.8857536911964417, + "num_tokens": 874380425.0, + "step": 22916 + }, + { + "epoch": 2.91527795445872, + "ewc_loss": 0.008577986620366573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577986591262743e-05, + "grad_norm": 4.33083963394165, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8836435079574585, + "num_tokens": 874417643.0, + "step": 22917 + }, + { + "epoch": 2.9154051647373107, + "ewc_loss": 0.00856654904782772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.566549513489008e-05, + "grad_norm": 4.291318416595459, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8780866861343384, + "num_tokens": 874463939.0, + "step": 22918 + }, + { + "epoch": 2.9155323750159012, + "ewc_loss": 0.008540259674191475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.540259295841679e-05, + "grad_norm": 4.336089134216309, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8814647793769836, + "num_tokens": 874503156.0, + "step": 22919 + }, + { + "epoch": 2.9156595852944918, + "ewc_loss": 0.008567161858081818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.567161421524361e-05, + "grad_norm": 4.293962001800537, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.874214768409729, + "num_tokens": 874546754.0, + "step": 22920 + }, + { + "epoch": 2.9157867955730823, + "ewc_loss": 0.008532462641596794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53246237966232e-05, + "grad_norm": 4.241419315338135, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.88585364818573, + "num_tokens": 874590781.0, + "step": 22921 + }, + { + "epoch": 2.915914005851673, + "ewc_loss": 0.008509389124810696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.509388862876222e-05, + "grad_norm": 4.32399845123291, + "learning_rate": 1e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.890561580657959, + "num_tokens": 874629592.0, + "step": 22922 + }, + { + "epoch": 2.9160412161302633, 
+ "ewc_loss": 0.008586935698986053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586936019128188e-05, + "grad_norm": 4.346529960632324, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8905768990516663, + "num_tokens": 874667372.0, + "step": 22923 + }, + { + "epoch": 2.916168426408854, + "ewc_loss": 0.008533329702913761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.533329673809931e-05, + "grad_norm": 4.303152084350586, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8814427256584167, + "num_tokens": 874704687.0, + "step": 22924 + }, + { + "epoch": 2.9162956366874444, + "ewc_loss": 0.00851460825651884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.514608634868637e-05, + "grad_norm": 4.3653364181518555, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8820403814315796, + "num_tokens": 874740471.0, + "step": 22925 + }, + { + "epoch": 2.916422846966035, + "ewc_loss": 0.008574994280934334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57499471749179e-05, + "grad_norm": 4.283044815063477, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.888495683670044, + "num_tokens": 874780747.0, + "step": 22926 + }, + { + "epoch": 2.9165500572446255, + "ewc_loss": 0.00849071517586708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.49071511765942e-05, + "grad_norm": 4.36561393737793, + "learning_rate": 1e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8858881592750549, + "num_tokens": 874814204.0, + "step": 22927 + }, + { + "epoch": 2.916677267523216, + "ewc_loss": 0.008578780107200146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578779670642689e-05, + "grad_norm": 4.357577323913574, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8829172253608704, + "num_tokens": 874851872.0, + "step": 22928 + }, + { + "epoch": 2.9168044778018065, + "ewc_loss": 0.008541565388441086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.541565330233425e-05, + "grad_norm": 
4.381836891174316, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8691773414611816, + "num_tokens": 874884215.0, + "step": 22929 + }, + { + "epoch": 2.916931688080397, + "ewc_loss": 0.008575554005801678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.575554238632321e-05, + "grad_norm": 4.345246315002441, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.879755437374115, + "num_tokens": 874919047.0, + "step": 22930 + }, + { + "epoch": 2.917058898358987, + "ewc_loss": 0.008549259975552559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549259655410424e-05, + "grad_norm": 4.297868728637695, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8766271471977234, + "num_tokens": 874956469.0, + "step": 22931 + }, + { + "epoch": 2.917186108637578, + "ewc_loss": 0.008535319939255714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.53531964821741e-05, + "grad_norm": 4.2890520095825195, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8931702375411987, + "num_tokens": 874994633.0, + "step": 22932 + }, + { + "epoch": 2.917313318916168, + "ewc_loss": 0.00856991857290268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.569919009460136e-05, + "grad_norm": 4.306224346160889, + "learning_rate": 1e-06, + "loss": 0.3039, + "mean_token_accuracy": 0.8947609663009644, + "num_tokens": 875028530.0, + "step": 22933 + }, + { + "epoch": 2.917440529194759, + "ewc_loss": 0.008590075187385082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590074867242947e-05, + "grad_norm": 4.399876594543457, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8775161504745483, + "num_tokens": 875059785.0, + "step": 22934 + }, + { + "epoch": 2.9175677394733492, + "ewc_loss": 0.008641823194921017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.641822932986543e-05, + "grad_norm": 4.28411865234375, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8820421695709229, + "num_tokens": 
875096057.0, + "step": 22935 + }, + { + "epoch": 2.91769494975194, + "ewc_loss": 0.008581448346376419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581448491895571e-05, + "grad_norm": 4.2323431968688965, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8885895013809204, + "num_tokens": 875141988.0, + "step": 22936 + }, + { + "epoch": 2.9178221600305303, + "ewc_loss": 0.008597486652433872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597486885264516e-05, + "grad_norm": 4.338250160217285, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8879754543304443, + "num_tokens": 875176973.0, + "step": 22937 + }, + { + "epoch": 2.917949370309121, + "ewc_loss": 0.008672291412949562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.672291733091697e-05, + "grad_norm": 4.324724197387695, + "learning_rate": 1e-06, + "loss": 0.3149, + "mean_token_accuracy": 0.8909786939620972, + "num_tokens": 875209271.0, + "step": 22938 + }, + { + "epoch": 2.9180765805877114, + "ewc_loss": 0.008639505133032799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.639505540486425e-05, + "grad_norm": 4.262084484100342, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8885046243667603, + "num_tokens": 875247460.0, + "step": 22939 + }, + { + "epoch": 2.918203790866302, + "ewc_loss": 0.00862614531069994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.626145427115262e-05, + "grad_norm": 4.338172435760498, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8815429210662842, + "num_tokens": 875286013.0, + "step": 22940 + }, + { + "epoch": 2.9183310011448924, + "ewc_loss": 0.008693153969943523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.693154086358845e-05, + "grad_norm": 4.301191806793213, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.880923330783844, + "num_tokens": 875324547.0, + "step": 22941 + }, + { + "epoch": 2.918458211423483, + "ewc_loss": 0.008638141676783562, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.638141298433766e-05, + "grad_norm": 4.273446559906006, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8753985166549683, + "num_tokens": 875366655.0, + "step": 22942 + }, + { + "epoch": 2.9185854217020735, + "ewc_loss": 0.008616816252470016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616816194262356e-05, + "grad_norm": 4.306297302246094, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8769262433052063, + "num_tokens": 875408571.0, + "step": 22943 + }, + { + "epoch": 2.918712631980664, + "ewc_loss": 0.00866225827485323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.662258187541738e-05, + "grad_norm": 4.314001560211182, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8799929618835449, + "num_tokens": 875450730.0, + "step": 22944 + }, + { + "epoch": 2.9188398422592545, + "ewc_loss": 0.008624554611742496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624554902780801e-05, + "grad_norm": 4.299586772918701, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8868496417999268, + "num_tokens": 875486850.0, + "step": 22945 + }, + { + "epoch": 2.918967052537845, + "ewc_loss": 0.008625850081443787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625850023236126e-05, + "grad_norm": 4.323875427246094, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8770967721939087, + "num_tokens": 875528653.0, + "step": 22946 + }, + { + "epoch": 2.9190942628164356, + "ewc_loss": 0.008630527183413506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.630527008790523e-05, + "grad_norm": 4.300317287445068, + "learning_rate": 1e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8939275145530701, + "num_tokens": 875564155.0, + "step": 22947 + }, + { + "epoch": 2.919221473095026, + "ewc_loss": 0.008601359091699123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601359149906784e-05, + "grad_norm": 4.349273204803467, + 
"learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8813419342041016, + "num_tokens": 875598548.0, + "step": 22948 + }, + { + "epoch": 2.9193486833736166, + "ewc_loss": 0.008635279722511768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.635279664304107e-05, + "grad_norm": 4.324822902679443, + "learning_rate": 1e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.8968950510025024, + "num_tokens": 875634587.0, + "step": 22949 + }, + { + "epoch": 2.919475893652207, + "ewc_loss": 0.008595923893153667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595924009568989e-05, + "grad_norm": 4.3253703117370605, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8822638988494873, + "num_tokens": 875671956.0, + "step": 22950 + }, + { + "epoch": 2.9196031039307977, + "ewc_loss": 0.00859803892672062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598039130447432e-05, + "grad_norm": 4.338245868682861, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8855573534965515, + "num_tokens": 875706327.0, + "step": 22951 + }, + { + "epoch": 2.9197303142093882, + "ewc_loss": 0.00862210988998413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622109453426674e-05, + "grad_norm": 4.279513359069824, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8885360956192017, + "num_tokens": 875749320.0, + "step": 22952 + }, + { + "epoch": 2.9198575244879788, + "ewc_loss": 0.008557552471756935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.55755279189907e-05, + "grad_norm": 4.328281879425049, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8917492032051086, + "num_tokens": 875783871.0, + "step": 22953 + }, + { + "epoch": 2.9199847347665693, + "ewc_loss": 0.00861638505011797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616384729975834e-05, + "grad_norm": 4.316596508026123, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.883358359336853, + "num_tokens": 875821716.0, + 
"step": 22954 + }, + { + "epoch": 2.92011194504516, + "ewc_loss": 0.008581633679568768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581633301218972e-05, + "grad_norm": 4.363803863525391, + "learning_rate": 1e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8858445882797241, + "num_tokens": 875854339.0, + "step": 22955 + }, + { + "epoch": 2.92023915532375, + "ewc_loss": 0.008595895953476429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595896360930055e-05, + "grad_norm": 4.324377536773682, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.885585367679596, + "num_tokens": 875889456.0, + "step": 22956 + }, + { + "epoch": 2.920366365602341, + "ewc_loss": 0.008593014441430569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.593014354119077e-05, + "grad_norm": 4.345615863800049, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8755571842193604, + "num_tokens": 875926659.0, + "step": 22957 + }, + { + "epoch": 2.920493575880931, + "ewc_loss": 0.008597935549914837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597935811849311e-05, + "grad_norm": 4.269312858581543, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8900213241577148, + "num_tokens": 875972766.0, + "step": 22958 + }, + { + "epoch": 2.920620786159522, + "ewc_loss": 0.008557952009141445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.557952241972089e-05, + "grad_norm": 4.360966205596924, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8795891404151917, + "num_tokens": 876011824.0, + "step": 22959 + }, + { + "epoch": 2.920747996438112, + "ewc_loss": 0.008638809435069561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.638809231342748e-05, + "grad_norm": 4.382115364074707, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8668617606163025, + "num_tokens": 876050353.0, + "step": 22960 + }, + { + "epoch": 2.920875206716703, + "ewc_loss": 0.008601699955761433, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.601699664723128e-05, + "grad_norm": 4.337091445922852, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8683533668518066, + "num_tokens": 876089358.0, + "step": 22961 + }, + { + "epoch": 2.921002416995293, + "ewc_loss": 0.008594333194196224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594332757638767e-05, + "grad_norm": 4.321307182312012, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8709673881530762, + "num_tokens": 876126287.0, + "step": 22962 + }, + { + "epoch": 2.9211296272738836, + "ewc_loss": 0.008605563081800938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60556319821626e-05, + "grad_norm": 4.307648658752441, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8910321593284607, + "num_tokens": 876165157.0, + "step": 22963 + }, + { + "epoch": 2.921256837552474, + "ewc_loss": 0.008618833497166634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.618833817308769e-05, + "grad_norm": 4.308228969573975, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8860737085342407, + "num_tokens": 876201391.0, + "step": 22964 + }, + { + "epoch": 2.9213840478310646, + "ewc_loss": 0.00863596796989441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.635967969894409e-05, + "grad_norm": 4.253725528717041, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.889279305934906, + "num_tokens": 876243004.0, + "step": 22965 + }, + { + "epoch": 2.921511258109655, + "ewc_loss": 0.008625048212707043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625048212707043e-05, + "grad_norm": 4.338254451751709, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8828526735305786, + "num_tokens": 876279245.0, + "step": 22966 + }, + { + "epoch": 2.9216384683882457, + "ewc_loss": 0.008677779696881771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.677779260324314e-05, + "grad_norm": 4.4084649085998535, + "learning_rate": 1e-06, + "loss": 0.3303, 
+ "mean_token_accuracy": 0.8831992745399475, + "num_tokens": 876312145.0, + "step": 22967 + }, + { + "epoch": 2.9217656786668362, + "ewc_loss": 0.008695729076862335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.695729047758505e-05, + "grad_norm": 4.32283353805542, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8835740089416504, + "num_tokens": 876351354.0, + "step": 22968 + }, + { + "epoch": 2.9218928889454268, + "ewc_loss": 0.008607101626694202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607102063251659e-05, + "grad_norm": 4.3632049560546875, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8726859092712402, + "num_tokens": 876388621.0, + "step": 22969 + }, + { + "epoch": 2.9220200992240173, + "ewc_loss": 0.008686178363859653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.686178625794128e-05, + "grad_norm": 4.45531702041626, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8772749900817871, + "num_tokens": 876418377.0, + "step": 22970 + }, + { + "epoch": 2.922147309502608, + "ewc_loss": 0.008714498020708561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.714498108020052e-05, + "grad_norm": 4.307548522949219, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8856613636016846, + "num_tokens": 876453351.0, + "step": 22971 + }, + { + "epoch": 2.9222745197811983, + "ewc_loss": 0.008617483079433441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.617483399575576e-05, + "grad_norm": 4.368175506591797, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8879415988922119, + "num_tokens": 876484830.0, + "step": 22972 + }, + { + "epoch": 2.922401730059789, + "ewc_loss": 0.008708156645298004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.70815638336353e-05, + "grad_norm": 4.272921085357666, + "learning_rate": 1e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8926692605018616, + "num_tokens": 876524318.0, + "step": 22973 + }, + { + "epoch": 
2.9225289403383794, + "ewc_loss": 0.008643492124974728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.643492037663236e-05, + "grad_norm": 4.287024021148682, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8809106349945068, + "num_tokens": 876562421.0, + "step": 22974 + }, + { + "epoch": 2.92265615061697, + "ewc_loss": 0.008703677915036678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.703678031452e-05, + "grad_norm": 4.2878336906433105, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8777822256088257, + "num_tokens": 876605681.0, + "step": 22975 + }, + { + "epoch": 2.9227833608955605, + "ewc_loss": 0.008681817911565304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.681818144395947e-05, + "grad_norm": 4.311631202697754, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8743027448654175, + "num_tokens": 876647371.0, + "step": 22976 + }, + { + "epoch": 2.922910571174151, + "ewc_loss": 0.008719361387193203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.719361358089373e-05, + "grad_norm": 4.311737060546875, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8895746469497681, + "num_tokens": 876685563.0, + "step": 22977 + }, + { + "epoch": 2.9230377814527415, + "ewc_loss": 0.008678040467202663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.678040467202663e-05, + "grad_norm": 4.361528396606445, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8779999613761902, + "num_tokens": 876718404.0, + "step": 22978 + }, + { + "epoch": 2.923164991731332, + "ewc_loss": 0.008713804185390472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.713803981663659e-05, + "grad_norm": 4.35382604598999, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8849725127220154, + "num_tokens": 876751888.0, + "step": 22979 + }, + { + "epoch": 2.9232922020099226, + "ewc_loss": 0.008701138198375702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.701137994648889e-05, + "grad_norm": 4.333381175994873, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8879202604293823, + "num_tokens": 876786813.0, + "step": 22980 + }, + { + "epoch": 2.9234194122885127, + "ewc_loss": 0.00869428925216198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.694289135746658e-05, + "grad_norm": 4.289815425872803, + "learning_rate": 1e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8918673992156982, + "num_tokens": 876824511.0, + "step": 22981 + }, + { + "epoch": 2.9235466225671036, + "ewc_loss": 0.008662044070661068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.662044274387881e-05, + "grad_norm": 4.371901988983154, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8866243362426758, + "num_tokens": 876856409.0, + "step": 22982 + }, + { + "epoch": 2.9236738328456937, + "ewc_loss": 0.008719644509255886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.719644392840564e-05, + "grad_norm": 4.37305212020874, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8719121217727661, + "num_tokens": 876889204.0, + "step": 22983 + }, + { + "epoch": 2.9238010431242847, + "ewc_loss": 0.00869420450180769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.694204734638333e-05, + "grad_norm": 4.286905288696289, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8822662830352783, + "num_tokens": 876927185.0, + "step": 22984 + }, + { + "epoch": 2.9239282534028748, + "ewc_loss": 0.008659319020807743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.659318700665608e-05, + "grad_norm": 4.302642822265625, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8797273635864258, + "num_tokens": 876965744.0, + "step": 22985 + }, + { + "epoch": 2.9240554636814657, + "ewc_loss": 0.008709367364645004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.70936710271053e-05, + "grad_norm": 4.352346897125244, + "learning_rate": 1e-06, + "loss": 0.3187, + 
"mean_token_accuracy": 0.8886603713035583, + "num_tokens": 876997107.0, + "step": 22986 + }, + { + "epoch": 2.924182673960056, + "ewc_loss": 0.008743572048842907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.743572107050568e-05, + "grad_norm": 4.318449020385742, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8739220499992371, + "num_tokens": 877037894.0, + "step": 22987 + }, + { + "epoch": 2.9243098842386464, + "ewc_loss": 0.008714011870324612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.714012074051425e-05, + "grad_norm": 4.318185806274414, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.871825635433197, + "num_tokens": 877078724.0, + "step": 22988 + }, + { + "epoch": 2.924437094517237, + "ewc_loss": 0.008716459386050701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.716459706192836e-05, + "grad_norm": 4.274060249328613, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8820651769638062, + "num_tokens": 877119685.0, + "step": 22989 + }, + { + "epoch": 2.9245643047958274, + "ewc_loss": 0.008701513521373272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.70151343406178e-05, + "grad_norm": 4.345835208892822, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8880218267440796, + "num_tokens": 877157055.0, + "step": 22990 + }, + { + "epoch": 2.924691515074418, + "ewc_loss": 0.00875122006982565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.751219866098836e-05, + "grad_norm": 4.353428840637207, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.888099730014801, + "num_tokens": 877188569.0, + "step": 22991 + }, + { + "epoch": 2.9248187253530085, + "ewc_loss": 0.008727083913981915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.727084059501067e-05, + "grad_norm": 4.31504487991333, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8760500550270081, + "num_tokens": 877222745.0, + "step": 22992 + }, + { + "epoch": 
2.924945935631599, + "ewc_loss": 0.008713539689779282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.713539864402264e-05, + "grad_norm": 4.3121843338012695, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8682088851928711, + "num_tokens": 877262510.0, + "step": 22993 + }, + { + "epoch": 2.9250731459101895, + "ewc_loss": 0.008728018030524254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.728018292458728e-05, + "grad_norm": 4.296204566955566, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8784271478652954, + "num_tokens": 877300164.0, + "step": 22994 + }, + { + "epoch": 2.92520035618878, + "ewc_loss": 0.008721188642084599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.721188351046294e-05, + "grad_norm": 4.312332630157471, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8921156525611877, + "num_tokens": 877337187.0, + "step": 22995 + }, + { + "epoch": 2.9253275664673706, + "ewc_loss": 0.008744937367737293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.744937076698989e-05, + "grad_norm": 4.321860313415527, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.864285945892334, + "num_tokens": 877375902.0, + "step": 22996 + }, + { + "epoch": 2.925454776745961, + "ewc_loss": 0.008739739656448364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.739739860175177e-05, + "grad_norm": 4.3101325035095215, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8818907737731934, + "num_tokens": 877414406.0, + "step": 22997 + }, + { + "epoch": 2.9255819870245516, + "ewc_loss": 0.008704142645001411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.704142237547785e-05, + "grad_norm": 4.28897762298584, + "learning_rate": 1e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.8892416954040527, + "num_tokens": 877448977.0, + "step": 22998 + }, + { + "epoch": 2.925709197303142, + "ewc_loss": 0.00871248822659254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.712488488527015e-05, + "grad_norm": 4.267971038818359, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8903756141662598, + "num_tokens": 877486766.0, + "step": 22999 + }, + { + "epoch": 2.9258364075817327, + "ewc_loss": 0.008687875233590603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.687875379109755e-05, + "grad_norm": 4.364546775817871, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.894707441329956, + "num_tokens": 877519320.0, + "step": 23000 + }, + { + "epoch": 2.925963617860323, + "ewc_loss": 0.008747733198106289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.74773322721012e-05, + "grad_norm": 4.305169105529785, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8606469631195068, + "num_tokens": 877557913.0, + "step": 23001 + }, + { + "epoch": 2.9260908281389137, + "ewc_loss": 0.008670528419315815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.670528040966019e-05, + "grad_norm": 4.371076583862305, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8832188844680786, + "num_tokens": 877591197.0, + "step": 23002 + }, + { + "epoch": 2.9262180384175043, + "ewc_loss": 0.008741163648664951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.741163765080273e-05, + "grad_norm": 4.273857593536377, + "learning_rate": 1e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.8977269530296326, + "num_tokens": 877625018.0, + "step": 23003 + }, + { + "epoch": 2.926345248696095, + "ewc_loss": 0.00864261481910944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.642614557174966e-05, + "grad_norm": 4.28947639465332, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8878430128097534, + "num_tokens": 877662433.0, + "step": 23004 + }, + { + "epoch": 2.9264724589746853, + "ewc_loss": 0.00870499573647976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.704995707375929e-05, + "grad_norm": 4.276846885681152, + "learning_rate": 1e-06, + "loss": 0.3138, + 
"mean_token_accuracy": 0.8925820589065552, + "num_tokens": 877703576.0, + "step": 23005 + }, + { + "epoch": 2.9265996692532754, + "ewc_loss": 0.008663343265652657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.663343032822013e-05, + "grad_norm": 4.38353157043457, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8783125281333923, + "num_tokens": 877734025.0, + "step": 23006 + }, + { + "epoch": 2.9267268795318664, + "ewc_loss": 0.008738769218325615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.738769247429445e-05, + "grad_norm": 4.271389484405518, + "learning_rate": 1e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.890887439250946, + "num_tokens": 877771942.0, + "step": 23007 + }, + { + "epoch": 2.9268540898104565, + "ewc_loss": 0.008625049144029617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625048940302804e-05, + "grad_norm": 4.328915119171143, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8863589763641357, + "num_tokens": 877815662.0, + "step": 23008 + }, + { + "epoch": 2.9269813000890474, + "ewc_loss": 0.008693342097103596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.693341806065291e-05, + "grad_norm": 4.308441162109375, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8691686987876892, + "num_tokens": 877855398.0, + "step": 23009 + }, + { + "epoch": 2.9271085103676375, + "ewc_loss": 0.008649011142551899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.649011579109356e-05, + "grad_norm": 4.261227130889893, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8773419857025146, + "num_tokens": 877898544.0, + "step": 23010 + }, + { + "epoch": 2.927235720646228, + "ewc_loss": 0.008621588349342346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.621588494861498e-05, + "grad_norm": 4.298734664916992, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.881831705570221, + "num_tokens": 877937914.0, + "step": 23011 + }, + { + "epoch": 
2.9273629309248186, + "ewc_loss": 0.008646433241665363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.646432979730889e-05, + "grad_norm": 4.293636322021484, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8684495687484741, + "num_tokens": 877979266.0, + "step": 23012 + }, + { + "epoch": 2.927490141203409, + "ewc_loss": 0.008619824424386024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619824075140059e-05, + "grad_norm": 4.294615268707275, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8871191740036011, + "num_tokens": 878019762.0, + "step": 23013 + }, + { + "epoch": 2.9276173514819996, + "ewc_loss": 0.008600565604865551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600565342931077e-05, + "grad_norm": 4.319060802459717, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8777806758880615, + "num_tokens": 878062174.0, + "step": 23014 + }, + { + "epoch": 2.92774456176059, + "ewc_loss": 0.008622129447758198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622129098512232e-05, + "grad_norm": 4.3621625900268555, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8775904178619385, + "num_tokens": 878098197.0, + "step": 23015 + }, + { + "epoch": 2.9278717720391807, + "ewc_loss": 0.008631487376987934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631487435195595e-05, + "grad_norm": 4.360686779022217, + "learning_rate": 1e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8894022703170776, + "num_tokens": 878130478.0, + "step": 23016 + }, + { + "epoch": 2.9279989823177712, + "ewc_loss": 0.008613563142716885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613563113613054e-05, + "grad_norm": 4.271773338317871, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8652490973472595, + "num_tokens": 878175670.0, + "step": 23017 + }, + { + "epoch": 2.9281261925963618, + "ewc_loss": 0.008563953451812267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.563953451812267e-05, + "grad_norm": 4.307027816772461, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8733335137367249, + "num_tokens": 878216415.0, + "step": 23018 + }, + { + "epoch": 2.9282534028749523, + "ewc_loss": 0.008624796755611897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624797192169353e-05, + "grad_norm": 4.335253715515137, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8826473951339722, + "num_tokens": 878249582.0, + "step": 23019 + }, + { + "epoch": 2.928380613153543, + "ewc_loss": 0.008632793091237545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.632793469587341e-05, + "grad_norm": 4.302079200744629, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8924990892410278, + "num_tokens": 878290859.0, + "step": 23020 + }, + { + "epoch": 2.9285078234321333, + "ewc_loss": 0.008609126321971416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609126234659925e-05, + "grad_norm": 4.292800426483154, + "learning_rate": 1e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8985362648963928, + "num_tokens": 878322463.0, + "step": 23021 + }, + { + "epoch": 2.928635033710724, + "ewc_loss": 0.00860687531530857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.606875780969858e-05, + "grad_norm": 4.2486891746521, + "learning_rate": 1e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8920556902885437, + "num_tokens": 878365754.0, + "step": 23022 + }, + { + "epoch": 2.9287622439893144, + "ewc_loss": 0.00859697163105011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.596971747465432e-05, + "grad_norm": 4.256099700927734, + "learning_rate": 1e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8952410817146301, + "num_tokens": 878410325.0, + "step": 23023 + }, + { + "epoch": 2.928889454267905, + "ewc_loss": 0.008590376004576683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590376091888174e-05, + "grad_norm": 4.273509502410889, + "learning_rate": 1e-06, + "loss": 0.3179, + 
"mean_token_accuracy": 0.8891310691833496, + "num_tokens": 878451604.0, + "step": 23024 + }, + { + "epoch": 2.9290166645464955, + "ewc_loss": 0.008611636236310005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611636440036818e-05, + "grad_norm": 4.358154296875, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8862183094024658, + "num_tokens": 878485174.0, + "step": 23025 + }, + { + "epoch": 2.929143874825086, + "ewc_loss": 0.00864788331091404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647883078083396e-05, + "grad_norm": 4.351663589477539, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8752514123916626, + "num_tokens": 878525867.0, + "step": 23026 + }, + { + "epoch": 2.9292710851036765, + "ewc_loss": 0.008617796003818512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.617795538157225e-05, + "grad_norm": 4.367255687713623, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8702316284179688, + "num_tokens": 878561327.0, + "step": 23027 + }, + { + "epoch": 2.929398295382267, + "ewc_loss": 0.008623936213552952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623936446383595e-05, + "grad_norm": 4.2935686111450195, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8894278407096863, + "num_tokens": 878603045.0, + "step": 23028 + }, + { + "epoch": 2.929525505660857, + "ewc_loss": 0.008589878678321838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589878416387364e-05, + "grad_norm": 4.259524822235107, + "learning_rate": 1e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8887269496917725, + "num_tokens": 878645051.0, + "step": 23029 + }, + { + "epoch": 2.929652715939448, + "ewc_loss": 0.0086011728271842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.601172885391861e-05, + "grad_norm": 4.344003677368164, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8659132122993469, + "num_tokens": 878682003.0, + "step": 23030 + }, + { + "epoch": 
2.929779926218038, + "ewc_loss": 0.008645149879157543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.645150228403509e-05, + "grad_norm": 4.298280715942383, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8881653547286987, + "num_tokens": 878720509.0, + "step": 23031 + }, + { + "epoch": 2.929907136496629, + "ewc_loss": 0.008588217198848724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.588217315264046e-05, + "grad_norm": 4.305160999298096, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8880501985549927, + "num_tokens": 878757950.0, + "step": 23032 + }, + { + "epoch": 2.9300343467752192, + "ewc_loss": 0.008614705875515938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614706166554242e-05, + "grad_norm": 4.3035101890563965, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8767879605293274, + "num_tokens": 878801477.0, + "step": 23033 + }, + { + "epoch": 2.93016155705381, + "ewc_loss": 0.008615520782768726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615521073807031e-05, + "grad_norm": 4.339975357055664, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8745522499084473, + "num_tokens": 878838802.0, + "step": 23034 + }, + { + "epoch": 2.9302887673324003, + "ewc_loss": 0.0086371386423707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.637138671474531e-05, + "grad_norm": 4.362242698669434, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8787041902542114, + "num_tokens": 878875566.0, + "step": 23035 + }, + { + "epoch": 2.930415977610991, + "ewc_loss": 0.008635742589831352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63574241520837e-05, + "grad_norm": 4.3018670082092285, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8767898678779602, + "num_tokens": 878913463.0, + "step": 23036 + }, + { + "epoch": 2.9305431878895813, + "ewc_loss": 0.008593433536589146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.593433449277654e-05, + "grad_norm": 4.318009376525879, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8724439740180969, + "num_tokens": 878950769.0, + "step": 23037 + }, + { + "epoch": 2.930670398168172, + "ewc_loss": 0.008627031929790974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627031638752669e-05, + "grad_norm": 4.295961856842041, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8929425477981567, + "num_tokens": 878989041.0, + "step": 23038 + }, + { + "epoch": 2.9307976084467624, + "ewc_loss": 0.008623064495623112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623064786661416e-05, + "grad_norm": 4.380030632019043, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8896578550338745, + "num_tokens": 879020195.0, + "step": 23039 + }, + { + "epoch": 2.930924818725353, + "ewc_loss": 0.008681395091116428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.681394683662802e-05, + "grad_norm": 4.298761367797852, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8820476531982422, + "num_tokens": 879057818.0, + "step": 23040 + }, + { + "epoch": 2.9310520290039435, + "ewc_loss": 0.008593415841460228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59341598697938e-05, + "grad_norm": 4.334479808807373, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8804081678390503, + "num_tokens": 879094747.0, + "step": 23041 + }, + { + "epoch": 2.931179239282534, + "ewc_loss": 0.008679795078933239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.679794700583443e-05, + "grad_norm": 4.347522735595703, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8754528760910034, + "num_tokens": 879131127.0, + "step": 23042 + }, + { + "epoch": 2.9313064495611245, + "ewc_loss": 0.008660821244120598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.660821185912937e-05, + "grad_norm": 4.291429042816162, + "learning_rate": 1e-06, + "loss": 0.3288, + 
"mean_token_accuracy": 0.8845618963241577, + "num_tokens": 879168541.0, + "step": 23043 + }, + { + "epoch": 2.931433659839715, + "ewc_loss": 0.008634073659777641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.634074038127437e-05, + "grad_norm": 4.386871337890625, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8799365758895874, + "num_tokens": 879199409.0, + "step": 23044 + }, + { + "epoch": 2.9315608701183056, + "ewc_loss": 0.008728723041713238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.728723332751542e-05, + "grad_norm": 4.286037921905518, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8949282765388489, + "num_tokens": 879236447.0, + "step": 23045 + }, + { + "epoch": 2.931688080396896, + "ewc_loss": 0.008613407611846924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61340740812011e-05, + "grad_norm": 4.277595520019531, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8829885721206665, + "num_tokens": 879275596.0, + "step": 23046 + }, + { + "epoch": 2.9318152906754866, + "ewc_loss": 0.008664368651807308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.664368215249851e-05, + "grad_norm": 4.309885501861572, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8817611932754517, + "num_tokens": 879315620.0, + "step": 23047 + }, + { + "epoch": 2.931942500954077, + "ewc_loss": 0.008674552664160728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.674552373122424e-05, + "grad_norm": 4.298508644104004, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8856860399246216, + "num_tokens": 879356220.0, + "step": 23048 + }, + { + "epoch": 2.9320697112326677, + "ewc_loss": 0.008667557500302792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667557267472148e-05, + "grad_norm": 4.335854530334473, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8686795234680176, + "num_tokens": 879397544.0, + "step": 23049 + }, + { + "epoch": 
2.932196921511258, + "ewc_loss": 0.008658889681100845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.658890146762133e-05, + "grad_norm": 4.391563892364502, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8925343751907349, + "num_tokens": 879438165.0, + "step": 23050 + }, + { + "epoch": 2.9323241317898487, + "ewc_loss": 0.008704418316483498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.704417996341363e-05, + "grad_norm": 4.326305866241455, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8819100260734558, + "num_tokens": 879473601.0, + "step": 23051 + }, + { + "epoch": 2.9324513420684393, + "ewc_loss": 0.008629603311419487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.629602962173522e-05, + "grad_norm": 4.28640079498291, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8741059303283691, + "num_tokens": 879510998.0, + "step": 23052 + }, + { + "epoch": 2.93257855234703, + "ewc_loss": 0.008660126477479935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.660126331960782e-05, + "grad_norm": 4.282967567443848, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8863328695297241, + "num_tokens": 879551469.0, + "step": 23053 + }, + { + "epoch": 2.93270576262562, + "ewc_loss": 0.008667461574077606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667461952427402e-05, + "grad_norm": 4.368039608001709, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8763710260391235, + "num_tokens": 879585620.0, + "step": 23054 + }, + { + "epoch": 2.932832972904211, + "ewc_loss": 0.008700334466993809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.700334728928283e-05, + "grad_norm": 4.299987316131592, + "learning_rate": 1e-06, + "loss": 0.304, + "mean_token_accuracy": 0.8924283981323242, + "num_tokens": 879621749.0, + "step": 23055 + }, + { + "epoch": 2.932960183182801, + "ewc_loss": 0.00865749642252922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.657496800879017e-05, 
+ "grad_norm": 4.284694194793701, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8859132528305054, + "num_tokens": 879664529.0, + "step": 23056 + }, + { + "epoch": 2.933087393461392, + "ewc_loss": 0.008674586191773415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.674585842527449e-05, + "grad_norm": 4.321400165557861, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8883653283119202, + "num_tokens": 879696293.0, + "step": 23057 + }, + { + "epoch": 2.933214603739982, + "ewc_loss": 0.008702466264367104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.702466584509239e-05, + "grad_norm": 4.360097885131836, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8885154128074646, + "num_tokens": 879733562.0, + "step": 23058 + }, + { + "epoch": 2.933341814018573, + "ewc_loss": 0.008703465573489666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.703465573489666e-05, + "grad_norm": 4.281139850616455, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8817599415779114, + "num_tokens": 879771740.0, + "step": 23059 + }, + { + "epoch": 2.933469024297163, + "ewc_loss": 0.008648604154586792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.648604125482962e-05, + "grad_norm": 4.472427845001221, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8800152540206909, + "num_tokens": 879805578.0, + "step": 23060 + }, + { + "epoch": 2.9335962345757536, + "ewc_loss": 0.008808583952486515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.80858424352482e-05, + "grad_norm": 4.281662940979004, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8805948495864868, + "num_tokens": 879845175.0, + "step": 23061 + }, + { + "epoch": 2.933723444854344, + "ewc_loss": 0.008622070774435997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622070890851319e-05, + "grad_norm": 4.279323101043701, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 
0.8943032026290894, + "num_tokens": 879878975.0, + "step": 23062 + }, + { + "epoch": 2.9338506551329346, + "ewc_loss": 0.008681250736117363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.681250619702041e-05, + "grad_norm": 4.3147149085998535, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8761820197105408, + "num_tokens": 879919373.0, + "step": 23063 + }, + { + "epoch": 2.933977865411525, + "ewc_loss": 0.008698584511876106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.698584861122072e-05, + "grad_norm": 4.352457523345947, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8650710582733154, + "num_tokens": 879953821.0, + "step": 23064 + }, + { + "epoch": 2.9341050756901157, + "ewc_loss": 0.008719981648027897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.719981997273862e-05, + "grad_norm": 4.33596658706665, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8747844099998474, + "num_tokens": 879993505.0, + "step": 23065 + }, + { + "epoch": 2.934232285968706, + "ewc_loss": 0.008681694976985455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.681695180712268e-05, + "grad_norm": 4.276557445526123, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8892022967338562, + "num_tokens": 880030124.0, + "step": 23066 + }, + { + "epoch": 2.9343594962472968, + "ewc_loss": 0.008682589046657085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.682589395903051e-05, + "grad_norm": 4.471193790435791, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8650995492935181, + "num_tokens": 880059480.0, + "step": 23067 + }, + { + "epoch": 2.9344867065258873, + "ewc_loss": 0.008809574879705906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.80957450135611e-05, + "grad_norm": 4.3325347900390625, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8831249475479126, + "num_tokens": 880093402.0, + "step": 23068 + }, + { + "epoch": 2.934613916804478, + "ewc_loss": 
0.008666466921567917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.666467329021543e-05, + "grad_norm": 4.304896831512451, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8828126192092896, + "num_tokens": 880127874.0, + "step": 23069 + }, + { + "epoch": 2.9347411270830683, + "ewc_loss": 0.0087375333532691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.737533789826557e-05, + "grad_norm": 4.266692161560059, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8914324045181274, + "num_tokens": 880165755.0, + "step": 23070 + }, + { + "epoch": 2.934868337361659, + "ewc_loss": 0.008730716072022915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.730716217542067e-05, + "grad_norm": 4.319838523864746, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.876152515411377, + "num_tokens": 880203931.0, + "step": 23071 + }, + { + "epoch": 2.9349955476402494, + "ewc_loss": 0.00875491090118885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.754910959396511e-05, + "grad_norm": 4.277467250823975, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8856818079948425, + "num_tokens": 880242844.0, + "step": 23072 + }, + { + "epoch": 2.93512275791884, + "ewc_loss": 0.008720846846699715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.720847108634189e-05, + "grad_norm": 4.323432922363281, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8694890141487122, + "num_tokens": 880280068.0, + "step": 23073 + }, + { + "epoch": 2.9352499681974304, + "ewc_loss": 0.00876405369490385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.764053927734494e-05, + "grad_norm": 4.39616060256958, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8892722129821777, + "num_tokens": 880314044.0, + "step": 23074 + }, + { + "epoch": 2.935377178476021, + "ewc_loss": 0.00878424197435379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.784241799730808e-05, + "grad_norm": 4.292737007141113, + 
"learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8793807029724121, + "num_tokens": 880355077.0, + "step": 23075 + }, + { + "epoch": 2.9355043887546115, + "ewc_loss": 0.008711096830666065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.711096597835422e-05, + "grad_norm": 4.297140121459961, + "learning_rate": 1e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.8968880772590637, + "num_tokens": 880388906.0, + "step": 23076 + }, + { + "epoch": 2.935631599033202, + "ewc_loss": 0.008755307644605637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.755307499086484e-05, + "grad_norm": 4.296335697174072, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8919116854667664, + "num_tokens": 880426834.0, + "step": 23077 + }, + { + "epoch": 2.9357588093117926, + "ewc_loss": 0.008733387105166912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.733387221582234e-05, + "grad_norm": 4.322397708892822, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8795628547668457, + "num_tokens": 880463477.0, + "step": 23078 + }, + { + "epoch": 2.9358860195903826, + "ewc_loss": 0.008749810978770256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.749810513108969e-05, + "grad_norm": 4.3971381187438965, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8667863011360168, + "num_tokens": 880500884.0, + "step": 23079 + }, + { + "epoch": 2.9360132298689736, + "ewc_loss": 0.008776701986789703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.776701724855229e-05, + "grad_norm": 4.298762321472168, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8754217624664307, + "num_tokens": 880539724.0, + "step": 23080 + }, + { + "epoch": 2.9361404401475637, + "ewc_loss": 0.008701604790985584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.701605111127719e-05, + "grad_norm": 4.2966814041137695, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8750321865081787, + "num_tokens": 880577820.0, 
+ "step": 23081 + }, + { + "epoch": 2.9362676504261547, + "ewc_loss": 0.008740197867155075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.74019751790911e-05, + "grad_norm": 4.2967963218688965, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8851659297943115, + "num_tokens": 880612055.0, + "step": 23082 + }, + { + "epoch": 2.9363948607047448, + "ewc_loss": 0.008733021095395088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.73302124091424e-05, + "grad_norm": 4.27449893951416, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8910996913909912, + "num_tokens": 880654907.0, + "step": 23083 + }, + { + "epoch": 2.9365220709833357, + "ewc_loss": 0.008707390166819096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.707390225026757e-05, + "grad_norm": 4.265639781951904, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8981016874313354, + "num_tokens": 880692418.0, + "step": 23084 + }, + { + "epoch": 2.936649281261926, + "ewc_loss": 0.008687644265592098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.687644003657624e-05, + "grad_norm": 4.3849310874938965, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.870004415512085, + "num_tokens": 880727742.0, + "step": 23085 + }, + { + "epoch": 2.9367764915405163, + "ewc_loss": 0.008752964437007904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.752964640734717e-05, + "grad_norm": 4.267786502838135, + "learning_rate": 1e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8934579491615295, + "num_tokens": 880766673.0, + "step": 23086 + }, + { + "epoch": 2.936903701819107, + "ewc_loss": 0.008657622151076794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.65762194734998e-05, + "grad_norm": 4.321172714233398, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8772665858268738, + "num_tokens": 880810390.0, + "step": 23087 + }, + { + "epoch": 2.9370309120976974, + "ewc_loss": 0.008709041401743889, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.709041867405176e-05, + "grad_norm": 4.367068290710449, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8769633173942566, + "num_tokens": 880846920.0, + "step": 23088 + }, + { + "epoch": 2.937158122376288, + "ewc_loss": 0.008699890226125717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.699890167918056e-05, + "grad_norm": 4.331483840942383, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8904979228973389, + "num_tokens": 880885867.0, + "step": 23089 + }, + { + "epoch": 2.9372853326548785, + "ewc_loss": 0.008658829145133495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.658829028718174e-05, + "grad_norm": 4.288826942443848, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8934378027915955, + "num_tokens": 880928408.0, + "step": 23090 + }, + { + "epoch": 2.937412542933469, + "ewc_loss": 0.008644725196063519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64472531247884e-05, + "grad_norm": 4.373953342437744, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8834398984909058, + "num_tokens": 880960681.0, + "step": 23091 + }, + { + "epoch": 2.9375397532120595, + "ewc_loss": 0.008708066307008266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.708066161489114e-05, + "grad_norm": 4.299160957336426, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8701702356338501, + "num_tokens": 881000358.0, + "step": 23092 + }, + { + "epoch": 2.93766696349065, + "ewc_loss": 0.008611295372247696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611295197624713e-05, + "grad_norm": 4.336699962615967, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8856810331344604, + "num_tokens": 881037765.0, + "step": 23093 + }, + { + "epoch": 2.9377941737692406, + "ewc_loss": 0.008654946461319923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.654946577735245e-05, + "grad_norm": 4.320331573486328, + "learning_rate": 1e-06, + "loss": 0.2988, 
+ "mean_token_accuracy": 0.8962618708610535, + "num_tokens": 881076091.0, + "step": 23094 + }, + { + "epoch": 2.937921384047831, + "ewc_loss": 0.008634035475552082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.634035475552082e-05, + "grad_norm": 4.317763805389404, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.880714476108551, + "num_tokens": 881112784.0, + "step": 23095 + }, + { + "epoch": 2.9380485943264216, + "ewc_loss": 0.008628259412944317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.628259820397943e-05, + "grad_norm": 4.3314104080200195, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8898279666900635, + "num_tokens": 881149938.0, + "step": 23096 + }, + { + "epoch": 2.938175804605012, + "ewc_loss": 0.008653756231069565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.653756231069565e-05, + "grad_norm": 4.301121234893799, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8747161626815796, + "num_tokens": 881187541.0, + "step": 23097 + }, + { + "epoch": 2.9383030148836027, + "ewc_loss": 0.008622792549431324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622792665846646e-05, + "grad_norm": 4.364391803741455, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8847855925559998, + "num_tokens": 881221920.0, + "step": 23098 + }, + { + "epoch": 2.938430225162193, + "ewc_loss": 0.008654038421809673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.654038538224995e-05, + "grad_norm": 4.33641242980957, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8685787916183472, + "num_tokens": 881264804.0, + "step": 23099 + }, + { + "epoch": 2.9385574354407837, + "ewc_loss": 0.008643196895718575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.643197361379862e-05, + "grad_norm": 4.344560146331787, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8743559122085571, + "num_tokens": 881304482.0, + "step": 23100 + }, + { + "epoch": 
2.9386846457193743, + "ewc_loss": 0.008627176284790039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627176430309191e-05, + "grad_norm": 4.348348140716553, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8730902671813965, + "num_tokens": 881340822.0, + "step": 23101 + }, + { + "epoch": 2.938811855997965, + "ewc_loss": 0.00863515492528677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.635155245428905e-05, + "grad_norm": 4.363790512084961, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8809729218482971, + "num_tokens": 881374352.0, + "step": 23102 + }, + { + "epoch": 2.9389390662765553, + "ewc_loss": 0.008653797209262848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.653796976432204e-05, + "grad_norm": 4.328214645385742, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8790201544761658, + "num_tokens": 881413176.0, + "step": 23103 + }, + { + "epoch": 2.9390662765551454, + "ewc_loss": 0.008623935282230377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623934991192073e-05, + "grad_norm": 4.253896236419678, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.878913164138794, + "num_tokens": 881454309.0, + "step": 23104 + }, + { + "epoch": 2.9391934868337364, + "ewc_loss": 0.008611183613538742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611183875473216e-05, + "grad_norm": 4.311272144317627, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8838013410568237, + "num_tokens": 881490702.0, + "step": 23105 + }, + { + "epoch": 2.9393206971123265, + "ewc_loss": 0.00866614654660225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.666146459290758e-05, + "grad_norm": 4.265473365783691, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8880974054336548, + "num_tokens": 881530394.0, + "step": 23106 + }, + { + "epoch": 2.9394479073909174, + "ewc_loss": 0.008618342690169811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.618342690169811e-05, + "grad_norm": 4.295088768005371, + "learning_rate": 1e-06, + "loss": 0.2959, + "mean_token_accuracy": 0.8955061435699463, + "num_tokens": 881567286.0, + "step": 23107 + }, + { + "epoch": 2.9395751176695075, + "ewc_loss": 0.008653681725263596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6536820163019e-05, + "grad_norm": 4.280158519744873, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8775947690010071, + "num_tokens": 881607017.0, + "step": 23108 + }, + { + "epoch": 2.939702327948098, + "ewc_loss": 0.008631638251245022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631638047518209e-05, + "grad_norm": 4.298361301422119, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8832881450653076, + "num_tokens": 881645793.0, + "step": 23109 + }, + { + "epoch": 2.9398295382266886, + "ewc_loss": 0.008652860298752785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.652859833091497e-05, + "grad_norm": 4.301624298095703, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8846343755722046, + "num_tokens": 881683596.0, + "step": 23110 + }, + { + "epoch": 2.939956748505279, + "ewc_loss": 0.008650565519928932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.650565723655745e-05, + "grad_norm": 4.407016277313232, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8690586090087891, + "num_tokens": 881717607.0, + "step": 23111 + }, + { + "epoch": 2.9400839587838696, + "ewc_loss": 0.008720929734408855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.72093005455099e-05, + "grad_norm": 4.333010673522949, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8937125205993652, + "num_tokens": 881750884.0, + "step": 23112 + }, + { + "epoch": 2.94021116906246, + "ewc_loss": 0.008646768517792225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.646768401376903e-05, + "grad_norm": 4.33946418762207, + "learning_rate": 1e-06, + "loss": 0.3553, + 
"mean_token_accuracy": 0.8788905739784241, + "num_tokens": 881789345.0, + "step": 23113 + }, + { + "epoch": 2.9403383793410507, + "ewc_loss": 0.008670512586832047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.670512761455029e-05, + "grad_norm": 4.295350551605225, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8798695802688599, + "num_tokens": 881828459.0, + "step": 23114 + }, + { + "epoch": 2.940465589619641, + "ewc_loss": 0.008660830557346344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.660830644657835e-05, + "grad_norm": 4.385564804077148, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8776025772094727, + "num_tokens": 881865294.0, + "step": 23115 + }, + { + "epoch": 2.9405927998982317, + "ewc_loss": 0.008699646219611168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.699646423337981e-05, + "grad_norm": 4.284602642059326, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8846020102500916, + "num_tokens": 881904936.0, + "step": 23116 + }, + { + "epoch": 2.9407200101768223, + "ewc_loss": 0.008608645759522915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.608646021457389e-05, + "grad_norm": 4.335892200469971, + "learning_rate": 1e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.8927715420722961, + "num_tokens": 881936300.0, + "step": 23117 + }, + { + "epoch": 2.940847220455413, + "ewc_loss": 0.008679602295160294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.679601887706667e-05, + "grad_norm": 4.2644805908203125, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8937284350395203, + "num_tokens": 881974092.0, + "step": 23118 + }, + { + "epoch": 2.9409744307340033, + "ewc_loss": 0.008596977218985558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.596977568231523e-05, + "grad_norm": 4.261228084564209, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8843773603439331, + "num_tokens": 882019339.0, + "step": 23119 + }, + { + "epoch": 
2.941101641012594, + "ewc_loss": 0.008613920770585537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613921090727672e-05, + "grad_norm": 4.3081278800964355, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8873351812362671, + "num_tokens": 882052837.0, + "step": 23120 + }, + { + "epoch": 2.9412288512911844, + "ewc_loss": 0.00865998212248087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.659982268000022e-05, + "grad_norm": 4.315984725952148, + "learning_rate": 1e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8878138065338135, + "num_tokens": 882088449.0, + "step": 23121 + }, + { + "epoch": 2.941356061569775, + "ewc_loss": 0.008625738322734833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625738701084629e-05, + "grad_norm": 4.32605504989624, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8735514879226685, + "num_tokens": 882124844.0, + "step": 23122 + }, + { + "epoch": 2.9414832718483654, + "ewc_loss": 0.008626879192888737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.626878843642771e-05, + "grad_norm": 4.2928667068481445, + "learning_rate": 1e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.8858731389045715, + "num_tokens": 882162246.0, + "step": 23123 + }, + { + "epoch": 2.941610482126956, + "ewc_loss": 0.008640162646770477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.640162559458986e-05, + "grad_norm": 4.3264970779418945, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.871680498123169, + "num_tokens": 882202825.0, + "step": 23124 + }, + { + "epoch": 2.9417376924055465, + "ewc_loss": 0.00864516943693161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.645169145893306e-05, + "grad_norm": 4.347681522369385, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8681433200836182, + "num_tokens": 882237951.0, + "step": 23125 + }, + { + "epoch": 2.941864902684137, + "ewc_loss": 0.008658806793391705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.658807200845331e-05, + "grad_norm": 4.259616374969482, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8822668790817261, + "num_tokens": 882279147.0, + "step": 23126 + }, + { + "epoch": 2.941992112962727, + "ewc_loss": 0.008619772270321846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619772415840998e-05, + "grad_norm": 4.235756874084473, + "learning_rate": 1e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.9013606905937195, + "num_tokens": 882322777.0, + "step": 23127 + }, + { + "epoch": 2.942119323241318, + "ewc_loss": 0.008639466017484665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.639466250315309e-05, + "grad_norm": 4.324035167694092, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8879343271255493, + "num_tokens": 882360681.0, + "step": 23128 + }, + { + "epoch": 2.942246533519908, + "ewc_loss": 0.008684760890901089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.684761269250885e-05, + "grad_norm": 4.281294345855713, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8633524179458618, + "num_tokens": 882403823.0, + "step": 23129 + }, + { + "epoch": 2.942373743798499, + "ewc_loss": 0.008639879524707794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.639879524707794e-05, + "grad_norm": 4.295245170593262, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8896627426147461, + "num_tokens": 882443723.0, + "step": 23130 + }, + { + "epoch": 2.9425009540770892, + "ewc_loss": 0.008634191937744617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.634191908640787e-05, + "grad_norm": 4.35211181640625, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8693070411682129, + "num_tokens": 882480610.0, + "step": 23131 + }, + { + "epoch": 2.94262816435568, + "ewc_loss": 0.008663739077746868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.663738844916224e-05, + "grad_norm": 4.325828552246094, + "learning_rate": 1e-06, + "loss": 0.3321, + 
"mean_token_accuracy": 0.8847185969352722, + "num_tokens": 882518789.0, + "step": 23132 + }, + { + "epoch": 2.9427553746342703, + "ewc_loss": 0.008636406622827053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636406710138544e-05, + "grad_norm": 4.328120708465576, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8721190690994263, + "num_tokens": 882556598.0, + "step": 23133 + }, + { + "epoch": 2.942882584912861, + "ewc_loss": 0.00863021519035101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.630214870208874e-05, + "grad_norm": 4.299393653869629, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8800011873245239, + "num_tokens": 882593788.0, + "step": 23134 + }, + { + "epoch": 2.9430097951914513, + "ewc_loss": 0.008631113916635513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631114178569987e-05, + "grad_norm": 4.313747406005859, + "learning_rate": 1e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8895233869552612, + "num_tokens": 882630861.0, + "step": 23135 + }, + { + "epoch": 2.943137005470042, + "ewc_loss": 0.008644451387226582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.644451008876786e-05, + "grad_norm": 4.26839017868042, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8891486525535583, + "num_tokens": 882670950.0, + "step": 23136 + }, + { + "epoch": 2.9432642157486324, + "ewc_loss": 0.008602292276918888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602292655268684e-05, + "grad_norm": 4.331733226776123, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8729043006896973, + "num_tokens": 882705680.0, + "step": 23137 + }, + { + "epoch": 2.943391426027223, + "ewc_loss": 0.008669929578900337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.669929957250133e-05, + "grad_norm": 4.326690196990967, + "learning_rate": 1e-06, + "loss": 0.3121, + "mean_token_accuracy": 0.8910045623779297, + "num_tokens": 882737589.0, + "step": 23138 + }, + { + "epoch": 
2.9435186363058135, + "ewc_loss": 0.008640959858894348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6409600044135e-05, + "grad_norm": 4.329568386077881, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8687549233436584, + "num_tokens": 882774502.0, + "step": 23139 + }, + { + "epoch": 2.943645846584404, + "ewc_loss": 0.008663120679557323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.66312111611478e-05, + "grad_norm": 4.280437469482422, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8721929788589478, + "num_tokens": 882815184.0, + "step": 23140 + }, + { + "epoch": 2.9437730568629945, + "ewc_loss": 0.008647850714623928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647850336274132e-05, + "grad_norm": 4.265954494476318, + "learning_rate": 1e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8959786891937256, + "num_tokens": 882856793.0, + "step": 23141 + }, + { + "epoch": 2.943900267141585, + "ewc_loss": 0.00864403322339058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.644033368909732e-05, + "grad_norm": 4.29127836227417, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8932483792304993, + "num_tokens": 882896128.0, + "step": 23142 + }, + { + "epoch": 2.9440274774201756, + "ewc_loss": 0.008669806644320488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.669806265970692e-05, + "grad_norm": 4.341943740844727, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8899328112602234, + "num_tokens": 882931253.0, + "step": 23143 + }, + { + "epoch": 2.944154687698766, + "ewc_loss": 0.008675104938447475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.675105345901102e-05, + "grad_norm": 4.224366664886475, + "learning_rate": 1e-06, + "loss": 0.2919, + "mean_token_accuracy": 0.8979412317276001, + "num_tokens": 882975072.0, + "step": 23144 + }, + { + "epoch": 2.9442818979773566, + "ewc_loss": 0.008599824272096157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.599823922850192e-05, + "grad_norm": 4.306521892547607, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8937327861785889, + "num_tokens": 883013576.0, + "step": 23145 + }, + { + "epoch": 2.944409108255947, + "ewc_loss": 0.008685735985636711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.685736247571185e-05, + "grad_norm": 4.317070484161377, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.884926974773407, + "num_tokens": 883046052.0, + "step": 23146 + }, + { + "epoch": 2.9445363185345377, + "ewc_loss": 0.008656376041471958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.656376303406432e-05, + "grad_norm": 4.393814563751221, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8654646873474121, + "num_tokens": 883080398.0, + "step": 23147 + }, + { + "epoch": 2.944663528813128, + "ewc_loss": 0.008685525506734848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.685525244800374e-05, + "grad_norm": 4.2807936668396, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8753137588500977, + "num_tokens": 883120321.0, + "step": 23148 + }, + { + "epoch": 2.9447907390917187, + "ewc_loss": 0.008599997498095036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59999781823717e-05, + "grad_norm": 4.278262615203857, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8887879252433777, + "num_tokens": 883161285.0, + "step": 23149 + }, + { + "epoch": 2.9449179493703093, + "ewc_loss": 0.008671574294567108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.671574323670939e-05, + "grad_norm": 4.322764873504639, + "learning_rate": 1e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8813315033912659, + "num_tokens": 883199857.0, + "step": 23150 + }, + { + "epoch": 2.9450451596489, + "ewc_loss": 0.008655710145831108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.655709825688973e-05, + "grad_norm": 4.294931888580322, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 
0.8896079659461975, + "num_tokens": 883238325.0, + "step": 23151 + }, + { + "epoch": 2.94517236992749, + "ewc_loss": 0.008641085587441921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.641085878480226e-05, + "grad_norm": 4.29182767868042, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8755703568458557, + "num_tokens": 883281814.0, + "step": 23152 + }, + { + "epoch": 2.945299580206081, + "ewc_loss": 0.008638846687972546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.638847066322342e-05, + "grad_norm": 4.336066722869873, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8873854875564575, + "num_tokens": 883317385.0, + "step": 23153 + }, + { + "epoch": 2.945426790484671, + "ewc_loss": 0.008661441504955292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.661441825097427e-05, + "grad_norm": 4.287382125854492, + "learning_rate": 1e-06, + "loss": 0.312, + "mean_token_accuracy": 0.8898628950119019, + "num_tokens": 883353498.0, + "step": 23154 + }, + { + "epoch": 2.945554000763262, + "ewc_loss": 0.00861579179763794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615791739430279e-05, + "grad_norm": 4.313683986663818, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8905224800109863, + "num_tokens": 883391709.0, + "step": 23155 + }, + { + "epoch": 2.945681211041852, + "ewc_loss": 0.008647770620882511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647771028336138e-05, + "grad_norm": 4.374837875366211, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8680674433708191, + "num_tokens": 883426329.0, + "step": 23156 + }, + { + "epoch": 2.945808421320443, + "ewc_loss": 0.008667320013046265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667320071253926e-05, + "grad_norm": 4.332297325134277, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8815901279449463, + "num_tokens": 883465797.0, + "step": 23157 + }, + { + "epoch": 2.945935631599033, + "ewc_loss": 
0.008632106706500053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.632106619188562e-05, + "grad_norm": 4.3359808921813965, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8866086006164551, + "num_tokens": 883501632.0, + "step": 23158 + }, + { + "epoch": 2.9460628418776236, + "ewc_loss": 0.008660872466862202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.660872117616236e-05, + "grad_norm": 4.32557487487793, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8933813571929932, + "num_tokens": 883537622.0, + "step": 23159 + }, + { + "epoch": 2.946190052156214, + "ewc_loss": 0.008646443486213684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64644389366731e-05, + "grad_norm": 4.301607131958008, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.883908212184906, + "num_tokens": 883576458.0, + "step": 23160 + }, + { + "epoch": 2.9463172624348046, + "ewc_loss": 0.008634842000901699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.634842379251495e-05, + "grad_norm": 4.277446746826172, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8757227063179016, + "num_tokens": 883620929.0, + "step": 23161 + }, + { + "epoch": 2.946444472713395, + "ewc_loss": 0.008639348670840263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.639348379801959e-05, + "grad_norm": 4.415283679962158, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8615310788154602, + "num_tokens": 883652589.0, + "step": 23162 + }, + { + "epoch": 2.9465716829919857, + "ewc_loss": 0.008721469901502132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.721469930605963e-05, + "grad_norm": 4.270358085632324, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8824983835220337, + "num_tokens": 883691844.0, + "step": 23163 + }, + { + "epoch": 2.946698893270576, + "ewc_loss": 0.008589881472289562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58988132677041e-05, + "grad_norm": 
4.320284843444824, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8650176525115967, + "num_tokens": 883732838.0, + "step": 23164 + }, + { + "epoch": 2.9468261035491667, + "ewc_loss": 0.008699445985257626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.699446334503591e-05, + "grad_norm": 4.281974792480469, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8806217312812805, + "num_tokens": 883771415.0, + "step": 23165 + }, + { + "epoch": 2.9469533138277573, + "ewc_loss": 0.008651958778500557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6519583419431e-05, + "grad_norm": 4.357356548309326, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8708193898200989, + "num_tokens": 883808807.0, + "step": 23166 + }, + { + "epoch": 2.947080524106348, + "ewc_loss": 0.008695824071764946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.695823635207489e-05, + "grad_norm": 4.270177364349365, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8964075446128845, + "num_tokens": 883844300.0, + "step": 23167 + }, + { + "epoch": 2.9472077343849383, + "ewc_loss": 0.008657120168209076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.657119906274602e-05, + "grad_norm": 4.322465896606445, + "learning_rate": 1e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.896558403968811, + "num_tokens": 883879337.0, + "step": 23168 + }, + { + "epoch": 2.947334944663529, + "ewc_loss": 0.008704249747097492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.704249921720475e-05, + "grad_norm": 4.278257846832275, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8744603395462036, + "num_tokens": 883925810.0, + "step": 23169 + }, + { + "epoch": 2.9474621549421194, + "ewc_loss": 0.008651942946016788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.65194306243211e-05, + "grad_norm": 4.293102741241455, + "learning_rate": 1e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8917360305786133, + 
"num_tokens": 883964753.0, + "step": 23170 + }, + { + "epoch": 2.94758936522071, + "ewc_loss": 0.008683894760906696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.683894702699035e-05, + "grad_norm": 4.304981231689453, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8760513067245483, + "num_tokens": 884005737.0, + "step": 23171 + }, + { + "epoch": 2.9477165754993004, + "ewc_loss": 0.008686576969921589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.686577348271385e-05, + "grad_norm": 4.3429412841796875, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8839167356491089, + "num_tokens": 884043414.0, + "step": 23172 + }, + { + "epoch": 2.947843785777891, + "ewc_loss": 0.008691743016242981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.691742550581694e-05, + "grad_norm": 4.351665496826172, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8878362774848938, + "num_tokens": 884077399.0, + "step": 23173 + }, + { + "epoch": 2.9479709960564815, + "ewc_loss": 0.008667457848787308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667457586852834e-05, + "grad_norm": 4.263127326965332, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8826658725738525, + "num_tokens": 884116539.0, + "step": 23174 + }, + { + "epoch": 2.948098206335072, + "ewc_loss": 0.008646911941468716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.646911737741902e-05, + "grad_norm": 4.380027770996094, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8869245052337646, + "num_tokens": 884149756.0, + "step": 23175 + }, + { + "epoch": 2.9482254166136626, + "ewc_loss": 0.008726116269826889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.72611635713838e-05, + "grad_norm": 4.341686248779297, + "learning_rate": 1e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.9013787508010864, + "num_tokens": 884181960.0, + "step": 23176 + }, + { + "epoch": 2.9483526268922526, + "ewc_loss": 0.00864981859922409, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.649818482808769e-05, + "grad_norm": 4.297929763793945, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.878743588924408, + "num_tokens": 884223562.0, + "step": 23177 + }, + { + "epoch": 2.9484798371708436, + "ewc_loss": 0.00864603091031313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.646030619274825e-05, + "grad_norm": 4.287123680114746, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8903000950813293, + "num_tokens": 884264541.0, + "step": 23178 + }, + { + "epoch": 2.9486070474494337, + "ewc_loss": 0.008663001470267773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.663001790409908e-05, + "grad_norm": 4.415743350982666, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.868674635887146, + "num_tokens": 884301171.0, + "step": 23179 + }, + { + "epoch": 2.9487342577280247, + "ewc_loss": 0.00872106198221445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.721061749383807e-05, + "grad_norm": 4.359848499298096, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8811535835266113, + "num_tokens": 884332481.0, + "step": 23180 + }, + { + "epoch": 2.9488614680066147, + "ewc_loss": 0.008640670217573643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.640670421300456e-05, + "grad_norm": 4.2480363845825195, + "learning_rate": 1e-06, + "loss": 0.2819, + "mean_token_accuracy": 0.9016375541687012, + "num_tokens": 884371616.0, + "step": 23181 + }, + { + "epoch": 2.9489886782852053, + "ewc_loss": 0.008641683496534824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64168323460035e-05, + "grad_norm": 4.330695152282715, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8721085786819458, + "num_tokens": 884411245.0, + "step": 23182 + }, + { + "epoch": 2.949115888563796, + "ewc_loss": 0.008702429942786694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.702430204721168e-05, + "grad_norm": 4.329446315765381, + "learning_rate": 
1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8818190097808838, + "num_tokens": 884445712.0, + "step": 23183 + }, + { + "epoch": 2.9492430988423863, + "ewc_loss": 0.00867714174091816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.677141886437312e-05, + "grad_norm": 4.2785797119140625, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8848453760147095, + "num_tokens": 884484935.0, + "step": 23184 + }, + { + "epoch": 2.949370309120977, + "ewc_loss": 0.00863648485392332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636485290480778e-05, + "grad_norm": 4.303439140319824, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8839683532714844, + "num_tokens": 884518714.0, + "step": 23185 + }, + { + "epoch": 2.9494975193995674, + "ewc_loss": 0.008696426637470722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.696426812093705e-05, + "grad_norm": 4.309164047241211, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8747255802154541, + "num_tokens": 884560430.0, + "step": 23186 + }, + { + "epoch": 2.949624729678158, + "ewc_loss": 0.008654508739709854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.654508565086871e-05, + "grad_norm": 4.346202373504639, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8764940500259399, + "num_tokens": 884595937.0, + "step": 23187 + }, + { + "epoch": 2.9497519399567484, + "ewc_loss": 0.008684703148901463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.684703061589971e-05, + "grad_norm": 4.329954624176025, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.87836754322052, + "num_tokens": 884633201.0, + "step": 23188 + }, + { + "epoch": 2.949879150235339, + "ewc_loss": 0.008667136542499065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667136717122048e-05, + "grad_norm": 4.276325225830078, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.884231686592102, + "num_tokens": 884672602.0, + "step": 23189 + }, + 
{ + "epoch": 2.9500063605139295, + "ewc_loss": 0.008658956736326218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.658957085572183e-05, + "grad_norm": 4.341801166534424, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8862231969833374, + "num_tokens": 884708378.0, + "step": 23190 + }, + { + "epoch": 2.95013357079252, + "ewc_loss": 0.00871961284428835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.719613106222823e-05, + "grad_norm": 4.300805568695068, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8806668519973755, + "num_tokens": 884747527.0, + "step": 23191 + }, + { + "epoch": 2.9502607810711106, + "ewc_loss": 0.008648950606584549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.648950461065397e-05, + "grad_norm": 4.293857097625732, + "learning_rate": 1e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.8979712724685669, + "num_tokens": 884781738.0, + "step": 23192 + }, + { + "epoch": 2.950387991349701, + "ewc_loss": 0.008683617226779461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.683617488713935e-05, + "grad_norm": 4.371513366699219, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8709295392036438, + "num_tokens": 884818884.0, + "step": 23193 + }, + { + "epoch": 2.9505152016282916, + "ewc_loss": 0.008731207810342312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.731208072276786e-05, + "grad_norm": 4.359463214874268, + "learning_rate": 1e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.8925080299377441, + "num_tokens": 884851751.0, + "step": 23194 + }, + { + "epoch": 2.950642411906882, + "ewc_loss": 0.008696782402694225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.696782606421039e-05, + "grad_norm": 4.297107696533203, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8871824741363525, + "num_tokens": 884892935.0, + "step": 23195 + }, + { + "epoch": 2.9507696221854727, + "ewc_loss": 0.008662267588078976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.662267646286637e-05, + "grad_norm": 4.270149230957031, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8862497210502625, + "num_tokens": 884935820.0, + "step": 23196 + }, + { + "epoch": 2.950896832464063, + "ewc_loss": 0.00868420023471117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.684200292918831e-05, + "grad_norm": 4.321421146392822, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8808988928794861, + "num_tokens": 884976042.0, + "step": 23197 + }, + { + "epoch": 2.9510240427426537, + "ewc_loss": 0.008700169622898102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.700169564690441e-05, + "grad_norm": 4.256772994995117, + "learning_rate": 1e-06, + "loss": 0.2781, + "mean_token_accuracy": 0.9024870991706848, + "num_tokens": 885013511.0, + "step": 23198 + }, + { + "epoch": 2.9511512530212443, + "ewc_loss": 0.008643041364848614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.643041655886918e-05, + "grad_norm": 4.43443489074707, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8727482557296753, + "num_tokens": 885047291.0, + "step": 23199 + }, + { + "epoch": 2.951278463299835, + "ewc_loss": 0.008773731999099255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.773732406552881e-05, + "grad_norm": 4.342166900634766, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8781952857971191, + "num_tokens": 885083116.0, + "step": 23200 + }, + { + "epoch": 2.9514056735784253, + "ewc_loss": 0.008640483021736145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.640483429189771e-05, + "grad_norm": 4.32793664932251, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8795404434204102, + "num_tokens": 885123282.0, + "step": 23201 + }, + { + "epoch": 2.9515328838570154, + "ewc_loss": 0.008683114312589169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.683114720042795e-05, + "grad_norm": 4.247188091278076, + "learning_rate": 1e-06, + "loss": 0.3224, + 
"mean_token_accuracy": 0.8881577253341675, + "num_tokens": 885166277.0, + "step": 23202 + }, + { + "epoch": 2.9516600941356064, + "ewc_loss": 0.008621261455118656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62126107676886e-05, + "grad_norm": 4.279992580413818, + "learning_rate": 1e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8943015336990356, + "num_tokens": 885206414.0, + "step": 23203 + }, + { + "epoch": 2.9517873044141965, + "ewc_loss": 0.008663740009069443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.663739572511986e-05, + "grad_norm": 4.351864337921143, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8916430473327637, + "num_tokens": 885240977.0, + "step": 23204 + }, + { + "epoch": 2.9519145146927874, + "ewc_loss": 0.00868162326514721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.681623148731887e-05, + "grad_norm": 4.251176834106445, + "learning_rate": 1e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.8975344896316528, + "num_tokens": 885283190.0, + "step": 23205 + }, + { + "epoch": 2.9520417249713775, + "ewc_loss": 0.008607004769146442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607004565419629e-05, + "grad_norm": 4.329125881195068, + "learning_rate": 1e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8968853950500488, + "num_tokens": 885321340.0, + "step": 23206 + }, + { + "epoch": 2.952168935249968, + "ewc_loss": 0.00866881012916565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.66881018737331e-05, + "grad_norm": 4.306605815887451, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8912800550460815, + "num_tokens": 885357524.0, + "step": 23207 + }, + { + "epoch": 2.9522961455285586, + "ewc_loss": 0.00865201000124216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.652010001242161e-05, + "grad_norm": 4.452993392944336, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8606090545654297, + "num_tokens": 885389696.0, + "step": 23208 + }, + { + "epoch": 
2.952423355807149, + "ewc_loss": 0.008727570064365864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.727570093469694e-05, + "grad_norm": 4.273731231689453, + "learning_rate": 1e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.896388590335846, + "num_tokens": 885429574.0, + "step": 23209 + }, + { + "epoch": 2.9525505660857396, + "ewc_loss": 0.008588667958974838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.588667697040364e-05, + "grad_norm": 4.380764007568359, + "learning_rate": 1e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.88683021068573, + "num_tokens": 885459961.0, + "step": 23210 + }, + { + "epoch": 2.95267777636433, + "ewc_loss": 0.008730735629796982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.730735862627625e-05, + "grad_norm": 4.26448392868042, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8783235549926758, + "num_tokens": 885506725.0, + "step": 23211 + }, + { + "epoch": 2.9528049866429207, + "ewc_loss": 0.008609932847321033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609933138359338e-05, + "grad_norm": 4.264973163604736, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8887577652931213, + "num_tokens": 885549409.0, + "step": 23212 + }, + { + "epoch": 2.952932196921511, + "ewc_loss": 0.00865978840738535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.659788727527484e-05, + "grad_norm": 4.311017990112305, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8870995044708252, + "num_tokens": 885589272.0, + "step": 23213 + }, + { + "epoch": 2.9530594072001017, + "ewc_loss": 0.008688343688845634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.688343223184347e-05, + "grad_norm": 4.370776176452637, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8658211827278137, + "num_tokens": 885625951.0, + "step": 23214 + }, + { + "epoch": 2.9531866174786923, + "ewc_loss": 0.008691973052918911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.691973198438063e-05, + "grad_norm": 4.339043617248535, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.88543701171875, + "num_tokens": 885665844.0, + "step": 23215 + }, + { + "epoch": 2.953313827757283, + "ewc_loss": 0.008662855252623558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.662854816066101e-05, + "grad_norm": 4.316204071044922, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8835605978965759, + "num_tokens": 885705616.0, + "step": 23216 + }, + { + "epoch": 2.9534410380358733, + "ewc_loss": 0.008661211468279362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.661211177241057e-05, + "grad_norm": 4.2817277908325195, + "learning_rate": 1e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.8937828540802002, + "num_tokens": 885752362.0, + "step": 23217 + }, + { + "epoch": 2.953568248314464, + "ewc_loss": 0.008623732253909111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623732719570398e-05, + "grad_norm": 4.286070346832275, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8792279958724976, + "num_tokens": 885792485.0, + "step": 23218 + }, + { + "epoch": 2.9536954585930544, + "ewc_loss": 0.008641108870506287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.641109161544591e-05, + "grad_norm": 4.304633617401123, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8844054937362671, + "num_tokens": 885834809.0, + "step": 23219 + }, + { + "epoch": 2.953822668871645, + "ewc_loss": 0.008622289635241032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622289897175506e-05, + "grad_norm": 4.29766321182251, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.887118935585022, + "num_tokens": 885870555.0, + "step": 23220 + }, + { + "epoch": 2.9539498791502354, + "ewc_loss": 0.008614696562290192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614696707809344e-05, + "grad_norm": 4.31660795211792, + "learning_rate": 1e-06, + "loss": 0.3285, + 
"mean_token_accuracy": 0.8847411870956421, + "num_tokens": 885911911.0, + "step": 23221 + }, + { + "epoch": 2.954077089428826, + "ewc_loss": 0.008605274371802807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.605274342698976e-05, + "grad_norm": 4.3936004638671875, + "learning_rate": 1e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.8947733640670776, + "num_tokens": 885941094.0, + "step": 23222 + }, + { + "epoch": 2.9542042997074165, + "ewc_loss": 0.008640915155410767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.640915621072054e-05, + "grad_norm": 4.369184970855713, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8817683458328247, + "num_tokens": 885978094.0, + "step": 23223 + }, + { + "epoch": 2.954331509986007, + "ewc_loss": 0.008594267070293427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59426727402024e-05, + "grad_norm": 4.290111541748047, + "learning_rate": 1e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8982624411582947, + "num_tokens": 886014859.0, + "step": 23224 + }, + { + "epoch": 2.954458720264597, + "ewc_loss": 0.008570678532123566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.570678619435057e-05, + "grad_norm": 4.322311878204346, + "learning_rate": 1e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.888109028339386, + "num_tokens": 886048317.0, + "step": 23225 + }, + { + "epoch": 2.954585930543188, + "ewc_loss": 0.008623236790299416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623236499261111e-05, + "grad_norm": 4.339136123657227, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.882753312587738, + "num_tokens": 886081797.0, + "step": 23226 + }, + { + "epoch": 2.954713140821778, + "ewc_loss": 0.008593186736106873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.593186794314533e-05, + "grad_norm": 4.279119968414307, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8709999918937683, + "num_tokens": 886124111.0, + "step": 23227 + }, + { + "epoch": 
2.954840351100369, + "ewc_loss": 0.008590585552155972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.590585639467463e-05, + "grad_norm": 4.281773090362549, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8922947645187378, + "num_tokens": 886162823.0, + "step": 23228 + }, + { + "epoch": 2.954967561378959, + "ewc_loss": 0.00859842449426651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598424028605223e-05, + "grad_norm": 4.278722763061523, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8867452144622803, + "num_tokens": 886204706.0, + "step": 23229 + }, + { + "epoch": 2.95509477165755, + "ewc_loss": 0.00859543215483427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59543215483427e-05, + "grad_norm": 4.299363613128662, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8916648626327515, + "num_tokens": 886241461.0, + "step": 23230 + }, + { + "epoch": 2.9552219819361403, + "ewc_loss": 0.008610709570348263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610709483036771e-05, + "grad_norm": 4.3284912109375, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8805875778198242, + "num_tokens": 886279276.0, + "step": 23231 + }, + { + "epoch": 2.955349192214731, + "ewc_loss": 0.00861950870603323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619508298579603e-05, + "grad_norm": 4.303643703460693, + "learning_rate": 1e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8852201104164124, + "num_tokens": 886317703.0, + "step": 23232 + }, + { + "epoch": 2.9554764024933213, + "ewc_loss": 0.008587504737079144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.587504999013618e-05, + "grad_norm": 4.312491416931152, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8884035348892212, + "num_tokens": 886355235.0, + "step": 23233 + }, + { + "epoch": 2.955603612771912, + "ewc_loss": 0.008625540882349014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625540795037523e-05, + 
"grad_norm": 4.302398681640625, + "learning_rate": 1e-06, + "loss": 0.2688, + "mean_token_accuracy": 0.9045432806015015, + "num_tokens": 886387354.0, + "step": 23234 + }, + { + "epoch": 2.9557308230505024, + "ewc_loss": 0.008609218522906303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609218639321625e-05, + "grad_norm": 4.394256591796875, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8821009397506714, + "num_tokens": 886419742.0, + "step": 23235 + }, + { + "epoch": 2.955858033329093, + "ewc_loss": 0.008682645857334137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.682646148372442e-05, + "grad_norm": 4.321951866149902, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8858157992362976, + "num_tokens": 886455218.0, + "step": 23236 + }, + { + "epoch": 2.9559852436076834, + "ewc_loss": 0.008614334277808666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614334365120158e-05, + "grad_norm": 4.3192853927612305, + "learning_rate": 1e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8859233260154724, + "num_tokens": 886494454.0, + "step": 23237 + }, + { + "epoch": 2.956112453886274, + "ewc_loss": 0.00865511316806078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.65511319716461e-05, + "grad_norm": 4.356684684753418, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.864640474319458, + "num_tokens": 886532854.0, + "step": 23238 + }, + { + "epoch": 2.9562396641648645, + "ewc_loss": 0.008672221563756466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67222115630284e-05, + "grad_norm": 4.377042293548584, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8820885419845581, + "num_tokens": 886564288.0, + "step": 23239 + }, + { + "epoch": 2.956366874443455, + "ewc_loss": 0.00868722703307867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.687227091286331e-05, + "grad_norm": 4.342929363250732, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8868706226348877, + 
"num_tokens": 886597155.0, + "step": 23240 + }, + { + "epoch": 2.9564940847220456, + "ewc_loss": 0.00865707453340292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.657074795337394e-05, + "grad_norm": 4.359599590301514, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8869286775588989, + "num_tokens": 886632015.0, + "step": 23241 + }, + { + "epoch": 2.956621295000636, + "ewc_loss": 0.008695612661540508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.695612632436678e-05, + "grad_norm": 4.282733917236328, + "learning_rate": 1e-06, + "loss": 0.2817, + "mean_token_accuracy": 0.8998007774353027, + "num_tokens": 886665736.0, + "step": 23242 + }, + { + "epoch": 2.9567485052792266, + "ewc_loss": 0.008662685751914978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.662685286253691e-05, + "grad_norm": 4.321723461151123, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8850001692771912, + "num_tokens": 886702652.0, + "step": 23243 + }, + { + "epoch": 2.956875715557817, + "ewc_loss": 0.0087081678211689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.708168024895713e-05, + "grad_norm": 4.279496192932129, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8814713954925537, + "num_tokens": 886743573.0, + "step": 23244 + }, + { + "epoch": 2.9570029258364077, + "ewc_loss": 0.008691330440342426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.691330731380731e-05, + "grad_norm": 4.346759796142578, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8874795436859131, + "num_tokens": 886777932.0, + "step": 23245 + }, + { + "epoch": 2.957130136114998, + "ewc_loss": 0.008729959838092327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.729959517950192e-05, + "grad_norm": 4.3927412033081055, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8726953268051147, + "num_tokens": 886813211.0, + "step": 23246 + }, + { + "epoch": 2.9572573463935887, + "ewc_loss": 0.008738982491195202, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.738982432987541e-05, + "grad_norm": 4.3112568855285645, + "learning_rate": 1e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8897636532783508, + "num_tokens": 886849416.0, + "step": 23247 + }, + { + "epoch": 2.9573845566721793, + "ewc_loss": 0.00867959763854742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.679597522132099e-05, + "grad_norm": 4.327316761016846, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8689950704574585, + "num_tokens": 886887983.0, + "step": 23248 + }, + { + "epoch": 2.95751176695077, + "ewc_loss": 0.008727439679205418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.7274398538284e-05, + "grad_norm": 4.331668376922607, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8676975965499878, + "num_tokens": 886930867.0, + "step": 23249 + }, + { + "epoch": 2.95763897722936, + "ewc_loss": 0.008715257048606873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.715256990399212e-05, + "grad_norm": 4.267114162445068, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8812941312789917, + "num_tokens": 886971862.0, + "step": 23250 + }, + { + "epoch": 2.957766187507951, + "ewc_loss": 0.008691488765180111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.691488619660959e-05, + "grad_norm": 4.339121341705322, + "learning_rate": 1e-06, + "loss": 0.3113, + "mean_token_accuracy": 0.8900858163833618, + "num_tokens": 887005238.0, + "step": 23251 + }, + { + "epoch": 2.957893397786541, + "ewc_loss": 0.008751082234084606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.751082350499928e-05, + "grad_norm": 4.409094333648682, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8781813979148865, + "num_tokens": 887035588.0, + "step": 23252 + }, + { + "epoch": 2.958020608065132, + "ewc_loss": 0.008760114200413227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.760113996686414e-05, + "grad_norm": 4.2721028327941895, + "learning_rate": 
1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8849157691001892, + "num_tokens": 887079962.0, + "step": 23253 + }, + { + "epoch": 2.958147818343722, + "ewc_loss": 0.008670664392411709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.670664101373404e-05, + "grad_norm": 4.312856197357178, + "learning_rate": 1e-06, + "loss": 0.2754, + "mean_token_accuracy": 0.9029427766799927, + "num_tokens": 887114572.0, + "step": 23254 + }, + { + "epoch": 2.958275028622313, + "ewc_loss": 0.008744591847062111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.744591468712315e-05, + "grad_norm": 4.340365886688232, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8891226649284363, + "num_tokens": 887149396.0, + "step": 23255 + }, + { + "epoch": 2.958402238900903, + "ewc_loss": 0.008731561712920666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.731561683816835e-05, + "grad_norm": 4.302967071533203, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8786231279373169, + "num_tokens": 887191916.0, + "step": 23256 + }, + { + "epoch": 2.9585294491794936, + "ewc_loss": 0.008685928769409657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6859283328522e-05, + "grad_norm": 4.27901029586792, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8923538327217102, + "num_tokens": 887228263.0, + "step": 23257 + }, + { + "epoch": 2.958656659458084, + "ewc_loss": 0.008697343990206718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.697343582753092e-05, + "grad_norm": 4.356573581695557, + "learning_rate": 1e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.8981000781059265, + "num_tokens": 887258688.0, + "step": 23258 + }, + { + "epoch": 2.9587838697366746, + "ewc_loss": 0.008745472878217697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.745472587179393e-05, + "grad_norm": 4.266489028930664, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8800435066223145, + "num_tokens": 887301565.0, + "step": 23259 + }, + { 
+ "epoch": 2.958911080015265, + "ewc_loss": 0.008661115542054176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.66111513460055e-05, + "grad_norm": 4.319618225097656, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8805263042449951, + "num_tokens": 887337511.0, + "step": 23260 + }, + { + "epoch": 2.9590382902938557, + "ewc_loss": 0.008705789223313332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.705789514351636e-05, + "grad_norm": 4.325450420379639, + "learning_rate": 1e-06, + "loss": 0.3112, + "mean_token_accuracy": 0.8878694772720337, + "num_tokens": 887370694.0, + "step": 23261 + }, + { + "epoch": 2.959165500572446, + "ewc_loss": 0.008698915131390095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.698915189597756e-05, + "grad_norm": 4.2825398445129395, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8845748901367188, + "num_tokens": 887412131.0, + "step": 23262 + }, + { + "epoch": 2.9592927108510367, + "ewc_loss": 0.008656994439661503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.656994032207876e-05, + "grad_norm": 4.35865592956543, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8784250020980835, + "num_tokens": 887447179.0, + "step": 23263 + }, + { + "epoch": 2.9594199211296273, + "ewc_loss": 0.0087093161419034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.709316171007231e-05, + "grad_norm": 4.309996604919434, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.87931227684021, + "num_tokens": 887486079.0, + "step": 23264 + }, + { + "epoch": 2.959547131408218, + "ewc_loss": 0.008681317791342735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.681317558512092e-05, + "grad_norm": 4.373546600341797, + "learning_rate": 1e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8909154534339905, + "num_tokens": 887519092.0, + "step": 23265 + }, + { + "epoch": 2.9596743416868083, + "ewc_loss": 0.008728298358619213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.728298416826874e-05, + "grad_norm": 4.376766681671143, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8869482278823853, + "num_tokens": 887552008.0, + "step": 23266 + }, + { + "epoch": 2.959801551965399, + "ewc_loss": 0.008711555041372776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.711554983165115e-05, + "grad_norm": 4.379950046539307, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8793630003929138, + "num_tokens": 887585827.0, + "step": 23267 + }, + { + "epoch": 2.9599287622439894, + "ewc_loss": 0.00870058685541153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.700587204657495e-05, + "grad_norm": 4.2787981033325195, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8824331760406494, + "num_tokens": 887628041.0, + "step": 23268 + }, + { + "epoch": 2.96005597252258, + "ewc_loss": 0.008669203147292137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.669203089084476e-05, + "grad_norm": 4.281789779663086, + "learning_rate": 1e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8886555433273315, + "num_tokens": 887667210.0, + "step": 23269 + }, + { + "epoch": 2.9601831828011704, + "ewc_loss": 0.008691256865859032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.691257244208828e-05, + "grad_norm": 4.2934889793396, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8914493918418884, + "num_tokens": 887705429.0, + "step": 23270 + }, + { + "epoch": 2.960310393079761, + "ewc_loss": 0.008675288408994675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.675287972437218e-05, + "grad_norm": 4.30604887008667, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8665033578872681, + "num_tokens": 887753188.0, + "step": 23271 + }, + { + "epoch": 2.9604376033583515, + "ewc_loss": 0.008680155500769615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.680155588081107e-05, + "grad_norm": 4.344817638397217, + "learning_rate": 1e-06, + "loss": 0.3755, + 
"mean_token_accuracy": 0.8668370842933655, + "num_tokens": 887795907.0, + "step": 23272 + }, + { + "epoch": 2.960564813636942, + "ewc_loss": 0.008694110438227654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.69411087478511e-05, + "grad_norm": 4.333471298217773, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8824708461761475, + "num_tokens": 887834859.0, + "step": 23273 + }, + { + "epoch": 2.9606920239155325, + "ewc_loss": 0.008660154417157173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.660154708195478e-05, + "grad_norm": 4.281630039215088, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8788676857948303, + "num_tokens": 887879041.0, + "step": 23274 + }, + { + "epoch": 2.9608192341941226, + "ewc_loss": 0.008655283600091934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.655283454572782e-05, + "grad_norm": 4.3862457275390625, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8814451694488525, + "num_tokens": 887913752.0, + "step": 23275 + }, + { + "epoch": 2.9609464444727136, + "ewc_loss": 0.008720271289348602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.720271580386907e-05, + "grad_norm": 4.30687141418457, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.889491081237793, + "num_tokens": 887950946.0, + "step": 23276 + }, + { + "epoch": 2.9610736547513037, + "ewc_loss": 0.008642240427434444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.642240572953597e-05, + "grad_norm": 4.287035942077637, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.881212592124939, + "num_tokens": 887993155.0, + "step": 23277 + }, + { + "epoch": 2.9612008650298947, + "ewc_loss": 0.008675267919898033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67526832735166e-05, + "grad_norm": 4.360683441162109, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8825613260269165, + "num_tokens": 888027831.0, + "step": 23278 + }, + { + "epoch": 
2.9613280753084847, + "ewc_loss": 0.00870670098811388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.706701191840693e-05, + "grad_norm": 4.2605767250061035, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8938791751861572, + "num_tokens": 888066941.0, + "step": 23279 + }, + { + "epoch": 2.9614552855870753, + "ewc_loss": 0.008640013635158539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.640013402327895e-05, + "grad_norm": 4.31837797164917, + "learning_rate": 1e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8976576328277588, + "num_tokens": 888099739.0, + "step": 23280 + }, + { + "epoch": 2.961582495865666, + "ewc_loss": 0.00870165228843689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.701652404852211e-05, + "grad_norm": 4.289346218109131, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8853535652160645, + "num_tokens": 888139675.0, + "step": 23281 + }, + { + "epoch": 2.9617097061442563, + "ewc_loss": 0.008659206330776215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.659205923322588e-05, + "grad_norm": 4.345088005065918, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8725305199623108, + "num_tokens": 888179374.0, + "step": 23282 + }, + { + "epoch": 2.961836916422847, + "ewc_loss": 0.008687335066497326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.687335503054783e-05, + "grad_norm": 4.362061023712158, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8705406188964844, + "num_tokens": 888214139.0, + "step": 23283 + }, + { + "epoch": 2.9619641267014374, + "ewc_loss": 0.008689590729773045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.689590322319418e-05, + "grad_norm": 4.365300178527832, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8826313018798828, + "num_tokens": 888246365.0, + "step": 23284 + }, + { + "epoch": 2.962091336980028, + "ewc_loss": 0.008673781529068947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.67378112161532e-05, + "grad_norm": 4.264366149902344, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8786778450012207, + "num_tokens": 888290903.0, + "step": 23285 + }, + { + "epoch": 2.9622185472586184, + "ewc_loss": 0.008640739135444164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64073954289779e-05, + "grad_norm": 4.290750980377197, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8809539079666138, + "num_tokens": 888331329.0, + "step": 23286 + }, + { + "epoch": 2.962345757537209, + "ewc_loss": 0.008677320554852486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.677320147398859e-05, + "grad_norm": 4.307944297790527, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8771581649780273, + "num_tokens": 888372411.0, + "step": 23287 + }, + { + "epoch": 2.9624729678157995, + "ewc_loss": 0.008653376251459122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.653376426082104e-05, + "grad_norm": 4.286880016326904, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8833192586898804, + "num_tokens": 888409778.0, + "step": 23288 + }, + { + "epoch": 2.96260017809439, + "ewc_loss": 0.008644848130643368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64484827616252e-05, + "grad_norm": 4.270484924316406, + "learning_rate": 1e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8893324136734009, + "num_tokens": 888453593.0, + "step": 23289 + }, + { + "epoch": 2.9627273883729806, + "ewc_loss": 0.008631866425275803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631866512587294e-05, + "grad_norm": 4.3249125480651855, + "learning_rate": 1e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.8957817554473877, + "num_tokens": 888488245.0, + "step": 23290 + }, + { + "epoch": 2.962854598651571, + "ewc_loss": 0.00868831854313612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.688318484928459e-05, + "grad_norm": 4.378006458282471, + "learning_rate": 1e-06, + "loss": 0.331, + 
"mean_token_accuracy": 0.8837335109710693, + "num_tokens": 888524306.0, + "step": 23291 + }, + { + "epoch": 2.9629818089301616, + "ewc_loss": 0.008655451238155365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.655450801597908e-05, + "grad_norm": 4.252892971038818, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8959494233131409, + "num_tokens": 888563791.0, + "step": 23292 + }, + { + "epoch": 2.963109019208752, + "ewc_loss": 0.008585089817643166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.58509010868147e-05, + "grad_norm": 4.308065414428711, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8895683288574219, + "num_tokens": 888605021.0, + "step": 23293 + }, + { + "epoch": 2.9632362294873427, + "ewc_loss": 0.00866885855793953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.668858936289325e-05, + "grad_norm": 4.288392066955566, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8850204348564148, + "num_tokens": 888647382.0, + "step": 23294 + }, + { + "epoch": 2.963363439765933, + "ewc_loss": 0.008595460094511509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595460531068966e-05, + "grad_norm": 4.3416829109191895, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8830190896987915, + "num_tokens": 888682174.0, + "step": 23295 + }, + { + "epoch": 2.9634906500445237, + "ewc_loss": 0.008645779453217983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.645779598737136e-05, + "grad_norm": 4.310208320617676, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8847112059593201, + "num_tokens": 888724401.0, + "step": 23296 + }, + { + "epoch": 2.9636178603231143, + "ewc_loss": 0.008605092763900757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.605092443758622e-05, + "grad_norm": 4.2690277099609375, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8906127214431763, + "num_tokens": 888766230.0, + "step": 23297 + }, + { + "epoch": 
2.963745070601705, + "ewc_loss": 0.008595972321927547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595972758485004e-05, + "grad_norm": 4.3292365074157715, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8632718324661255, + "num_tokens": 888811865.0, + "step": 23298 + }, + { + "epoch": 2.9638722808802953, + "ewc_loss": 0.008616255596280098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.616255217930302e-05, + "grad_norm": 4.32299280166626, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8833811283111572, + "num_tokens": 888849549.0, + "step": 23299 + }, + { + "epoch": 2.9639994911588854, + "ewc_loss": 0.008587528020143509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.587528282077983e-05, + "grad_norm": 4.458217620849609, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.885373055934906, + "num_tokens": 888881305.0, + "step": 23300 + }, + { + "epoch": 2.9641267014374764, + "ewc_loss": 0.008667641319334507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667641668580472e-05, + "grad_norm": 4.296905517578125, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.889284610748291, + "num_tokens": 888917862.0, + "step": 23301 + }, + { + "epoch": 2.9642539117160664, + "ewc_loss": 0.008526045829057693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.526045712642372e-05, + "grad_norm": 4.326314449310303, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8836453557014465, + "num_tokens": 888953883.0, + "step": 23302 + }, + { + "epoch": 2.9643811219946574, + "ewc_loss": 0.008615085855126381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615085971541703e-05, + "grad_norm": 4.323075771331787, + "learning_rate": 1e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.8994723558425903, + "num_tokens": 888991052.0, + "step": 23303 + }, + { + "epoch": 2.9645083322732475, + "ewc_loss": 0.008597810752689838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.597810665378347e-05, + "grad_norm": 4.352302074432373, + "learning_rate": 1e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8960614204406738, + "num_tokens": 889022523.0, + "step": 23304 + }, + { + "epoch": 2.964635542551838, + "ewc_loss": 0.008603517897427082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.603517926530913e-05, + "grad_norm": 4.30791711807251, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8763025999069214, + "num_tokens": 889063470.0, + "step": 23305 + }, + { + "epoch": 2.9647627528304286, + "ewc_loss": 0.008585406467318535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.585406612837687e-05, + "grad_norm": 4.280208587646484, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8906928896903992, + "num_tokens": 889101160.0, + "step": 23306 + }, + { + "epoch": 2.964889963109019, + "ewc_loss": 0.008582491427659988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582491864217445e-05, + "grad_norm": 4.338918209075928, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8805351853370667, + "num_tokens": 889139128.0, + "step": 23307 + }, + { + "epoch": 2.9650171733876096, + "ewc_loss": 0.008613945916295052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61394582898356e-05, + "grad_norm": 4.335398197174072, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8615471124649048, + "num_tokens": 889175574.0, + "step": 23308 + }, + { + "epoch": 2.9651443836662, + "ewc_loss": 0.008586169220507145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586169133195654e-05, + "grad_norm": 4.281201362609863, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8819496631622314, + "num_tokens": 889216520.0, + "step": 23309 + }, + { + "epoch": 2.9652715939447907, + "ewc_loss": 0.008577036671340466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.577037078794092e-05, + "grad_norm": 4.276432037353516, + "learning_rate": 1e-06, + "loss": 0.3045, + 
"mean_token_accuracy": 0.8956407308578491, + "num_tokens": 889256746.0, + "step": 23310 + }, + { + "epoch": 2.965398804223381, + "ewc_loss": 0.008600154891610146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.600154978921637e-05, + "grad_norm": 4.303246974945068, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8781143426895142, + "num_tokens": 889298868.0, + "step": 23311 + }, + { + "epoch": 2.9655260145019717, + "ewc_loss": 0.008617665618658066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.617666026111692e-05, + "grad_norm": 4.353945732116699, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8812426328659058, + "num_tokens": 889337758.0, + "step": 23312 + }, + { + "epoch": 2.9656532247805623, + "ewc_loss": 0.008627718314528465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627718489151448e-05, + "grad_norm": 4.3106889724731445, + "learning_rate": 1e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8871055841445923, + "num_tokens": 889375308.0, + "step": 23313 + }, + { + "epoch": 2.965780435059153, + "ewc_loss": 0.008591527119278908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.591527148382738e-05, + "grad_norm": 4.301520824432373, + "learning_rate": 1e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8920261263847351, + "num_tokens": 889413735.0, + "step": 23314 + }, + { + "epoch": 2.9659076453377433, + "ewc_loss": 0.008600044064223766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60004365677014e-05, + "grad_norm": 4.338191509246826, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8861185908317566, + "num_tokens": 889451585.0, + "step": 23315 + }, + { + "epoch": 2.966034855616334, + "ewc_loss": 0.008614303544163704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614303078502417e-05, + "grad_norm": 4.384599685668945, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8694277405738831, + "num_tokens": 889487405.0, + "step": 23316 + }, + { + "epoch": 
2.9661620658949244, + "ewc_loss": 0.008638338185846806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63833847688511e-05, + "grad_norm": 4.3486480712890625, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8734580278396606, + "num_tokens": 889523051.0, + "step": 23317 + }, + { + "epoch": 2.966289276173515, + "ewc_loss": 0.0085973609238863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.59736101119779e-05, + "grad_norm": 4.342955589294434, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8651363849639893, + "num_tokens": 889559822.0, + "step": 23318 + }, + { + "epoch": 2.9664164864521054, + "ewc_loss": 0.008609585464000702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60958534758538e-05, + "grad_norm": 4.349495887756348, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.883142352104187, + "num_tokens": 889591598.0, + "step": 23319 + }, + { + "epoch": 2.966543696730696, + "ewc_loss": 0.008627130649983883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627130591776222e-05, + "grad_norm": 4.30210542678833, + "learning_rate": 1e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8915551900863647, + "num_tokens": 889629787.0, + "step": 23320 + }, + { + "epoch": 2.9666709070092865, + "ewc_loss": 0.008619872853159904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619872824056074e-05, + "grad_norm": 4.277740478515625, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.888938307762146, + "num_tokens": 889668730.0, + "step": 23321 + }, + { + "epoch": 2.966798117287877, + "ewc_loss": 0.008631153032183647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631152741145343e-05, + "grad_norm": 4.3842644691467285, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8815129399299622, + "num_tokens": 889705215.0, + "step": 23322 + }, + { + "epoch": 2.966925327566467, + "ewc_loss": 0.008697004057466984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.69700379553251e-05, 
+ "grad_norm": 4.366096019744873, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8837666511535645, + "num_tokens": 889738528.0, + "step": 23323 + }, + { + "epoch": 2.967052537845058, + "ewc_loss": 0.008665339089930058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.665338827995583e-05, + "grad_norm": 4.3795881271362305, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8867290019989014, + "num_tokens": 889771694.0, + "step": 23324 + }, + { + "epoch": 2.967179748123648, + "ewc_loss": 0.008679105900228024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67910566739738e-05, + "grad_norm": 4.265106201171875, + "learning_rate": 1e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8958265781402588, + "num_tokens": 889806096.0, + "step": 23325 + }, + { + "epoch": 2.967306958402239, + "ewc_loss": 0.008629064075648785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62906381371431e-05, + "grad_norm": 4.288341522216797, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8824378252029419, + "num_tokens": 889848514.0, + "step": 23326 + }, + { + "epoch": 2.967434168680829, + "ewc_loss": 0.008692730218172073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.692729898029938e-05, + "grad_norm": 4.2366623878479, + "learning_rate": 1e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.903141975402832, + "num_tokens": 889889689.0, + "step": 23327 + }, + { + "epoch": 2.96756137895942, + "ewc_loss": 0.008652121759951115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.652121323393658e-05, + "grad_norm": 4.337015151977539, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.88703453540802, + "num_tokens": 889925550.0, + "step": 23328 + }, + { + "epoch": 2.9676885892380103, + "ewc_loss": 0.008714785799384117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.714785508345813e-05, + "grad_norm": 4.331064701080322, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8716787099838257, + 
"num_tokens": 889961797.0, + "step": 23329 + }, + { + "epoch": 2.967815799516601, + "ewc_loss": 0.008670536801218987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.670536772115156e-05, + "grad_norm": 4.294966697692871, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.889004111289978, + "num_tokens": 889997935.0, + "step": 23330 + }, + { + "epoch": 2.9679430097951913, + "ewc_loss": 0.008679941296577454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67994167492725e-05, + "grad_norm": 4.2865447998046875, + "learning_rate": 1e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8942360877990723, + "num_tokens": 890037900.0, + "step": 23331 + }, + { + "epoch": 2.968070220073782, + "ewc_loss": 0.008707286790013313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.707286906428635e-05, + "grad_norm": 4.348871231079102, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8892829418182373, + "num_tokens": 890074814.0, + "step": 23332 + }, + { + "epoch": 2.9681974303523724, + "ewc_loss": 0.008698715828359127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.698715828359127e-05, + "grad_norm": 4.271356582641602, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.895915150642395, + "num_tokens": 890107091.0, + "step": 23333 + }, + { + "epoch": 2.968324640630963, + "ewc_loss": 0.008646161295473576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64616158651188e-05, + "grad_norm": 4.346397399902344, + "learning_rate": 1e-06, + "loss": 0.3152, + "mean_token_accuracy": 0.893559455871582, + "num_tokens": 890140558.0, + "step": 23334 + }, + { + "epoch": 2.9684518509095534, + "ewc_loss": 0.008734961040318012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.734961011214182e-05, + "grad_norm": 4.34760856628418, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8844438791275024, + "num_tokens": 890174563.0, + "step": 23335 + }, + { + "epoch": 2.968579061188144, + "ewc_loss": 0.00868038646876812, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.680386235937476e-05, + "grad_norm": 4.350035190582275, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8833153247833252, + "num_tokens": 890210584.0, + "step": 23336 + }, + { + "epoch": 2.9687062714667345, + "ewc_loss": 0.008686883375048637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.68688293849118e-05, + "grad_norm": 4.264599323272705, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8958872556686401, + "num_tokens": 890251955.0, + "step": 23337 + }, + { + "epoch": 2.968833481745325, + "ewc_loss": 0.00864248163998127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.642481407150626e-05, + "grad_norm": 4.287277698516846, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8792058229446411, + "num_tokens": 890289095.0, + "step": 23338 + }, + { + "epoch": 2.9689606920239155, + "ewc_loss": 0.008675161749124527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.675162098370492e-05, + "grad_norm": 4.286869049072266, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8867080807685852, + "num_tokens": 890327864.0, + "step": 23339 + }, + { + "epoch": 2.969087902302506, + "ewc_loss": 0.008659019134938717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.659018931211904e-05, + "grad_norm": 4.315829753875732, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8861500024795532, + "num_tokens": 890364244.0, + "step": 23340 + }, + { + "epoch": 2.9692151125810966, + "ewc_loss": 0.008675830438733101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.675830031279474e-05, + "grad_norm": 4.367390155792236, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8734438419342041, + "num_tokens": 890397652.0, + "step": 23341 + }, + { + "epoch": 2.969342322859687, + "ewc_loss": 0.008708253502845764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.708253153599799e-05, + "grad_norm": 4.299558639526367, + "learning_rate": 
1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8898884654045105, + "num_tokens": 890435598.0, + "step": 23342 + }, + { + "epoch": 2.9694695331382777, + "ewc_loss": 0.008658998645842075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.658998558530584e-05, + "grad_norm": 4.3257927894592285, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8863136172294617, + "num_tokens": 890473445.0, + "step": 23343 + }, + { + "epoch": 2.969596743416868, + "ewc_loss": 0.008678491227328777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.678491576574743e-05, + "grad_norm": 4.294393062591553, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8878843784332275, + "num_tokens": 890514263.0, + "step": 23344 + }, + { + "epoch": 2.9697239536954587, + "ewc_loss": 0.008659523911774158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.659523882670328e-05, + "grad_norm": 4.312843322753906, + "learning_rate": 1e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8888997435569763, + "num_tokens": 890554123.0, + "step": 23345 + }, + { + "epoch": 2.9698511639740492, + "ewc_loss": 0.008653025142848492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.65302499732934e-05, + "grad_norm": 4.322430610656738, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8841150403022766, + "num_tokens": 890593938.0, + "step": 23346 + }, + { + "epoch": 2.9699783742526398, + "ewc_loss": 0.0086610596626997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.661059837322682e-05, + "grad_norm": 4.3665313720703125, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8746416568756104, + "num_tokens": 890632260.0, + "step": 23347 + }, + { + "epoch": 2.97010558453123, + "ewc_loss": 0.008664289489388466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.664289634907618e-05, + "grad_norm": 4.438138961791992, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8824710845947266, + "num_tokens": 890666852.0, + "step": 23348 + }, 
+ { + "epoch": 2.970232794809821, + "ewc_loss": 0.008697602897882462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.697603334439918e-05, + "grad_norm": 4.301601409912109, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8805826902389526, + "num_tokens": 890707959.0, + "step": 23349 + }, + { + "epoch": 2.970360005088411, + "ewc_loss": 0.008613385260105133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.613384852651507e-05, + "grad_norm": 4.36422872543335, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8649373650550842, + "num_tokens": 890748729.0, + "step": 23350 + }, + { + "epoch": 2.970487215367002, + "ewc_loss": 0.00869803037494421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.698030433151871e-05, + "grad_norm": 4.349289894104004, + "learning_rate": 1e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8954877257347107, + "num_tokens": 890782116.0, + "step": 23351 + }, + { + "epoch": 2.970614425645592, + "ewc_loss": 0.0086291478946805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.629148214822635e-05, + "grad_norm": 4.247544288635254, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8724149465560913, + "num_tokens": 890827309.0, + "step": 23352 + }, + { + "epoch": 2.970741635924183, + "ewc_loss": 0.0085978452116251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597844862379134e-05, + "grad_norm": 4.381828308105469, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8745759725570679, + "num_tokens": 890863879.0, + "step": 23353 + }, + { + "epoch": 2.970868846202773, + "ewc_loss": 0.008717648684978485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.717648597666994e-05, + "grad_norm": 4.315158367156982, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8870912790298462, + "num_tokens": 890901184.0, + "step": 23354 + }, + { + "epoch": 2.9709960564813636, + "ewc_loss": 0.00862459372729063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.624593465356156e-05, + "grad_norm": 4.305096626281738, + "learning_rate": 1e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8932561874389648, + "num_tokens": 890937779.0, + "step": 23355 + }, + { + "epoch": 2.971123266759954, + "ewc_loss": 0.008644585497677326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.644585614092648e-05, + "grad_norm": 4.2769036293029785, + "learning_rate": 1e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.8946858644485474, + "num_tokens": 890978055.0, + "step": 23356 + }, + { + "epoch": 2.9712504770385446, + "ewc_loss": 0.008640618063509464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.640618034405634e-05, + "grad_norm": 4.431256294250488, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8554310202598572, + "num_tokens": 891010985.0, + "step": 23357 + }, + { + "epoch": 2.971377687317135, + "ewc_loss": 0.00872774701565504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.727746899239719e-05, + "grad_norm": 4.345442771911621, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8877040147781372, + "num_tokens": 891045904.0, + "step": 23358 + }, + { + "epoch": 2.9715048975957257, + "ewc_loss": 0.008654074743390083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.654074918013066e-05, + "grad_norm": 4.26443338394165, + "learning_rate": 1e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8886773586273193, + "num_tokens": 891086240.0, + "step": 23359 + }, + { + "epoch": 2.971632107874316, + "ewc_loss": 0.008621304295957088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.621304004918784e-05, + "grad_norm": 4.3120245933532715, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8824101686477661, + "num_tokens": 891126482.0, + "step": 23360 + }, + { + "epoch": 2.9717593181529067, + "ewc_loss": 0.008683579042553902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.68357892613858e-05, + "grad_norm": 4.276408672332764, + "learning_rate": 1e-06, + "loss": 0.3635, + 
"mean_token_accuracy": 0.8736706376075745, + "num_tokens": 891165982.0, + "step": 23361 + }, + { + "epoch": 2.9718865284314973, + "ewc_loss": 0.008660529740154743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.660529420012608e-05, + "grad_norm": 4.333343982696533, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8712056875228882, + "num_tokens": 891199101.0, + "step": 23362 + }, + { + "epoch": 2.972013738710088, + "ewc_loss": 0.008707016706466675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.707016968401149e-05, + "grad_norm": 4.338675022125244, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8833127617835999, + "num_tokens": 891231511.0, + "step": 23363 + }, + { + "epoch": 2.9721409489886783, + "ewc_loss": 0.008698398247361183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.698397869011387e-05, + "grad_norm": 4.401567459106445, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8726797103881836, + "num_tokens": 891262369.0, + "step": 23364 + }, + { + "epoch": 2.972268159267269, + "ewc_loss": 0.00871824100613594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.718240860616788e-05, + "grad_norm": 4.304309368133545, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8698169589042664, + "num_tokens": 891304206.0, + "step": 23365 + }, + { + "epoch": 2.9723953695458594, + "ewc_loss": 0.008688582107424736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.688581874594092e-05, + "grad_norm": 4.272772789001465, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8720746040344238, + "num_tokens": 891348006.0, + "step": 23366 + }, + { + "epoch": 2.97252257982445, + "ewc_loss": 0.008705423213541508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.705422806087881e-05, + "grad_norm": 4.257593154907227, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8913271427154541, + "num_tokens": 891386851.0, + "step": 23367 + }, + { + "epoch": 
2.9726497901030404, + "ewc_loss": 0.008689149282872677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.689149399287999e-05, + "grad_norm": 4.317683219909668, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8928842544555664, + "num_tokens": 891423582.0, + "step": 23368 + }, + { + "epoch": 2.972777000381631, + "ewc_loss": 0.008740521036088467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.740521298022941e-05, + "grad_norm": 4.276134967803955, + "learning_rate": 1e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.8974431157112122, + "num_tokens": 891463773.0, + "step": 23369 + }, + { + "epoch": 2.9729042106602215, + "ewc_loss": 0.008681935258209705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.681935287313536e-05, + "grad_norm": 4.35921049118042, + "learning_rate": 1e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8929859399795532, + "num_tokens": 891498117.0, + "step": 23370 + }, + { + "epoch": 2.973031420938812, + "ewc_loss": 0.008720587939023972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.720588084543124e-05, + "grad_norm": 4.359237194061279, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8912761211395264, + "num_tokens": 891529055.0, + "step": 23371 + }, + { + "epoch": 2.9731586312174025, + "ewc_loss": 0.008683515712618828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.683515625307336e-05, + "grad_norm": 4.2957353591918945, + "learning_rate": 1e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.8933373689651489, + "num_tokens": 891567370.0, + "step": 23372 + }, + { + "epoch": 2.9732858414959926, + "ewc_loss": 0.008639986626803875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.639986481284723e-05, + "grad_norm": 4.375500679016113, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8862980008125305, + "num_tokens": 891602737.0, + "step": 23373 + }, + { + "epoch": 2.9734130517745836, + "ewc_loss": 0.00869008433073759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.690084359841421e-05, + "grad_norm": 4.351991176605225, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8787904977798462, + "num_tokens": 891639713.0, + "step": 23374 + }, + { + "epoch": 2.9735402620531737, + "ewc_loss": 0.008661321364343166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.661321771796793e-05, + "grad_norm": 4.327950477600098, + "learning_rate": 1e-06, + "loss": 0.3049, + "mean_token_accuracy": 0.8948347568511963, + "num_tokens": 891673931.0, + "step": 23375 + }, + { + "epoch": 2.9736674723317646, + "ewc_loss": 0.00865234062075615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.652341057313606e-05, + "grad_norm": 4.396280765533447, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8824199438095093, + "num_tokens": 891708015.0, + "step": 23376 + }, + { + "epoch": 2.9737946826103547, + "ewc_loss": 0.008683650754392147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.68365095811896e-05, + "grad_norm": 4.293766021728516, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8795400857925415, + "num_tokens": 891746023.0, + "step": 23377 + }, + { + "epoch": 2.9739218928889453, + "ewc_loss": 0.008626763708889484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.626763883512467e-05, + "grad_norm": 4.43919563293457, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8821983933448792, + "num_tokens": 891777767.0, + "step": 23378 + }, + { + "epoch": 2.974049103167536, + "ewc_loss": 0.008732028305530548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.732028072699904e-05, + "grad_norm": 4.297858715057373, + "learning_rate": 1e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8859874606132507, + "num_tokens": 891810816.0, + "step": 23379 + }, + { + "epoch": 2.9741763134461263, + "ewc_loss": 0.008617378771305084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.617378625785932e-05, + "grad_norm": 4.356601238250732, + "learning_rate": 1e-06, + "loss": 0.3417, + 
"mean_token_accuracy": 0.8776306509971619, + "num_tokens": 891848810.0, + "step": 23380 + }, + { + "epoch": 2.974303523724717, + "ewc_loss": 0.0087232431396842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.72324308147654e-05, + "grad_norm": 4.27524471282959, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8872807025909424, + "num_tokens": 891892553.0, + "step": 23381 + }, + { + "epoch": 2.9744307340033074, + "ewc_loss": 0.008648287504911423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.648287621326745e-05, + "grad_norm": 4.334353446960449, + "learning_rate": 1e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.8950610756874084, + "num_tokens": 891927539.0, + "step": 23382 + }, + { + "epoch": 2.974557944281898, + "ewc_loss": 0.008704476989805698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.704476931598037e-05, + "grad_norm": 4.359163761138916, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.888943076133728, + "num_tokens": 891959577.0, + "step": 23383 + }, + { + "epoch": 2.9746851545604884, + "ewc_loss": 0.00869604293256998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.696043369127437e-05, + "grad_norm": 4.348682403564453, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8608283996582031, + "num_tokens": 892000799.0, + "step": 23384 + }, + { + "epoch": 2.974812364839079, + "ewc_loss": 0.008705195970833302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.705195796210319e-05, + "grad_norm": 4.27435302734375, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.877665638923645, + "num_tokens": 892042200.0, + "step": 23385 + }, + { + "epoch": 2.9749395751176695, + "ewc_loss": 0.008672118186950684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67211856530048e-05, + "grad_norm": 4.299699306488037, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8749070167541504, + "num_tokens": 892084838.0, + "step": 23386 + }, + { + "epoch": 2.97506678539626, + 
"ewc_loss": 0.008708911016583443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.708910900168121e-05, + "grad_norm": 4.291628837585449, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8685726523399353, + "num_tokens": 892126442.0, + "step": 23387 + }, + { + "epoch": 2.9751939956748505, + "ewc_loss": 0.008715680800378323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.715680451132357e-05, + "grad_norm": 4.319972515106201, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8753321170806885, + "num_tokens": 892165019.0, + "step": 23388 + }, + { + "epoch": 2.975321205953441, + "ewc_loss": 0.008726410567760468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.726410305825993e-05, + "grad_norm": 4.290044784545898, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8862552642822266, + "num_tokens": 892202291.0, + "step": 23389 + }, + { + "epoch": 2.9754484162320316, + "ewc_loss": 0.008694914169609547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.694914140505716e-05, + "grad_norm": 4.3618483543396, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.876430869102478, + "num_tokens": 892235159.0, + "step": 23390 + }, + { + "epoch": 2.975575626510622, + "ewc_loss": 0.0087412279099226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.741227793507278e-05, + "grad_norm": 4.32789945602417, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.873356282711029, + "num_tokens": 892271967.0, + "step": 23391 + }, + { + "epoch": 2.9757028367892127, + "ewc_loss": 0.008682909421622753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.682909538038075e-05, + "grad_norm": 4.3325724601745605, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8702811002731323, + "num_tokens": 892306230.0, + "step": 23392 + }, + { + "epoch": 2.975830047067803, + "ewc_loss": 0.008736081421375275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.736081508686766e-05, + "grad_norm": 
4.304448127746582, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.885858952999115, + "num_tokens": 892342118.0, + "step": 23393 + }, + { + "epoch": 2.9759572573463937, + "ewc_loss": 0.008722040802240372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.722041093278676e-05, + "grad_norm": 4.283142566680908, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.88798987865448, + "num_tokens": 892380764.0, + "step": 23394 + }, + { + "epoch": 2.9760844676249842, + "ewc_loss": 0.008715716190636158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.715716103324667e-05, + "grad_norm": 4.308436870574951, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8813737034797668, + "num_tokens": 892421113.0, + "step": 23395 + }, + { + "epoch": 2.9762116779035748, + "ewc_loss": 0.008768380619585514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.768380939727649e-05, + "grad_norm": 4.323971748352051, + "learning_rate": 1e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8891544938087463, + "num_tokens": 892457512.0, + "step": 23396 + }, + { + "epoch": 2.9763388881821653, + "ewc_loss": 0.008753832429647446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.75383266247809e-05, + "grad_norm": 4.332186222076416, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.887466311454773, + "num_tokens": 892493599.0, + "step": 23397 + }, + { + "epoch": 2.9764660984607554, + "ewc_loss": 0.008742752484977245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.74275210662745e-05, + "grad_norm": 4.358430862426758, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.876078188419342, + "num_tokens": 892528474.0, + "step": 23398 + }, + { + "epoch": 2.9765933087393464, + "ewc_loss": 0.008746630512177944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.746630192035809e-05, + "grad_norm": 4.278365135192871, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.890083372592926, + "num_tokens": 
892569591.0, + "step": 23399 + }, + { + "epoch": 2.9767205190179364, + "ewc_loss": 0.00869825854897499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.698258898220956e-05, + "grad_norm": 4.300124168395996, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8882001638412476, + "num_tokens": 892607185.0, + "step": 23400 + }, + { + "epoch": 2.9768477292965274, + "ewc_loss": 0.008744115941226482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.744115621084347e-05, + "grad_norm": 4.345037937164307, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8844934701919556, + "num_tokens": 892641453.0, + "step": 23401 + }, + { + "epoch": 2.9769749395751175, + "ewc_loss": 0.00874553993344307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.745540253585204e-05, + "grad_norm": 4.366065502166748, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8768700361251831, + "num_tokens": 892676802.0, + "step": 23402 + }, + { + "epoch": 2.977102149853708, + "ewc_loss": 0.008736229501664639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.736229938222095e-05, + "grad_norm": 4.332549571990967, + "learning_rate": 1e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8937954306602478, + "num_tokens": 892711694.0, + "step": 23403 + }, + { + "epoch": 2.9772293601322986, + "ewc_loss": 0.00872253067791462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.722530765226111e-05, + "grad_norm": 4.357610702514648, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.876340389251709, + "num_tokens": 892752662.0, + "step": 23404 + }, + { + "epoch": 2.977356570410889, + "ewc_loss": 0.008740093559026718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.740093471715227e-05, + "grad_norm": 4.31594181060791, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8855142593383789, + "num_tokens": 892790170.0, + "step": 23405 + }, + { + "epoch": 2.9774837806894796, + "ewc_loss": 0.008675538003444672, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.675538265379146e-05, + "grad_norm": 4.313218116760254, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8676223754882812, + "num_tokens": 892831437.0, + "step": 23406 + }, + { + "epoch": 2.97761099096807, + "ewc_loss": 0.008699610829353333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.699610771145672e-05, + "grad_norm": 4.2718424797058105, + "learning_rate": 1e-06, + "loss": 0.2754, + "mean_token_accuracy": 0.904326856136322, + "num_tokens": 892869022.0, + "step": 23407 + }, + { + "epoch": 2.9777382012466607, + "ewc_loss": 0.008663724176585674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.663724293000996e-05, + "grad_norm": 4.302885055541992, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8848164677619934, + "num_tokens": 892910738.0, + "step": 23408 + }, + { + "epoch": 2.977865411525251, + "ewc_loss": 0.008688390254974365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.68839051690884e-05, + "grad_norm": 4.305611610412598, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.893151044845581, + "num_tokens": 892952055.0, + "step": 23409 + }, + { + "epoch": 2.9779926218038417, + "ewc_loss": 0.00865522213280201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.655222336528823e-05, + "grad_norm": 4.362618923187256, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8765360713005066, + "num_tokens": 892985843.0, + "step": 23410 + }, + { + "epoch": 2.9781198320824323, + "ewc_loss": 0.008707279339432716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.707279630471021e-05, + "grad_norm": 4.322564601898193, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8734489679336548, + "num_tokens": 893031197.0, + "step": 23411 + }, + { + "epoch": 2.978247042361023, + "ewc_loss": 0.008645093068480492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.645092748338357e-05, + "grad_norm": 4.345842361450195, + "learning_rate": 
1e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8967692852020264, + "num_tokens": 893068113.0, + "step": 23412 + }, + { + "epoch": 2.9783742526396133, + "ewc_loss": 0.008659550920128822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.6595508037135e-05, + "grad_norm": 4.318097114562988, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8942458629608154, + "num_tokens": 893109148.0, + "step": 23413 + }, + { + "epoch": 2.978501462918204, + "ewc_loss": 0.008637214079499245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.637213613837957e-05, + "grad_norm": 4.277859687805176, + "learning_rate": 1e-06, + "loss": 0.321, + "mean_token_accuracy": 0.8879024982452393, + "num_tokens": 893151406.0, + "step": 23414 + }, + { + "epoch": 2.9786286731967944, + "ewc_loss": 0.008610950782895088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610950317233801e-05, + "grad_norm": 4.307282447814941, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8869258165359497, + "num_tokens": 893190209.0, + "step": 23415 + }, + { + "epoch": 2.978755883475385, + "ewc_loss": 0.008637185208499432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.637185237603262e-05, + "grad_norm": 4.314513683319092, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8836334347724915, + "num_tokens": 893232949.0, + "step": 23416 + }, + { + "epoch": 2.9788830937539754, + "ewc_loss": 0.008633418008685112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.633417746750638e-05, + "grad_norm": 4.299533367156982, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8842698335647583, + "num_tokens": 893273931.0, + "step": 23417 + }, + { + "epoch": 2.979010304032566, + "ewc_loss": 0.008610718883574009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.61071894178167e-05, + "grad_norm": 4.29824161529541, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8664916753768921, + "num_tokens": 893317465.0, + "step": 23418 + }, + { + 
"epoch": 2.9791375143111565, + "ewc_loss": 0.008602076210081577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602076559327543e-05, + "grad_norm": 4.368373870849609, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8707545399665833, + "num_tokens": 893354089.0, + "step": 23419 + }, + { + "epoch": 2.979264724589747, + "ewc_loss": 0.008640692569315434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.640692249173298e-05, + "grad_norm": 4.321940898895264, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.882097601890564, + "num_tokens": 893388051.0, + "step": 23420 + }, + { + "epoch": 2.979391934868337, + "ewc_loss": 0.008588516153395176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.588516357121989e-05, + "grad_norm": 4.31258487701416, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8784648776054382, + "num_tokens": 893426191.0, + "step": 23421 + }, + { + "epoch": 2.979519145146928, + "ewc_loss": 0.008619637228548527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.619637083029374e-05, + "grad_norm": 4.368597507476807, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8771494626998901, + "num_tokens": 893462134.0, + "step": 23422 + }, + { + "epoch": 2.979646355425518, + "ewc_loss": 0.008621541783213615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.621541928732768e-05, + "grad_norm": 4.282717704772949, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8792338967323303, + "num_tokens": 893503159.0, + "step": 23423 + }, + { + "epoch": 2.979773565704109, + "ewc_loss": 0.008573118597269058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573118248023093e-05, + "grad_norm": 4.405706882476807, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8681846261024475, + "num_tokens": 893538379.0, + "step": 23424 + }, + { + "epoch": 2.979900775982699, + "ewc_loss": 0.008702185936272144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.702185732545331e-05, + "grad_norm": 4.311557769775391, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8717254996299744, + "num_tokens": 893580431.0, + "step": 23425 + }, + { + "epoch": 2.98002798626129, + "ewc_loss": 0.008584723807871342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.584723400417715e-05, + "grad_norm": 4.308064937591553, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8776933550834656, + "num_tokens": 893622412.0, + "step": 23426 + }, + { + "epoch": 2.9801551965398803, + "ewc_loss": 0.008647256530821323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647256618132815e-05, + "grad_norm": 4.303828716278076, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8874722123146057, + "num_tokens": 893658575.0, + "step": 23427 + }, + { + "epoch": 2.980282406818471, + "ewc_loss": 0.008635804988443851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.635804988443851e-05, + "grad_norm": 4.231787204742432, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8920398950576782, + "num_tokens": 893702812.0, + "step": 23428 + }, + { + "epoch": 2.9804096170970613, + "ewc_loss": 0.008602732792496681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.602732850704342e-05, + "grad_norm": 4.31481409072876, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8779289126396179, + "num_tokens": 893742785.0, + "step": 23429 + }, + { + "epoch": 2.980536827375652, + "ewc_loss": 0.008672296069562435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.672296098666266e-05, + "grad_norm": 4.287077903747559, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8892449736595154, + "num_tokens": 893782310.0, + "step": 23430 + }, + { + "epoch": 2.9806640376542424, + "ewc_loss": 0.008636943995952606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636943675810471e-05, + "grad_norm": 4.313461780548096, + "learning_rate": 1e-06, + "loss": 0.2818, + 
"mean_token_accuracy": 0.8985729813575745, + "num_tokens": 893822794.0, + "step": 23431 + }, + { + "epoch": 2.980791247932833, + "ewc_loss": 0.008637208491563797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.637208520667627e-05, + "grad_norm": 4.246140956878662, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8868503570556641, + "num_tokens": 893870462.0, + "step": 23432 + }, + { + "epoch": 2.9809184582114234, + "ewc_loss": 0.008578800596296787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.57880077091977e-05, + "grad_norm": 4.352297306060791, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8830723762512207, + "num_tokens": 893910654.0, + "step": 23433 + }, + { + "epoch": 2.981045668490014, + "ewc_loss": 0.008653249591588974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.653249824419618e-05, + "grad_norm": 4.312850475311279, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8802096247673035, + "num_tokens": 893948819.0, + "step": 23434 + }, + { + "epoch": 2.9811728787686045, + "ewc_loss": 0.008579904213547707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.579904533689842e-05, + "grad_norm": 4.3221755027771, + "learning_rate": 1e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8898993730545044, + "num_tokens": 893983325.0, + "step": 23435 + }, + { + "epoch": 2.981300089047195, + "ewc_loss": 0.008612509816884995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.612510282546282e-05, + "grad_norm": 4.273534297943115, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8822205066680908, + "num_tokens": 894023712.0, + "step": 23436 + }, + { + "epoch": 2.9814272993257855, + "ewc_loss": 0.008580612950026989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580613211961463e-05, + "grad_norm": 4.3301167488098145, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8902711868286133, + "num_tokens": 894059719.0, + "step": 23437 + }, + { + "epoch": 
2.981554509604376, + "ewc_loss": 0.00862068124115467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62068118294701e-05, + "grad_norm": 4.310391902923584, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8843604326248169, + "num_tokens": 894100150.0, + "step": 23438 + }, + { + "epoch": 2.9816817198829666, + "ewc_loss": 0.008598670363426208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598670683568344e-05, + "grad_norm": 4.40748405456543, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8638925552368164, + "num_tokens": 894134336.0, + "step": 23439 + }, + { + "epoch": 2.981808930161557, + "ewc_loss": 0.008667521178722382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.667521615279838e-05, + "grad_norm": 4.369409084320068, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8705737590789795, + "num_tokens": 894168972.0, + "step": 23440 + }, + { + "epoch": 2.9819361404401477, + "ewc_loss": 0.008636916987597942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636916754767299e-05, + "grad_norm": 4.384817123413086, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8796221017837524, + "num_tokens": 894204641.0, + "step": 23441 + }, + { + "epoch": 2.982063350718738, + "ewc_loss": 0.008653327822685242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.65332767716609e-05, + "grad_norm": 4.2676100730896, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8849414587020874, + "num_tokens": 894248403.0, + "step": 23442 + }, + { + "epoch": 2.9821905609973287, + "ewc_loss": 0.008594943210482597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594943210482597e-05, + "grad_norm": 4.3350138664245605, + "learning_rate": 1e-06, + "loss": 0.2789, + "mean_token_accuracy": 0.9006542563438416, + "num_tokens": 894284035.0, + "step": 23443 + }, + { + "epoch": 2.9823177712759192, + "ewc_loss": 0.008663158863782883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.663158951094374e-05, + "grad_norm": 4.288338661193848, + "learning_rate": 1e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8867924213409424, + "num_tokens": 894321804.0, + "step": 23444 + }, + { + "epoch": 2.9824449815545098, + "ewc_loss": 0.008624444715678692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624445035820827e-05, + "grad_norm": 4.339970111846924, + "learning_rate": 1e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8914023637771606, + "num_tokens": 894355648.0, + "step": 23445 + }, + { + "epoch": 2.9825721918331, + "ewc_loss": 0.008671283721923828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.671284012962133e-05, + "grad_norm": 4.2959370613098145, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.884294867515564, + "num_tokens": 894392947.0, + "step": 23446 + }, + { + "epoch": 2.982699402111691, + "ewc_loss": 0.008620223961770535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.620224252808839e-05, + "grad_norm": 4.321161270141602, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8759204745292664, + "num_tokens": 894433437.0, + "step": 23447 + }, + { + "epoch": 2.982826612390281, + "ewc_loss": 0.00867350772023201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.673507545609027e-05, + "grad_norm": 4.263701438903809, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8870865702629089, + "num_tokens": 894481428.0, + "step": 23448 + }, + { + "epoch": 2.982953822668872, + "ewc_loss": 0.008633637800812721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.633637480670586e-05, + "grad_norm": 4.333775997161865, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8827624320983887, + "num_tokens": 894518055.0, + "step": 23449 + }, + { + "epoch": 2.983081032947462, + "ewc_loss": 0.008692111819982529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.692112169228494e-05, + "grad_norm": 4.271435737609863, + "learning_rate": 1e-06, + "loss": 0.3575, + 
"mean_token_accuracy": 0.8783714771270752, + "num_tokens": 894562693.0, + "step": 23450 + }, + { + "epoch": 2.983208243226053, + "ewc_loss": 0.008628297597169876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.628297655377537e-05, + "grad_norm": 4.289600849151611, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8833633065223694, + "num_tokens": 894604973.0, + "step": 23451 + }, + { + "epoch": 2.983335453504643, + "ewc_loss": 0.008669826202094555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.669826638652012e-05, + "grad_norm": 4.332244396209717, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8895919322967529, + "num_tokens": 894645672.0, + "step": 23452 + }, + { + "epoch": 2.9834626637832335, + "ewc_loss": 0.008670434355735779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.670434181112796e-05, + "grad_norm": 4.278093338012695, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8818079233169556, + "num_tokens": 894686702.0, + "step": 23453 + }, + { + "epoch": 2.983589874061824, + "ewc_loss": 0.00862781424075365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627813804196194e-05, + "grad_norm": 4.297054767608643, + "learning_rate": 1e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8922927975654602, + "num_tokens": 894722768.0, + "step": 23454 + }, + { + "epoch": 2.9837170843404146, + "ewc_loss": 0.008651716634631157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.651716780150309e-05, + "grad_norm": 4.32420539855957, + "learning_rate": 1e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8938945531845093, + "num_tokens": 894759216.0, + "step": 23455 + }, + { + "epoch": 2.983844294619005, + "ewc_loss": 0.008649912662804127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.649913070257753e-05, + "grad_norm": 4.2904181480407715, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8893959522247314, + "num_tokens": 894799691.0, + "step": 23456 + }, + { + "epoch": 
2.9839715048975957, + "ewc_loss": 0.008611130528151989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611130760982633e-05, + "grad_norm": 4.3398118019104, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8802253007888794, + "num_tokens": 894835869.0, + "step": 23457 + }, + { + "epoch": 2.984098715176186, + "ewc_loss": 0.008660975843667984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.660976163810119e-05, + "grad_norm": 4.3559346199035645, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.862596869468689, + "num_tokens": 894875706.0, + "step": 23458 + }, + { + "epoch": 2.9842259254547767, + "ewc_loss": 0.008666062727570534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.666062785778195e-05, + "grad_norm": 4.3355488777160645, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8887709379196167, + "num_tokens": 894920135.0, + "step": 23459 + }, + { + "epoch": 2.9843531357333672, + "ewc_loss": 0.008604660630226135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604660979472101e-05, + "grad_norm": 4.291653633117676, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.884947657585144, + "num_tokens": 894960825.0, + "step": 23460 + }, + { + "epoch": 2.9844803460119578, + "ewc_loss": 0.008622205816209316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622206223662943e-05, + "grad_norm": 4.358714580535889, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8860753178596497, + "num_tokens": 895001935.0, + "step": 23461 + }, + { + "epoch": 2.9846075562905483, + "ewc_loss": 0.008640462532639503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64046232891269e-05, + "grad_norm": 4.395585060119629, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8907784223556519, + "num_tokens": 895033559.0, + "step": 23462 + }, + { + "epoch": 2.984734766569139, + "ewc_loss": 0.008631216362118721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.631216041976586e-05, + "grad_norm": 4.33811092376709, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8835548162460327, + "num_tokens": 895070958.0, + "step": 23463 + }, + { + "epoch": 2.9848619768477294, + "ewc_loss": 0.008571201004087925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.571201033191755e-05, + "grad_norm": 4.3358235359191895, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8819109201431274, + "num_tokens": 895111533.0, + "step": 23464 + }, + { + "epoch": 2.98498918712632, + "ewc_loss": 0.008614202961325645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614202670287341e-05, + "grad_norm": 4.362996578216553, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8805496096611023, + "num_tokens": 895149861.0, + "step": 23465 + }, + { + "epoch": 2.9851163974049104, + "ewc_loss": 0.00861199852079153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611998782726005e-05, + "grad_norm": 4.268999099731445, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8878463506698608, + "num_tokens": 895192958.0, + "step": 23466 + }, + { + "epoch": 2.985243607683501, + "ewc_loss": 0.008563945069909096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.563945448258892e-05, + "grad_norm": 4.356866359710693, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8908462524414062, + "num_tokens": 895226710.0, + "step": 23467 + }, + { + "epoch": 2.9853708179620915, + "ewc_loss": 0.00863688811659813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636888378532603e-05, + "grad_norm": 4.325926780700684, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8889647722244263, + "num_tokens": 895267695.0, + "step": 23468 + }, + { + "epoch": 2.985498028240682, + "ewc_loss": 0.008576128631830215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576129039283842e-05, + "grad_norm": 4.35784387588501, + "learning_rate": 1e-06, + "loss": 0.2944, + 
"mean_token_accuracy": 0.8992931842803955, + "num_tokens": 895303101.0, + "step": 23469 + }, + { + "epoch": 2.9856252385192725, + "ewc_loss": 0.008604618720710278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604618778917938e-05, + "grad_norm": 4.333049774169922, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8898502588272095, + "num_tokens": 895340437.0, + "step": 23470 + }, + { + "epoch": 2.9857524487978626, + "ewc_loss": 0.0085927564650774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592756785219535e-05, + "grad_norm": 4.310871601104736, + "learning_rate": 1e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8972411751747131, + "num_tokens": 895377695.0, + "step": 23471 + }, + { + "epoch": 2.9858796590764536, + "ewc_loss": 0.008578028529882431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578028791816905e-05, + "grad_norm": 4.330672740936279, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.875785231590271, + "num_tokens": 895418599.0, + "step": 23472 + }, + { + "epoch": 2.9860068693550437, + "ewc_loss": 0.008582474663853645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.582475129514933e-05, + "grad_norm": 4.324310302734375, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8814243078231812, + "num_tokens": 895454832.0, + "step": 23473 + }, + { + "epoch": 2.9861340796336346, + "ewc_loss": 0.008562551811337471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.562551374780014e-05, + "grad_norm": 4.40018367767334, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.869788646697998, + "num_tokens": 895490607.0, + "step": 23474 + }, + { + "epoch": 2.9862612899122247, + "ewc_loss": 0.008627843111753464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62784290802665e-05, + "grad_norm": 4.301316738128662, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8773401379585266, + "num_tokens": 895531340.0, + "step": 23475 + }, + { + "epoch": 
2.9863885001908153, + "ewc_loss": 0.00852212868630886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.522129064658657e-05, + "grad_norm": 4.329935550689697, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8843311071395874, + "num_tokens": 895568688.0, + "step": 23476 + }, + { + "epoch": 2.986515710469406, + "ewc_loss": 0.008607683703303337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607683412265033e-05, + "grad_norm": 4.355930328369141, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8745901584625244, + "num_tokens": 895607316.0, + "step": 23477 + }, + { + "epoch": 2.9866429207479963, + "ewc_loss": 0.00859781913459301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.597818668931723e-05, + "grad_norm": 4.272263526916504, + "learning_rate": 1e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.8998277187347412, + "num_tokens": 895648694.0, + "step": 23478 + }, + { + "epoch": 2.986770131026587, + "ewc_loss": 0.008518501184880733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.518501272192225e-05, + "grad_norm": 4.294685363769531, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8807695508003235, + "num_tokens": 895687996.0, + "step": 23479 + }, + { + "epoch": 2.9868973413051774, + "ewc_loss": 0.008592967875301838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.592967787990347e-05, + "grad_norm": 4.345211505889893, + "learning_rate": 1e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8902632594108582, + "num_tokens": 895726727.0, + "step": 23480 + }, + { + "epoch": 2.987024551583768, + "ewc_loss": 0.008581457659602165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.581457223044708e-05, + "grad_norm": 4.31043004989624, + "learning_rate": 1e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.888729989528656, + "num_tokens": 895760688.0, + "step": 23481 + }, + { + "epoch": 2.9871517618623584, + "ewc_loss": 0.008570384234189987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.570384670747444e-05, + "grad_norm": 4.380072116851807, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.864019513130188, + "num_tokens": 895797303.0, + "step": 23482 + }, + { + "epoch": 2.987278972140949, + "ewc_loss": 0.008611716330051422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.611716475570574e-05, + "grad_norm": 4.332513809204102, + "learning_rate": 1e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8883224725723267, + "num_tokens": 895834078.0, + "step": 23483 + }, + { + "epoch": 2.9874061824195395, + "ewc_loss": 0.00857323594391346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.573236118536443e-05, + "grad_norm": 4.330318927764893, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8744632601737976, + "num_tokens": 895872640.0, + "step": 23484 + }, + { + "epoch": 2.98753339269813, + "ewc_loss": 0.008598565123975277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598565182182938e-05, + "grad_norm": 4.330672740936279, + "learning_rate": 1e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.896308422088623, + "num_tokens": 895907646.0, + "step": 23485 + }, + { + "epoch": 2.9876606029767205, + "ewc_loss": 0.008580435067415237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.580434950999916e-05, + "grad_norm": 4.292377471923828, + "learning_rate": 1e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.893258273601532, + "num_tokens": 895944183.0, + "step": 23486 + }, + { + "epoch": 2.987787813255311, + "ewc_loss": 0.008598989807069302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.598990098107606e-05, + "grad_norm": 4.288021564483643, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8881315588951111, + "num_tokens": 895983987.0, + "step": 23487 + }, + { + "epoch": 2.9879150235339016, + "ewc_loss": 0.008578264154493809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.578263805247843e-05, + "grad_norm": 4.3698859214782715, + "learning_rate": 1e-06, + "loss": 0.3623, + 
"mean_token_accuracy": 0.8743329644203186, + "num_tokens": 896020928.0, + "step": 23488 + }, + { + "epoch": 2.988042233812492, + "ewc_loss": 0.008634813129901886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.634813275421038e-05, + "grad_norm": 4.337372303009033, + "learning_rate": 1e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8903330564498901, + "num_tokens": 896057459.0, + "step": 23489 + }, + { + "epoch": 2.9881694440910826, + "ewc_loss": 0.008596984669566154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.596984844189137e-05, + "grad_norm": 4.312501907348633, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8796803951263428, + "num_tokens": 896095049.0, + "step": 23490 + }, + { + "epoch": 2.988296654369673, + "ewc_loss": 0.008589492179453373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.589492063038051e-05, + "grad_norm": 4.330530643463135, + "learning_rate": 1e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8929942846298218, + "num_tokens": 896127433.0, + "step": 23491 + }, + { + "epoch": 2.9884238646482637, + "ewc_loss": 0.008642315864562988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.642316242912784e-05, + "grad_norm": 4.334576606750488, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8911337852478027, + "num_tokens": 896165662.0, + "step": 23492 + }, + { + "epoch": 2.9885510749268542, + "ewc_loss": 0.008622333407402039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.622333552921191e-05, + "grad_norm": 4.280190467834473, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8835507035255432, + "num_tokens": 896207753.0, + "step": 23493 + }, + { + "epoch": 2.9886782852054448, + "ewc_loss": 0.008610570803284645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.610571239842102e-05, + "grad_norm": 4.309207916259766, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8831579089164734, + "num_tokens": 896249105.0, + "step": 23494 + }, + { + "epoch": 
2.9888054954840353, + "ewc_loss": 0.008647438138723373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647437789477408e-05, + "grad_norm": 4.293323516845703, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8857039213180542, + "num_tokens": 896291728.0, + "step": 23495 + }, + { + "epoch": 2.9889327057626254, + "ewc_loss": 0.008626917377114296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.626917406218126e-05, + "grad_norm": 4.302209854125977, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8742079734802246, + "num_tokens": 896331729.0, + "step": 23496 + }, + { + "epoch": 2.9890599160412163, + "ewc_loss": 0.008625389076769352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625389455119148e-05, + "grad_norm": 4.3128156661987305, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8736070394515991, + "num_tokens": 896371804.0, + "step": 23497 + }, + { + "epoch": 2.9891871263198064, + "ewc_loss": 0.008630371652543545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.630371303297579e-05, + "grad_norm": 4.350964069366455, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8814398050308228, + "num_tokens": 896408398.0, + "step": 23498 + }, + { + "epoch": 2.9893143365983974, + "ewc_loss": 0.008643297478556633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.643297769594938e-05, + "grad_norm": 4.331809997558594, + "learning_rate": 1e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.8973226547241211, + "num_tokens": 896440496.0, + "step": 23499 + }, + { + "epoch": 2.9894415468769875, + "ewc_loss": 0.008632338605821133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.632338722236454e-05, + "grad_norm": 4.362188816070557, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8850269317626953, + "num_tokens": 896475212.0, + "step": 23500 + }, + { + "epoch": 2.989568757155578, + "ewc_loss": 0.008663773536682129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.663773769512773e-05, + "grad_norm": 4.3034586906433105, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.881083607673645, + "num_tokens": 896517122.0, + "step": 23501 + }, + { + "epoch": 2.9896959674341685, + "ewc_loss": 0.008646617643535137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.64661778905429e-05, + "grad_norm": 4.325439929962158, + "learning_rate": 1e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8921616077423096, + "num_tokens": 896557259.0, + "step": 23502 + }, + { + "epoch": 2.989823177712759, + "ewc_loss": 0.008672156371176243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.672156400280073e-05, + "grad_norm": 4.308811664581299, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8736689686775208, + "num_tokens": 896600237.0, + "step": 23503 + }, + { + "epoch": 2.9899503879913496, + "ewc_loss": 0.008644110523164272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.644110494060442e-05, + "grad_norm": 4.316110134124756, + "learning_rate": 1e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.8941210508346558, + "num_tokens": 896640564.0, + "step": 23504 + }, + { + "epoch": 2.99007759826994, + "ewc_loss": 0.008660001680254936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.66000191308558e-05, + "grad_norm": 4.311495780944824, + "learning_rate": 1e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8940557837486267, + "num_tokens": 896674164.0, + "step": 23505 + }, + { + "epoch": 2.9902048085485307, + "ewc_loss": 0.00862673670053482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.626736962469295e-05, + "grad_norm": 4.335923194885254, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8779180645942688, + "num_tokens": 896716613.0, + "step": 23506 + }, + { + "epoch": 2.990332018827121, + "ewc_loss": 0.008636554703116417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.636554412078112e-05, + "grad_norm": 4.328085899353027, + "learning_rate": 1e-06, + "loss": 0.3612, + 
"mean_token_accuracy": 0.8758605122566223, + "num_tokens": 896755586.0, + "step": 23507 + }, + { + "epoch": 2.9904592291057117, + "ewc_loss": 0.008635123260319233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.635123231215402e-05, + "grad_norm": 4.3317694664001465, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8842991590499878, + "num_tokens": 896797641.0, + "step": 23508 + }, + { + "epoch": 2.9905864393843022, + "ewc_loss": 0.008633197285234928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.633197285234928e-05, + "grad_norm": 4.325343608856201, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8782217502593994, + "num_tokens": 896835113.0, + "step": 23509 + }, + { + "epoch": 2.9907136496628928, + "ewc_loss": 0.008614648133516312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.614647958893329e-05, + "grad_norm": 4.377368927001953, + "learning_rate": 1e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8845685720443726, + "num_tokens": 896866496.0, + "step": 23510 + }, + { + "epoch": 2.9908408599414833, + "ewc_loss": 0.00864856131374836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.648561197333038e-05, + "grad_norm": 4.272014141082764, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8825474977493286, + "num_tokens": 896907535.0, + "step": 23511 + }, + { + "epoch": 2.990968070220074, + "ewc_loss": 0.00859439093619585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.594390965299681e-05, + "grad_norm": 4.294436454772949, + "learning_rate": 1e-06, + "loss": 0.2816, + "mean_token_accuracy": 0.9007657766342163, + "num_tokens": 896948574.0, + "step": 23512 + }, + { + "epoch": 2.9910952804986644, + "ewc_loss": 0.008608575910329819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.608576172264293e-05, + "grad_norm": 4.2830281257629395, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8799146413803101, + "num_tokens": 896992377.0, + "step": 23513 + }, + { + "epoch": 
2.991222490777255, + "ewc_loss": 0.008595121093094349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.595120743848383e-05, + "grad_norm": 4.3131585121154785, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8793119192123413, + "num_tokens": 897035411.0, + "step": 23514 + }, + { + "epoch": 2.9913497010558454, + "ewc_loss": 0.00860734935849905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607349445810542e-05, + "grad_norm": 4.330111503601074, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8856797814369202, + "num_tokens": 897074661.0, + "step": 23515 + }, + { + "epoch": 2.991476911334436, + "ewc_loss": 0.00860921386629343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609214273747057e-05, + "grad_norm": 4.328500270843506, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8855781555175781, + "num_tokens": 897115550.0, + "step": 23516 + }, + { + "epoch": 2.9916041216130265, + "ewc_loss": 0.008609248325228691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.609248470747843e-05, + "grad_norm": 4.35908842086792, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8769643306732178, + "num_tokens": 897155091.0, + "step": 23517 + }, + { + "epoch": 2.991731331891617, + "ewc_loss": 0.00861585233360529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.615852129878476e-05, + "grad_norm": 4.297248840332031, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.879081130027771, + "num_tokens": 897196551.0, + "step": 23518 + }, + { + "epoch": 2.991858542170207, + "ewc_loss": 0.008584503084421158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.584502938902006e-05, + "grad_norm": 4.323158264160156, + "learning_rate": 1e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.8935297727584839, + "num_tokens": 897231885.0, + "step": 23519 + }, + { + "epoch": 2.991985752448798, + "ewc_loss": 0.00860784761607647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.607847848907113e-05, 
+ "grad_norm": 4.435649871826172, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8826647996902466, + "num_tokens": 897267944.0, + "step": 23520 + }, + { + "epoch": 2.992112962727388, + "ewc_loss": 0.008643646724522114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.643647015560418e-05, + "grad_norm": 4.284305095672607, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8861538171768188, + "num_tokens": 897307330.0, + "step": 23521 + }, + { + "epoch": 2.992240173005979, + "ewc_loss": 0.008530868217349052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.530868217349052e-05, + "grad_norm": 4.32639741897583, + "learning_rate": 1e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8876863718032837, + "num_tokens": 897344028.0, + "step": 23522 + }, + { + "epoch": 2.992367383284569, + "ewc_loss": 0.008629484102129936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.629483636468649e-05, + "grad_norm": 4.290510654449463, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8760127425193787, + "num_tokens": 897391213.0, + "step": 23523 + }, + { + "epoch": 2.99249459356316, + "ewc_loss": 0.008586789481341839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.586789044784382e-05, + "grad_norm": 4.336902141571045, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8767657279968262, + "num_tokens": 897428222.0, + "step": 23524 + }, + { + "epoch": 2.9926218038417502, + "ewc_loss": 0.008624633774161339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624633483123034e-05, + "grad_norm": 4.449669361114502, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.87440425157547, + "num_tokens": 897457737.0, + "step": 23525 + }, + { + "epoch": 2.992749014120341, + "ewc_loss": 0.008669394068419933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.669393719173968e-05, + "grad_norm": 4.294137954711914, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8885160684585571, + 
"num_tokens": 897498925.0, + "step": 23526 + }, + { + "epoch": 2.9928762243989313, + "ewc_loss": 0.0085494639351964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.549464109819382e-05, + "grad_norm": 4.364720344543457, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8781787157058716, + "num_tokens": 897533157.0, + "step": 23527 + }, + { + "epoch": 2.993003434677522, + "ewc_loss": 0.008674412965774536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.674413402331993e-05, + "grad_norm": 4.298618793487549, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8860476016998291, + "num_tokens": 897574159.0, + "step": 23528 + }, + { + "epoch": 2.9931306449561124, + "ewc_loss": 0.008605394512414932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60539439599961e-05, + "grad_norm": 4.385303020477295, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8766818046569824, + "num_tokens": 897606970.0, + "step": 23529 + }, + { + "epoch": 2.993257855234703, + "ewc_loss": 0.008669109083712101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.669109229231253e-05, + "grad_norm": 4.328532695770264, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8629868626594543, + "num_tokens": 897647725.0, + "step": 23530 + }, + { + "epoch": 2.9933850655132934, + "ewc_loss": 0.008623221889138222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623221947345883e-05, + "grad_norm": 4.2827887535095215, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.887096107006073, + "num_tokens": 897687109.0, + "step": 23531 + }, + { + "epoch": 2.993512275791884, + "ewc_loss": 0.008625250309705734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.625250484328717e-05, + "grad_norm": 4.317567825317383, + "learning_rate": 1e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8943722248077393, + "num_tokens": 897720994.0, + "step": 23532 + }, + { + "epoch": 2.9936394860704745, + "ewc_loss": 0.008688248693943024, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.688248635735363e-05, + "grad_norm": 4.381496906280518, + "learning_rate": 1e-06, + "loss": 0.2858, + "mean_token_accuracy": 0.9005312919616699, + "num_tokens": 897749855.0, + "step": 23533 + }, + { + "epoch": 2.993766696349065, + "ewc_loss": 0.00870009046047926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.700090256752446e-05, + "grad_norm": 4.289973735809326, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.88287752866745, + "num_tokens": 897792846.0, + "step": 23534 + }, + { + "epoch": 2.9938939066276555, + "ewc_loss": 0.008629935793578625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62993547343649e-05, + "grad_norm": 4.275728225708008, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8869659900665283, + "num_tokens": 897836765.0, + "step": 23535 + }, + { + "epoch": 2.994021116906246, + "ewc_loss": 0.008657931350171566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.657931175548583e-05, + "grad_norm": 4.290627479553223, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8836735486984253, + "num_tokens": 897880918.0, + "step": 23536 + }, + { + "epoch": 2.9941483271848366, + "ewc_loss": 0.008673197589814663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.673197589814663e-05, + "grad_norm": 4.363211631774902, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8837763667106628, + "num_tokens": 897916676.0, + "step": 23537 + }, + { + "epoch": 2.994275537463427, + "ewc_loss": 0.00868306215852499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.683062333147973e-05, + "grad_norm": 4.3257737159729, + "learning_rate": 1e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.8890602588653564, + "num_tokens": 897953968.0, + "step": 23538 + }, + { + "epoch": 2.9944027477420176, + "ewc_loss": 0.00864268559962511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.642685861559585e-05, + "grad_norm": 4.374516487121582, + "learning_rate": 
1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8825905323028564, + "num_tokens": 897993055.0, + "step": 23539 + }, + { + "epoch": 2.994529958020608, + "ewc_loss": 0.00869485642760992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.694856660440564e-05, + "grad_norm": 4.360523700714111, + "learning_rate": 1e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8903620839118958, + "num_tokens": 898030202.0, + "step": 23540 + }, + { + "epoch": 2.9946571682991987, + "ewc_loss": 0.008648348040878773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.648348011774942e-05, + "grad_norm": 4.388321399688721, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.880314826965332, + "num_tokens": 898067234.0, + "step": 23541 + }, + { + "epoch": 2.9947843785777892, + "ewc_loss": 0.00865983497351408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.659835293656215e-05, + "grad_norm": 4.302679061889648, + "learning_rate": 1e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.8866020441055298, + "num_tokens": 898105423.0, + "step": 23542 + }, + { + "epoch": 2.9949115888563798, + "ewc_loss": 0.008627491071820259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.627491479273885e-05, + "grad_norm": 4.366286277770996, + "learning_rate": 1e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.900652289390564, + "num_tokens": 898144550.0, + "step": 23543 + }, + { + "epoch": 2.99503879913497, + "ewc_loss": 0.008677839301526546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67783892317675e-05, + "grad_norm": 4.385117053985596, + "learning_rate": 1e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8874527812004089, + "num_tokens": 898179277.0, + "step": 23544 + }, + { + "epoch": 2.995166009413561, + "ewc_loss": 0.008662299253046513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.662299660500139e-05, + "grad_norm": 4.406050205230713, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8818594813346863, + "num_tokens": 898218518.0, + "step": 23545 + }, + { + 
"epoch": 2.995293219692151, + "ewc_loss": 0.008666305802762508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.666305802762508e-05, + "grad_norm": 4.3443145751953125, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8726315498352051, + "num_tokens": 898259266.0, + "step": 23546 + }, + { + "epoch": 2.995420429970742, + "ewc_loss": 0.008631416596472263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.631416858406737e-05, + "grad_norm": 4.354918956756592, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8817300796508789, + "num_tokens": 898292648.0, + "step": 23547 + }, + { + "epoch": 2.995547640249332, + "ewc_loss": 0.008664807304739952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.664807683089748e-05, + "grad_norm": 4.336032867431641, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8879056572914124, + "num_tokens": 898328413.0, + "step": 23548 + }, + { + "epoch": 2.995674850527923, + "ewc_loss": 0.00865771621465683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.657715807203203e-05, + "grad_norm": 4.252137660980225, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8840612173080444, + "num_tokens": 898376963.0, + "step": 23549 + }, + { + "epoch": 2.995802060806513, + "ewc_loss": 0.00862863752990961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62863744259812e-05, + "grad_norm": 4.422982692718506, + "learning_rate": 1e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8945653438568115, + "num_tokens": 898407256.0, + "step": 23550 + }, + { + "epoch": 2.9959292710851035, + "ewc_loss": 0.008750933222472668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.750933193368837e-05, + "grad_norm": 4.306535720825195, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8855195641517639, + "num_tokens": 898448800.0, + "step": 23551 + }, + { + "epoch": 2.996056481363694, + "ewc_loss": 0.008618554100394249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.618553692940623e-05, + "grad_norm": 4.3752546310424805, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8801300525665283, + "num_tokens": 898488912.0, + "step": 23552 + }, + { + "epoch": 2.9961836916422846, + "ewc_loss": 0.008700057864189148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.700057514943182e-05, + "grad_norm": 4.310241222381592, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8776219487190247, + "num_tokens": 898530434.0, + "step": 23553 + }, + { + "epoch": 2.996310901920875, + "ewc_loss": 0.00862421840429306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.624218753539026e-05, + "grad_norm": 4.331787586212158, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8851704001426697, + "num_tokens": 898565044.0, + "step": 23554 + }, + { + "epoch": 2.9964381121994657, + "ewc_loss": 0.008657073602080345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.657074067741632e-05, + "grad_norm": 4.35218620300293, + "learning_rate": 1e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.8945410251617432, + "num_tokens": 898601798.0, + "step": 23555 + }, + { + "epoch": 2.996565322478056, + "ewc_loss": 0.008688783273100853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.688783418620005e-05, + "grad_norm": 4.281232833862305, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8741871118545532, + "num_tokens": 898644789.0, + "step": 23556 + }, + { + "epoch": 2.9966925327566467, + "ewc_loss": 0.008629251271486282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.629250805824995e-05, + "grad_norm": 4.354056358337402, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8843745589256287, + "num_tokens": 898683928.0, + "step": 23557 + }, + { + "epoch": 2.9968197430352372, + "ewc_loss": 0.008691880851984024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.691880793776363e-05, + "grad_norm": 4.301878452301025, + "learning_rate": 1e-06, + "loss": 0.3455, + 
"mean_token_accuracy": 0.8801095485687256, + "num_tokens": 898726217.0, + "step": 23558 + }, + { + "epoch": 2.9969469533138278, + "ewc_loss": 0.008633112534880638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.633112884126604e-05, + "grad_norm": 4.371665954589844, + "learning_rate": 1e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.8931031227111816, + "num_tokens": 898760575.0, + "step": 23559 + }, + { + "epoch": 2.9970741635924183, + "ewc_loss": 0.008676192723214626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.676193101564422e-05, + "grad_norm": 4.31669282913208, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8793515563011169, + "num_tokens": 898801232.0, + "step": 23560 + }, + { + "epoch": 2.997201373871009, + "ewc_loss": 0.008604925125837326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.604925096733496e-05, + "grad_norm": 4.324937343597412, + "learning_rate": 1e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.887749195098877, + "num_tokens": 898837174.0, + "step": 23561 + }, + { + "epoch": 2.9973285841495994, + "ewc_loss": 0.008634988218545914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.634987898403779e-05, + "grad_norm": 4.3444061279296875, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8768991231918335, + "num_tokens": 898875849.0, + "step": 23562 + }, + { + "epoch": 2.99745579442819, + "ewc_loss": 0.008646553382277489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.646553760627285e-05, + "grad_norm": 4.334798336029053, + "learning_rate": 1e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8910127878189087, + "num_tokens": 898909975.0, + "step": 23563 + }, + { + "epoch": 2.9975830047067804, + "ewc_loss": 0.008635814301669598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63581444718875e-05, + "grad_norm": 4.356677532196045, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8782697916030884, + "num_tokens": 898948250.0, + "step": 23564 + }, + { + "epoch": 
2.997710214985371, + "ewc_loss": 0.008637849241495132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.637848804937676e-05, + "grad_norm": 4.291924476623535, + "learning_rate": 1e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8989062905311584, + "num_tokens": 898989122.0, + "step": 23565 + }, + { + "epoch": 2.9978374252639615, + "ewc_loss": 0.00860442966222763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.60442960401997e-05, + "grad_norm": 4.357041835784912, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8790982961654663, + "num_tokens": 899028715.0, + "step": 23566 + }, + { + "epoch": 2.997964635542552, + "ewc_loss": 0.00867235753685236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.672357216710225e-05, + "grad_norm": 4.417598724365234, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8877613544464111, + "num_tokens": 899064101.0, + "step": 23567 + }, + { + "epoch": 2.9980918458211425, + "ewc_loss": 0.008643651381134987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.643651381134987e-05, + "grad_norm": 4.290977954864502, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8877312541007996, + "num_tokens": 899101759.0, + "step": 23568 + }, + { + "epoch": 2.9982190560997326, + "ewc_loss": 0.008563611656427383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.563611481804401e-05, + "grad_norm": 4.315652370452881, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8773744106292725, + "num_tokens": 899142273.0, + "step": 23569 + }, + { + "epoch": 2.9983462663783236, + "ewc_loss": 0.008638037368655205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.638037252239883e-05, + "grad_norm": 4.407495021820068, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8870563507080078, + "num_tokens": 899175878.0, + "step": 23570 + }, + { + "epoch": 2.9984734766569137, + "ewc_loss": 0.008669527247548103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.669527596794069e-05, + "grad_norm": 4.328552722930908, + "learning_rate": 1e-06, + "loss": 0.301, + "mean_token_accuracy": 0.8946671485900879, + "num_tokens": 899208937.0, + "step": 23571 + }, + { + "epoch": 2.9986006869355046, + "ewc_loss": 0.008593574166297913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.593573875259608e-05, + "grad_norm": 4.349266529083252, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8834394216537476, + "num_tokens": 899244088.0, + "step": 23572 + }, + { + "epoch": 2.9987278972140947, + "ewc_loss": 0.008640673011541367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.640673331683502e-05, + "grad_norm": 4.401031970977783, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8789118528366089, + "num_tokens": 899280111.0, + "step": 23573 + }, + { + "epoch": 2.9988551074926852, + "ewc_loss": 0.008668473921716213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.668474038131535e-05, + "grad_norm": 4.396504878997803, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8843866586685181, + "num_tokens": 899310882.0, + "step": 23574 + }, + { + "epoch": 2.9989823177712758, + "ewc_loss": 0.00864320620894432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.643206092528999e-05, + "grad_norm": 4.332613945007324, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8706698417663574, + "num_tokens": 899346820.0, + "step": 23575 + }, + { + "epoch": 2.9991095280498663, + "ewc_loss": 0.008627216331660748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.62721644807607e-05, + "grad_norm": 4.294928550720215, + "learning_rate": 1e-06, + "loss": 0.2988, + "mean_token_accuracy": 0.8953608870506287, + "num_tokens": 899382044.0, + "step": 23576 + }, + { + "epoch": 2.999236738328457, + "ewc_loss": 0.008661462925374508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.661462925374508e-05, + "grad_norm": 4.315836429595947, + "learning_rate": 1e-06, + "loss": 0.3268, + 
"mean_token_accuracy": 0.8840041160583496, + "num_tokens": 899418389.0, + "step": 23577 + }, + { + "epoch": 2.9993639486070474, + "ewc_loss": 0.008688629604876041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.688629895914346e-05, + "grad_norm": 4.317151069641113, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8828029036521912, + "num_tokens": 899458486.0, + "step": 23578 + }, + { + "epoch": 2.999491158885638, + "ewc_loss": 0.008685870096087456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.685870125191286e-05, + "grad_norm": 4.321619987487793, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.88315349817276, + "num_tokens": 899498168.0, + "step": 23579 + }, + { + "epoch": 2.9996183691642284, + "ewc_loss": 0.008689749985933304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.68975039338693e-05, + "grad_norm": 4.292153835296631, + "learning_rate": 1e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.8993210792541504, + "num_tokens": 899538655.0, + "step": 23580 + }, + { + "epoch": 2.999745579442819, + "ewc_loss": 0.008668345399200916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.668345253681764e-05, + "grad_norm": 4.339853286743164, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.875085711479187, + "num_tokens": 899581693.0, + "step": 23581 + }, + { + "epoch": 2.9998727897214095, + "ewc_loss": 0.008686917833983898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.686917863087729e-05, + "grad_norm": 4.310949802398682, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8806959390640259, + "num_tokens": 899623817.0, + "step": 23582 + }, + { + "epoch": 3.0, + "ewc_loss": 0.008638550527393818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.638550207251683e-05, + "grad_norm": 4.269381999969482, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8887474536895752, + "num_tokens": 899664226.0, + "step": 23583 + }, + { + "epoch": 3.0, + "ewc_loss": 
0.008638550527393818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.638550207251683e-05, + "step": 23583, + "total_flos": 5.62815163329864e+19, + "train_loss": 0.3824227017627763, + "train_runtime": 45517.1597, + "train_samples_per_second": 8.289, + "train_steps_per_second": 0.518 + } + ], + "logging_steps": 1, + "max_steps": 23583, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 11792, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.62815163329864e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..c071a46 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:523a05e081ddcb68f7f39d780e9e440b7f46765bab4f09e49e1cfb12965736aa +size 13393